pax_global_header00006660000000000000000000000064147150432670014523gustar00rootroot0000000000000052 comment=6008f1ecc3d0482d5c6bd466b6fd217a74b3a31a crac-criu-1.5.0/000077500000000000000000000000001471504326700133765ustar00rootroot00000000000000crac-criu-1.5.0/.circleci/000077500000000000000000000000001471504326700152315ustar00rootroot00000000000000crac-criu-1.5.0/.circleci/config.yml000066400000000000000000000011021471504326700172130ustar00rootroot00000000000000version: 2.1 jobs: test-local-gcc: machine: image: ubuntu-2004:202010-01 working_directory: ~/criu steps: - checkout - run: name: "Test local with GCC" command: sudo -E make -C scripts/ci local test-local-clang: machine: image: ubuntu-2004:202010-01 working_directory: ~/criu steps: - checkout - run: name: "Test local with CLANG" command: sudo -E make -C scripts/ci local CLANG=1 workflows: version: 2 builds: jobs: - test-local-gcc - test-local-clang crac-criu-1.5.0/.cirrus.yml000066400000000000000000000127411471504326700155130ustar00rootroot00000000000000task: name: Vagrant Fedora based test (no VDSO) environment: HOME: "/root" CIRRUS_WORKING_DIR: "/tmp/criu" compute_engine_instance: image_project: cirrus-images image: family/docker-kvm platform: linux cpu: 4 memory: 16G nested_virtualization: true setup_script: | scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-no-vdso task: name: CentOS Stream 9 based test environment: HOME: "/root" CIRRUS_WORKING_DIR: "/tmp/criu" compute_engine_instance: image_project: centos-cloud image: family/centos-stream-9 platform: linux cpu: 4 memory: 8G setup_script: | ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release dnf -y install 
--allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata python-flake8 xmlto libdrm-devel systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed. # The Cirrus CI user runs as a service from selinux point of view and is # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0). # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode. setenforce 0 build_script: | make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" task: name: Vagrant Fedora Rawhide based test environment: HOME: "/root" CIRRUS_WORKING_DIR: "/tmp/criu" compute_engine_instance: image_project: cirrus-images image: family/docker-kvm platform: linux cpu: 4 memory: 16G nested_virtualization: true setup_script: | scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-rawhide task: name: Vagrant Fedora based test (non-root) environment: HOME: "/root" CIRRUS_WORKING_DIR: "/tmp/criu" compute_engine_instance: image_project: cirrus-images image: family/docker-kvm platform: linux cpu: 4 memory: 16G nested_virtualization: true setup_script: | scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-non-root task: name: CentOS Stream 8 based test environment: HOME: "/root" CIRRUS_WORKING_DIR: "/tmp/criu" compute_engine_instance: image_project: centos-cloud image: family/centos-stream-8 platform: linux 
cpu: 4 memory: 8G setup_script: | ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto # Do not fail if latest epel repository definition is already installed yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm || : yum install -y dnf-plugins-core yum config-manager --set-enabled powertools yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-protobuf python3-importlib-metadata python3-junit_xml xmlto libdrm-devel alternatives --set python /usr/bin/python3 systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed # The Cirrus CI user runs as a service from selinux point of view and is # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode setenforce 0 build_script: | make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" task: name: aarch64 build GCC (native) arm_container: image: docker.io/library/ubuntu:jammy cpu: 4 memory: 4G script: uname -a build_script: | scripts/ci/apt-install make ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto make -C scripts/ci local task: name: aarch64 build CLANG (native) arm_container: image: docker.io/library/ubuntu:jammy cpu: 4 memory: 4G script: uname -a build_script: | scripts/ci/apt-install make ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto make -C scripts/ci local CLANG=1 task: name: aarch64 Fedora Rawhide arm_container: image: registry.fedoraproject.org/fedora:rawhide cpu: 4 memory: 4G script: uname -a build_script: | scripts/ci/prepare-for-fedora-rawhide.sh ln -sf 
/usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 make -C test/zdtm -j 4 crac-criu-1.5.0/.clang-format000066400000000000000000000413371471504326700157610ustar00rootroot00000000000000# SPDX-License-Identifier: GPL-2.0 # # clang-format configuration file. Intended for clang-format >= 11. # # For more information, see: # # Documentation/process/clang-format.rst # https://clang.llvm.org/docs/ClangFormat.html # https://clang.llvm.org/docs/ClangFormatStyleOptions.html # --- AccessModifierOffset: -4 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false AlignEscapedNewlines: Left # Unknown to clang-format-4.0 AlignOperands: true AlignTrailingComments: true AlignConsecutiveMacros: true AllowAllParametersOfDeclarationOnNextLine: false AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: None AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: false BinPackArguments: true BinPackParameters: true BraceWrapping: AfterClass: false AfterControlStatement: false AfterEnum: false AfterFunction: true AfterNamespace: true AfterObjCDeclaration: false AfterStruct: false AfterUnion: false AfterExternBlock: false # Unknown to clang-format-5.0 BeforeCatch: false BeforeElse: false IndentBraces: false SplitEmptyFunction: true # Unknown to clang-format-4.0 SplitEmptyRecord: true # Unknown to clang-format-4.0 SplitEmptyNamespace: true # Unknown to clang-format-4.0 BreakBeforeBinaryOperators: None BreakBeforeBraces: Custom BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0 BreakBeforeTernaryOperators: false BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeComma 
# Unknown to clang-format-4.0 BreakAfterJavaFieldAnnotations: false BreakStringLiterals: false ColumnLimit: 0 CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false # Unknown to clang-format-4.0 ConstructorInitializerAllOnOneLineOrOnePerLine: false ConstructorInitializerIndentWidth: 8 ContinuationIndentWidth: 8 Cpp11BracedListStyle: false DerivePointerAlignment: false DisableFormat: false ExperimentalAutoDetectBinPacking: false FixNamespaceComments: false # Unknown to clang-format-4.0 # Taken from: # git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \ # | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \ # | sort | uniq ForEachMacros: - 'for_each_pstree_item' - 'for_each_bit' - 'apei_estatus_for_each_section' - 'ata_for_each_dev' - 'ata_for_each_link' - '__ata_qc_for_each' - 'ata_qc_for_each' - 'ata_qc_for_each_raw' - 'ata_qc_for_each_with_internal' - 'ax25_for_each' - 'ax25_uid_for_each' - '__bio_for_each_bvec' - 'bio_for_each_bvec' - 'bio_for_each_bvec_all' - 'bio_for_each_integrity_vec' - '__bio_for_each_segment' - 'bio_for_each_segment' - 'bio_for_each_segment_all' - 'bio_list_for_each' - 'bip_for_each_vec' - 'bitmap_for_each_clear_region' - 'bitmap_for_each_set_region' - 'blkg_for_each_descendant_post' - 'blkg_for_each_descendant_pre' - 'blk_queue_for_each_rl' - 'bond_for_each_slave' - 'bond_for_each_slave_rcu' - 'bpf_for_each_spilled_reg' - 'btree_for_each_safe128' - 'btree_for_each_safe32' - 'btree_for_each_safe64' - 'btree_for_each_safel' - 'card_for_each_dev' - 'cgroup_taskset_for_each' - 'cgroup_taskset_for_each_leader' - 'cpufreq_for_each_entry' - 'cpufreq_for_each_entry_idx' - 'cpufreq_for_each_valid_entry' - 'cpufreq_for_each_valid_entry_idx' - 'css_for_each_child' - 'css_for_each_descendant_post' - 'css_for_each_descendant_pre' - 'device_for_each_child_node' - 'displayid_iter_for_each' - 'dma_fence_chain_for_each' - 'do_for_each_ftrace_op' - 'drm_atomic_crtc_for_each_plane' - 
'drm_atomic_crtc_state_for_each_plane' - 'drm_atomic_crtc_state_for_each_plane_state' - 'drm_atomic_for_each_plane_damage' - 'drm_client_for_each_connector_iter' - 'drm_client_for_each_modeset' - 'drm_connector_for_each_possible_encoder' - 'drm_for_each_bridge_in_chain' - 'drm_for_each_connector_iter' - 'drm_for_each_crtc' - 'drm_for_each_crtc_reverse' - 'drm_for_each_encoder' - 'drm_for_each_encoder_mask' - 'drm_for_each_fb' - 'drm_for_each_legacy_plane' - 'drm_for_each_plane' - 'drm_for_each_plane_mask' - 'drm_for_each_privobj' - 'drm_mm_for_each_hole' - 'drm_mm_for_each_node' - 'drm_mm_for_each_node_in_range' - 'drm_mm_for_each_node_safe' - 'flow_action_for_each' - 'for_each_acpi_dev_match' - 'for_each_active_dev_scope' - 'for_each_active_drhd_unit' - 'for_each_active_iommu' - 'for_each_aggr_pgid' - 'for_each_available_child_of_node' - 'for_each_bio' - 'for_each_board_func_rsrc' - 'for_each_bvec' - 'for_each_card_auxs' - 'for_each_card_auxs_safe' - 'for_each_card_components' - 'for_each_card_dapms' - 'for_each_card_pre_auxs' - 'for_each_card_prelinks' - 'for_each_card_rtds' - 'for_each_card_rtds_safe' - 'for_each_card_widgets' - 'for_each_card_widgets_safe' - 'for_each_cgroup_storage_type' - 'for_each_child_of_node' - 'for_each_clear_bit' - 'for_each_clear_bit_from' - 'for_each_cmsghdr' - 'for_each_compatible_node' - 'for_each_component_dais' - 'for_each_component_dais_safe' - 'for_each_comp_order' - 'for_each_console' - 'for_each_cpu' - 'for_each_cpu_and' - 'for_each_cpu_not' - 'for_each_cpu_wrap' - 'for_each_dapm_widgets' - 'for_each_dev_addr' - 'for_each_dev_scope' - 'for_each_dma_cap_mask' - 'for_each_dpcm_be' - 'for_each_dpcm_be_rollback' - 'for_each_dpcm_be_safe' - 'for_each_dpcm_fe' - 'for_each_drhd_unit' - 'for_each_dss_dev' - 'for_each_dtpm_table' - 'for_each_efi_memory_desc' - 'for_each_efi_memory_desc_in_map' - 'for_each_element' - 'for_each_element_extid' - 'for_each_element_id' - 'for_each_endpoint_of_node' - 'for_each_evictable_lru' - 
'for_each_fib6_node_rt_rcu' - 'for_each_fib6_walker_rt' - 'for_each_free_mem_pfn_range_in_zone' - 'for_each_free_mem_pfn_range_in_zone_from' - 'for_each_free_mem_range' - 'for_each_free_mem_range_reverse' - 'for_each_func_rsrc' - 'for_each_hstate' - 'for_each_if' - 'for_each_iommu' - 'for_each_ip_tunnel_rcu' - 'for_each_irq_nr' - 'for_each_link_codecs' - 'for_each_link_cpus' - 'for_each_link_platforms' - 'for_each_lru' - 'for_each_matching_node' - 'for_each_matching_node_and_match' - 'for_each_member' - 'for_each_memcg_cache_index' - 'for_each_mem_pfn_range' - '__for_each_mem_range' - 'for_each_mem_range' - '__for_each_mem_range_rev' - 'for_each_mem_range_rev' - 'for_each_mem_region' - 'for_each_migratetype_order' - 'for_each_msi_entry' - 'for_each_msi_entry_safe' - 'for_each_msi_vector' - 'for_each_net' - 'for_each_net_continue_reverse' - 'for_each_netdev' - 'for_each_netdev_continue' - 'for_each_netdev_continue_rcu' - 'for_each_netdev_continue_reverse' - 'for_each_netdev_feature' - 'for_each_netdev_in_bond_rcu' - 'for_each_netdev_rcu' - 'for_each_netdev_reverse' - 'for_each_netdev_safe' - 'for_each_net_rcu' - 'for_each_new_connector_in_state' - 'for_each_new_crtc_in_state' - 'for_each_new_mst_mgr_in_state' - 'for_each_new_plane_in_state' - 'for_each_new_private_obj_in_state' - 'for_each_node' - 'for_each_node_by_name' - 'for_each_node_by_type' - 'for_each_node_mask' - 'for_each_node_state' - 'for_each_node_with_cpus' - 'for_each_node_with_property' - 'for_each_nonreserved_multicast_dest_pgid' - 'for_each_of_allnodes' - 'for_each_of_allnodes_from' - 'for_each_of_cpu_node' - 'for_each_of_pci_range' - 'for_each_old_connector_in_state' - 'for_each_old_crtc_in_state' - 'for_each_old_mst_mgr_in_state' - 'for_each_oldnew_connector_in_state' - 'for_each_oldnew_crtc_in_state' - 'for_each_oldnew_mst_mgr_in_state' - 'for_each_oldnew_plane_in_state' - 'for_each_oldnew_plane_in_state_reverse' - 'for_each_oldnew_private_obj_in_state' - 'for_each_old_plane_in_state' - 
'for_each_old_private_obj_in_state' - 'for_each_online_cpu' - 'for_each_online_node' - 'for_each_online_pgdat' - 'for_each_pci_bridge' - 'for_each_pci_dev' - 'for_each_pci_msi_entry' - 'for_each_pcm_streams' - 'for_each_physmem_range' - 'for_each_populated_zone' - 'for_each_possible_cpu' - 'for_each_present_cpu' - 'for_each_prime_number' - 'for_each_prime_number_from' - 'for_each_process' - 'for_each_process_thread' - 'for_each_prop_codec_conf' - 'for_each_prop_dai_codec' - 'for_each_prop_dai_cpu' - 'for_each_prop_dlc_codecs' - 'for_each_prop_dlc_cpus' - 'for_each_prop_dlc_platforms' - 'for_each_property_of_node' - 'for_each_registered_fb' - 'for_each_requested_gpio' - 'for_each_requested_gpio_in_range' - 'for_each_reserved_mem_range' - 'for_each_reserved_mem_region' - 'for_each_rtd_codec_dais' - 'for_each_rtd_components' - 'for_each_rtd_cpu_dais' - 'for_each_rtd_dais' - 'for_each_set_bit' - 'for_each_set_bit_from' - 'for_each_set_clump8' - 'for_each_sg' - 'for_each_sg_dma_page' - 'for_each_sg_page' - 'for_each_sgtable_dma_page' - 'for_each_sgtable_dma_sg' - 'for_each_sgtable_page' - 'for_each_sgtable_sg' - 'for_each_sibling_event' - 'for_each_subelement' - 'for_each_subelement_extid' - 'for_each_subelement_id' - '__for_each_thread' - 'for_each_thread' - 'for_each_unicast_dest_pgid' - 'for_each_vsi' - 'for_each_wakeup_source' - 'for_each_zone' - 'for_each_zone_zonelist' - 'for_each_zone_zonelist_nodemask' - 'fwnode_for_each_available_child_node' - 'fwnode_for_each_child_node' - 'fwnode_graph_for_each_endpoint' - 'gadget_for_each_ep' - 'genradix_for_each' - 'genradix_for_each_from' - 'hash_for_each' - 'hash_for_each_possible' - 'hash_for_each_possible_rcu' - 'hash_for_each_possible_rcu_notrace' - 'hash_for_each_possible_safe' - 'hash_for_each_rcu' - 'hash_for_each_safe' - 'hctx_for_each_ctx' - 'hlist_bl_for_each_entry' - 'hlist_bl_for_each_entry_rcu' - 'hlist_bl_for_each_entry_safe' - 'hlist_for_each' - 'hlist_for_each_entry' - 'hlist_for_each_entry_continue' - 
'hlist_for_each_entry_continue_rcu' - 'hlist_for_each_entry_continue_rcu_bh' - 'hlist_for_each_entry_from' - 'hlist_for_each_entry_from_rcu' - 'hlist_for_each_entry_rcu' - 'hlist_for_each_entry_rcu_bh' - 'hlist_for_each_entry_rcu_notrace' - 'hlist_for_each_entry_safe' - 'hlist_for_each_entry_srcu' - '__hlist_for_each_rcu' - 'hlist_for_each_safe' - 'hlist_nulls_for_each_entry' - 'hlist_nulls_for_each_entry_from' - 'hlist_nulls_for_each_entry_rcu' - 'hlist_nulls_for_each_entry_safe' - 'i3c_bus_for_each_i2cdev' - 'i3c_bus_for_each_i3cdev' - 'ide_host_for_each_port' - 'ide_port_for_each_dev' - 'ide_port_for_each_present_dev' - 'idr_for_each_entry' - 'idr_for_each_entry_continue' - 'idr_for_each_entry_continue_ul' - 'idr_for_each_entry_ul' - 'in_dev_for_each_ifa_rcu' - 'in_dev_for_each_ifa_rtnl' - 'inet_bind_bucket_for_each' - 'inet_lhash2_for_each_icsk_rcu' - 'key_for_each' - 'key_for_each_safe' - 'klp_for_each_func' - 'klp_for_each_func_safe' - 'klp_for_each_func_static' - 'klp_for_each_object' - 'klp_for_each_object_safe' - 'klp_for_each_object_static' - 'kunit_suite_for_each_test_case' - 'kvm_for_each_memslot' - 'kvm_for_each_vcpu' - 'list_for_each' - 'list_for_each_codec' - 'list_for_each_codec_safe' - 'list_for_each_continue' - 'list_for_each_entry' - 'list_for_each_entry_continue' - 'list_for_each_entry_continue_rcu' - 'list_for_each_entry_continue_reverse' - 'list_for_each_entry_from' - 'list_for_each_entry_from_rcu' - 'list_for_each_entry_from_reverse' - 'list_for_each_entry_lockless' - 'list_for_each_entry_rcu' - 'list_for_each_entry_reverse' - 'list_for_each_entry_safe' - 'list_for_each_entry_safe_continue' - 'list_for_each_entry_safe_from' - 'list_for_each_entry_safe_reverse' - 'list_for_each_entry_srcu' - 'list_for_each_prev' - 'list_for_each_prev_safe' - 'list_for_each_safe' - 'llist_for_each' - 'llist_for_each_entry' - 'llist_for_each_entry_safe' - 'llist_for_each_safe' - 'mci_for_each_dimm' - 'media_device_for_each_entity' - 'media_device_for_each_intf' 
- 'media_device_for_each_link' - 'media_device_for_each_pad' - 'nanddev_io_for_each_page' - 'netdev_for_each_lower_dev' - 'netdev_for_each_lower_private' - 'netdev_for_each_lower_private_rcu' - 'netdev_for_each_mc_addr' - 'netdev_for_each_uc_addr' - 'netdev_for_each_upper_dev_rcu' - 'netdev_hw_addr_list_for_each' - 'nft_rule_for_each_expr' - 'nla_for_each_attr' - 'nla_for_each_nested' - 'nlmsg_for_each_attr' - 'nlmsg_for_each_msg' - 'nr_neigh_for_each' - 'nr_neigh_for_each_safe' - 'nr_node_for_each' - 'nr_node_for_each_safe' - 'of_for_each_phandle' - 'of_property_for_each_string' - 'of_property_for_each_u32' - 'pci_bus_for_each_resource' - 'pcl_for_each_chunk' - 'pcl_for_each_segment' - 'pcm_for_each_format' - 'ping_portaddr_for_each_entry' - 'plist_for_each' - 'plist_for_each_continue' - 'plist_for_each_entry' - 'plist_for_each_entry_continue' - 'plist_for_each_entry_safe' - 'plist_for_each_safe' - 'pnp_for_each_card' - 'pnp_for_each_dev' - 'protocol_for_each_card' - 'protocol_for_each_dev' - 'queue_for_each_hw_ctx' - 'radix_tree_for_each_slot' - 'radix_tree_for_each_tagged' - 'rb_for_each' - 'rbtree_postorder_for_each_entry_safe' - 'rdma_for_each_block' - 'rdma_for_each_port' - 'rdma_umem_for_each_dma_block' - 'resource_list_for_each_entry' - 'resource_list_for_each_entry_safe' - 'rhl_for_each_entry_rcu' - 'rhl_for_each_rcu' - 'rht_for_each' - 'rht_for_each_entry' - 'rht_for_each_entry_from' - 'rht_for_each_entry_rcu' - 'rht_for_each_entry_rcu_from' - 'rht_for_each_entry_safe' - 'rht_for_each_from' - 'rht_for_each_rcu' - 'rht_for_each_rcu_from' - '__rq_for_each_bio' - 'rq_for_each_bvec' - 'rq_for_each_segment' - 'scsi_for_each_prot_sg' - 'scsi_for_each_sg' - 'sctp_for_each_hentry' - 'sctp_skb_for_each' - 'shdma_for_each_chan' - '__shost_for_each_device' - 'shost_for_each_device' - 'sk_for_each' - 'sk_for_each_bound' - 'sk_for_each_entry_offset_rcu' - 'sk_for_each_from' - 'sk_for_each_rcu' - 'sk_for_each_safe' - 'sk_nulls_for_each' - 'sk_nulls_for_each_from' - 
'sk_nulls_for_each_rcu' - 'snd_array_for_each' - 'snd_pcm_group_for_each_entry' - 'snd_soc_dapm_widget_for_each_path' - 'snd_soc_dapm_widget_for_each_path_safe' - 'snd_soc_dapm_widget_for_each_sink_path' - 'snd_soc_dapm_widget_for_each_source_path' - 'tb_property_for_each' - 'tcf_exts_for_each_action' - 'udp_portaddr_for_each_entry' - 'udp_portaddr_for_each_entry_rcu' - 'usb_hub_for_each_child' - 'v4l2_device_for_each_subdev' - 'v4l2_m2m_for_each_dst_buf' - 'v4l2_m2m_for_each_dst_buf_safe' - 'v4l2_m2m_for_each_src_buf' - 'v4l2_m2m_for_each_src_buf_safe' - 'virtio_device_for_each_vq' - 'while_for_each_ftrace_op' - 'xa_for_each' - 'xa_for_each_marked' - 'xa_for_each_range' - 'xa_for_each_start' - 'xas_for_each' - 'xas_for_each_conflict' - 'xas_for_each_marked' - 'xbc_array_for_each_value' - 'xbc_for_each_key_value' - 'xbc_node_for_each_array_value' - 'xbc_node_for_each_child' - 'xbc_node_for_each_key_value' - 'zorro_for_each_dev' IncludeBlocks: Preserve # Unknown to clang-format-5.0 IncludeCategories: - Regex: '.*' Priority: 1 IncludeIsMainRegex: '(Test)?$' IndentCaseLabels: false IndentGotoLabels: false IndentPPDirectives: None # Unknown to clang-format-5.0 IndentWidth: 8 IndentWrappedFunctionNames: false JavaScriptQuotes: Leave JavaScriptWrapImports: true KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0 ObjCBlockIndentWidth: 8 ObjCSpaceAfterProperty: true ObjCSpaceBeforeProtocolList: true # Taken from git's rules PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0 PenaltyBreakBeforeFirstCallParameter: 30 PenaltyBreakComment: 10 PenaltyBreakFirstLessLess: 0 PenaltyBreakString: 10 PenaltyExcessCharacter: 100 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Right ReflowComments: false SortIncludes: false SortUsingDeclarations: false # Unknown to clang-format-4.0 SpaceAfterCStyleCast: false SpaceAfterTemplateKeyword: true 
SpaceBeforeAssignmentOperators: true SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0 SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0 SpaceBeforeParens: ControlStatementsExceptForEachMacros SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0 SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: false SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false Standard: Cpp03 TabWidth: 8 UseTab: Always ... crac-criu-1.5.0/.codespellrc000066400000000000000000000001761471504326700157020ustar00rootroot00000000000000[codespell] skip = ./.git,./test/pki ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen crac-criu-1.5.0/.github/000077500000000000000000000000001471504326700147365ustar00rootroot00000000000000crac-criu-1.5.0/.github/ISSUE_TEMPLATE.md000066400000000000000000000020201471504326700174350ustar00rootroot00000000000000 **Description** **Steps to reproduce the issue:** 1. 2. 3. **Describe the results you received:** **Describe the results you expected:** **Additional information you deem important (e.g. issue happens only occasionally):** **CRIU logs and information:**
CRIU full dump/restore logs:

``` (paste your output here) ```

Output of `criu --version`:

``` (paste your output here) ```

Output of `criu check --all`:

``` (paste your output here) ```

**Additional environment details:** crac-criu-1.5.0/.github/PULL_REQUEST_TEMPLATE.md000066400000000000000000000014351471504326700205420ustar00rootroot00000000000000 crac-criu-1.5.0/.github/workflows/000077500000000000000000000000001471504326700167735ustar00rootroot00000000000000crac-criu-1.5.0/.github/workflows/ccpp.yml000066400000000000000000000040061471504326700204430ustar00rootroot00000000000000name: Build on: push: branches: - "*" tags: - 'release-*' workflow_dispatch: jobs: build: runs-on: ubuntu-latest outputs: BUNDLENAME: ${{ steps.compute.outputs.BUNDLENAME }} steps: - name: Compute parameters id: compute run: | if [ ${{ startsWith(github.ref, 'refs/tags/') }} = true ]; then TAG=${GITHUB_REF##refs/tags/} BUNDLENAME=crac-criu-${TAG} else TAG=notag BUNDLENAME=crac-criu-build-${{ github.run_number }} fi for i in TAG BUNDLENAME; do echo "::set-output name=${i}::${!i}" done - name: Start build container run: echo ${GITHUB_TOKEN} | docker login -u ${GITHUB_ACTOR} --password-stdin docker.pkg.github.com; docker run -d --name build -w $PWD -v /home/runner:/home/runner -u $(id -u):$(id -g) --entrypoint tail docker.pkg.github.com/crac/docker-build/image:ubuntu-16.04 -f /dev/null env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - uses: actions/checkout@v2 - name: Submodule init run: | git submodule init git submodule update - run: make DESTDIR=. 
PREFIX=/${{ steps.compute.outputs.BUNDLENAME }} install-criu V=1 shell: docker exec build bash -e {0} - run: tar -zcf ${{ steps.compute.outputs.BUNDLENAME }}.tar.gz ${{ steps.compute.outputs.BUNDLENAME }} - uses: actions/upload-artifact@v3 with: name: ${{ steps.compute.outputs.BUNDLENAME }} path: ${{ steps.compute.outputs.BUNDLENAME }}.tar.gz release: runs-on: ubuntu-latest needs: build if: ${{ startsWith(github.ref, 'refs/tags/') }} steps: - uses: actions/download-artifact@v3 with: name: ${{ needs.build.outputs.BUNDLENAME }} - uses: softprops/action-gh-release@v1 with: files: | ${{ needs.build.outputs.BUNDLENAME }}.tar.gz crac-criu-1.5.0/.github/workflows/check-commits.yml000066400000000000000000000023631471504326700222500ustar00rootroot00000000000000name: Verify self-contained commits on: pull_request # Cancel any preceding run on the pull request concurrency: group: commit-test-${{ github.event.pull_request.number }} jobs: build: runs-on: ubuntu-latest # Check if pull request does not have label "not-selfcontained-ok" if: "!contains(github.event.pull_request.labels.*.name, 'not-selfcontained-ok')" steps: - uses: actions/checkout@v3 with: # Needed to rebase against the base branch fetch-depth: 0 # Checkout pull request HEAD commit instead of merge commit ref: ${{ github.event.pull_request.head.sha }} - name: Install dependencies run: sudo apt-get install -y libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev - name: Configure git user details run: | git config --global user.email "checkpoint-restore@users.noreply.github.com" git config --global user.name "checkpoint-restore" - name: Configure base branch without switching current branch run: git fetch origin ${{ github.base_ref }}:${{ github.base_ref }} - name: Build each commit run: git rebase ${{ github.base_ref }} -x "make -C scripts/ci check-commit" 
crac-criu-1.5.0/.github/workflows/codeql.yml000066400000000000000000000027201471504326700207660ustar00rootroot00000000000000name: "CodeQL" on: push: branches: [ "criu-dev", "master" ] pull_request: branches: [ "criu-dev" ] schedule: - cron: "11 6 * * 3" # Cancel any preceding run on the pull request. concurrency: group: codeql-test-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} jobs: analyze: name: Analyze runs-on: ubuntu-latest permissions: actions: read contents: read security-events: write strategy: fail-fast: false matrix: language: [ python, cpp ] steps: - name: Checkout uses: actions/checkout@v3 - name: Install Packages (cpp) if: ${{ matrix.language == 'cpp' }} run: | sudo scripts/ci/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev - name: Initialize CodeQL uses: github/codeql-action/init@v2 with: languages: ${{ matrix.language }} queries: +security-and-quality - name: Autobuild uses: github/codeql-action/autobuild@v2 - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v2 with: category: "/language:${{ matrix.language }}" crac-criu-1.5.0/.github/workflows/java-test.yml000066400000000000000000000006161471504326700214170ustar00rootroot00000000000000name: Java Test on: [push, pull_request] # Cancel any preceding run on the pull request. 
concurrency: group: java-test-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} jobs: build: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Run Java Test run: sudo make -C scripts/ci java-test crac-criu-1.5.0/.github/workflows/lint.yml000066400000000000000000000022271471504326700204670ustar00rootroot00000000000000name: Run code linter on: [push, pull_request] # Cancel any preceding run on the pull request. concurrency: group: lint-test-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} jobs: build: runs-on: ubuntu-latest container: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools run: sudo dnf -y install git make python3-flake8 xz clang-tools-extra which codespell git-clang-format ShellCheck - uses: actions/checkout@v2 - name: Set git safe directory # https://github.com/actions/checkout/issues/760 run: git config --global --add safe.directory "$GITHUB_WORKSPACE" - name: Run make lint run: make lint - name: Run make indent continue-on-error: true run: | if [ -z "${{github.base_ref}}" ]; then git fetch --deepen=1 make indent else git fetch origin ${{github.base_ref}} make indent BASE=origin/${{github.base_ref}} fi - name: Raise in-line make indent warnings run: | git diff | ./scripts/github-indent-warnings.py crac-criu-1.5.0/.github/workflows/loongarch64-qemu-test.yml000066400000000000000000000006351471504326700235720ustar00rootroot00000000000000name: LoongArch64 Qemu Test on: [push, pull_request] # Cancel any preceding run on the pull request. 
concurrency: group: loongarch64-qemu-test-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} jobs: build: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 - run: sudo make -C scripts/ci loongarch64-qemu-test crac-criu-1.5.0/.github/workflows/manage-labels.yml000066400000000000000000000006301471504326700222050ustar00rootroot00000000000000name: Remove labels on: [issue_comment, pull_request_review_comment] jobs: remove-labels-on-comments: name: Remove labels on comments if: github.event_name == 'issue_comment' runs-on: ubuntu-latest steps: - uses: mondeja/remove-labels-gh-action@v1 with: token: ${{ secrets.GITHUB_TOKEN }} labels: | changes requested awaiting reply crac-criu-1.5.0/.gitignore000066400000000000000000000012011471504326700153600ustar00rootroot00000000000000.config *.o *.d *.a *.img *.bin *.elf *.out *.swp *.swo *.so .git-ignore *.patch *.pyc cscope* tags TAGS Makefile.local compel/compel compel/compel-host-bin images/*.c images/*.h images/google/protobuf/*.c images/google/protobuf/*.h .gitid criu/criu criu/unittest/unittest criu/arch/*/sys-exec-tbl*.c # x86 syscalls-table is not generated !criu/arch/x86/sys-exec-tbl.c criu/arch/*/syscalls*.S criu/include/syscall-codes*.h criu/include/syscall*.h criu/include/version.h criu/pie/restorer-blob.h criu/pie/parasite-blob.h criu/protobuf-desc-gen.h lib/build/ lib/c/criu.pc compel/include/asm include/common/asm include/common/config.h build/** crac-criu-1.5.0/.gitmodules000066400000000000000000000001011471504326700155430ustar00rootroot00000000000000[submodule "lz4"] path = lz4 url = https://github.com/lz4/lz4/ crac-criu-1.5.0/.lgtm.yml000066400000000000000000000013561471504326700151470ustar00rootroot00000000000000extraction: cpp: prepare: packages: - "protobuf-c-compiler" - "libprotobuf-c-dev" - "libprotobuf-dev" - "build-essential" - "libprotobuf-dev" - "libprotobuf-c-dev" - "protobuf-c-compiler" - "protobuf-compiler" - "python3-protobuf" - 
"libnet-dev" - "pkg-config" - "libnl-3-dev" - "libbsd0" - "libbsd-dev" - "iproute2" - "libcap-dev" - "libaio-dev" - "libbsd-dev" - "python3-yaml" - "libnl-route-3-dev" - "gnutls-dev" configure: command: - "ls -laR images/google" - "ln -s /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto" - "ls -laR images/google" crac-criu-1.5.0/.mailmap000066400000000000000000000011621471504326700150170ustar00rootroot00000000000000Stanislav Kinsbursky Pavel Emelyanov Andrei Vagin Andrei Vagin Andrei Vagin Andrei Vagin Andrei Vagin Cyrill Gorcunov Alexander Mikhalitsyn Alexander Mikhalitsyn crac-criu-1.5.0/.travis.yml000066400000000000000000000012611471504326700155070ustar00rootroot00000000000000language: c os: linux dist: bionic services: - docker jobs: include: - os: linux arch: ppc64le env: TR_ARCH=local dist: bionic - os: linux arch: ppc64le env: TR_ARCH=local CLANG=1 dist: bionic - os: linux arch: s390x env: TR_ARCH=local dist: bionic - os: linux arch: arm64-graviton2 env: TR_ARCH=local RUN_TESTS=1 dist: focal group: edge virt: vm - os: linux arch: arm64-graviton2 env: TR_ARCH=local CLANG=1 RUN_TESTS=1 group: edge virt: vm dist: bionic script: - sudo make -C scripts/ci $TR_ARCH after_success: - make -C scripts/ci after_success crac-criu-1.5.0/CONTRIBUTING.md000066400000000000000000000426471471504326700156440ustar00rootroot00000000000000## How to contribute to CRIU CRIU project is (almost) the never-ending story, because we have to always keep up with the Linux kernel supporting checkpoint and restore for all the features it provides. Thus we're looking for contributors of all kinds -- feedback, bug reports, testing, coding, writing, etc. Here are some useful hints to get involved. 
* We have both -- [very simple](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement) and [more sophisticated](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+feature%22) coding tasks; * CRIU does need [extensive testing](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Atesting); * Documentation is always hard, we have [some information](https://criu.org/Category:Empty_articles) that is to be extracted from people's heads into wiki pages as well as [some texts](https://criu.org/Category:Editor_help_needed) that all need to be converted into useful articles; * Feedback is expected on the GitHub issues page and on the [mailing list](https://lists.openvz.org/mailman/listinfo/criu); * We accept GitHub pull requests and this is the preferred way to contribute to CRIU. If you prefer to send patches by email, you are welcome to send them to [CRIU development mailing list](https://lists.openvz.org/mailman/listinfo/criu). Below we describe in more detail recommend practices for CRIU development. * Spread the word about CRIU in [social networks](http://criu.org/Contacts); * If you're giving a talk about CRIU -- let us know, we'll mention it on the [wiki main page](https://criu.org/News/events); ### Setting up the development environment Although `criu` could be run as non-root (see [Security](https://criu.org/Security)), development is better to be done as root. For example, some tests require root. So, it would be a good idea to set up some recent Linux distro on a virtual machine. ### Get the source code The CRIU sources are tracked by Git. Official CRIU repo is at https://github.com/checkpoint-restore/criu. The repository may contain multiple branches. Development happens in the **criu-dev** branch. 
To clone CRIU repo and switch to the proper branch, run: ``` git clone https://github.com/checkpoint-restore/criu criu cd criu git checkout criu-dev ``` ### Compile First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. To compile CRIU, run: ``` make ``` This should create the `./criu/criu` executable. ## Edit the source code When you change the source code, please keep in mind the following code conventions: * code is written to be read, so the code readability is the most important thing you need to have in mind when preparing patches * we prefer tabs and indentations to be 8 characters width * we prefer line length of 80 characters or less, more is allowed if it helps with code readability * CRIU mostly follows [Linux kernel coding style](https://www.kernel.org/doc/Documentation/process/coding-style.rst), but we are less strict than the kernel community Other conventions can be learned from the source code itself. In short, make sure your new code looks similar to what is already there. ## Automatic tools to fix coding-style Important: These tools are there to advise you, but should not be considered as a "source of truth", as tools also make nasty mistakes from time to time which can completely break code readability. The following command can be used to automatically run a code linter for Python files (flake8), Shell scripts (shellcheck), text spelling (codespell), and a number of CRIU-specific checks (usage of print macros and EOL whitespace for C files). ``` make lint ``` In addition, we have adopted a [clang-format configuration file](https://www.kernel.org/doc/Documentation/process/clang-format.rst) based on the kernel source tree. However, compliance with the clang-format autoformat rules is optional. If the automatic code formatting results in decreased readability, we may choose to ignore these errors. 
Run the following command to check if your changes are compliant with the clang-format rules: ``` make indent ``` This command is built upon the `git-clang-format` tool and supports two options `BASE` and `OPTS`. The `BASE` option allows you to specify a range of commits to check for coding style issues. By default, it is set to `HEAD~1`, so that only the last commit is checked. If you are developing on top of the criu-dev branch and want to check all your commits for compliance with the clang-format rules, you can use `BASE=origin/criu-dev`. The `OPTS` option can be used to pass additional options to `git-clang-format`. For example, if you want to check the last *N* commits for formatting errors, without applying the changes to the codebase you can use the following command. ``` make indent OPTS=--diff BASE=HEAD~N ``` Note that for pull requests, the "Run code linter" workflow runs these checks for all commits. If a clang-format error is detected we need to review the suggested changes and decide if they should be fixed before merging. 
Here are some bad examples of clang-format-ing: * if clang-format tries to force 120 characters and breaks readability - it is wrong: ``` @@ -58,8 +59,7 @@ static int register_membarriers(void) } if (!all_ok) { - fail("can't register membarrier()s - tried %#x, kernel %#x", - barriers_registered, barriers_supported); + fail("can't register membarrier()s - tried %#x, kernel %#x", barriers_registered, barriers_supported); return -1; } ``` * if clang-format breaks your beautiful readability friendly alignment in structures, comments or defines - it is wrong: ``` --- a/test/zdtm/static/membarrier.c +++ b/test/zdtm/static/membarrier.c @@ -27,9 +27,10 @@ static const struct { int register_cmd; int execute_cmd; } membarrier_cmds[] = { - { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, - { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, - { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, + { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, + { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, + MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, + { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, }; ``` ## Test your changes CRIU comes with an extensive test suite. To check whether your changes introduce any regressions, run ``` make test ``` The command runs [ZDTM Test Suite](https://criu.org/ZDTM_Test_Suite). Check for any error messages produced by it. In case you'd rather have someone else run the tests, you can use travis-ci for your own GitHub fork of CRIU. It will check the compilation for various supported platforms, as well as run most of the tests from the suite. See https://travis-ci.org/checkpoint-restore/criu for more details. ## Describe your changes Describe your problem. 
Whether your change is a one-line bug fix or 5000 lines of a new feature, there must be an underlying problem that motivated you to do this work. Convince the reviewer that there is a problem worth fixing and that it makes sense for them to read past the first paragraph. Once the problem is established, describe what you are actually doing about it in technical detail. It's important to describe the change in plain English for the reviewer to verify that the code is behaving as you intend it to. Solve only one problem per commit. If your description starts to get long, that's a sign that you probably need to split up your commit. See [Separate your changes](#separate-your-changes). Describe your changes in imperative mood, e.g. "make xyzzy do frotz" instead of "[This commit] makes xyzzy do frotz" or "[I] changed xyzzy to do frotz", as if you are giving orders to the codebase to change its behaviour. If your change fixes a bug in a specific commit, e.g. you found an issue using `git bisect`, please use the `Fixes:` tag with the abbreviation of the SHA-1 ID, and the one line summary. For example: ``` Fixes: 9433b7b9db3e ("make: use cflags/ldflags for config.h detection mechanism") ``` The following `git config` settings can be used to add a pretty format for outputting the above style in the `git log` or `git show` commands: ``` [pretty] fixes = Fixes: %h (\"%s\") ``` If your change address an issue listed in GitHub, please use `Fixes:` tag with the number of the issue. For instance: ``` Fixes: #339 ``` The `Fixes:` tags should be put at the end of the detailed description. Please add a prefix to your commit subject line describing the part of the project your change is related to. This can be either the name of the file or directory you changed, or just a general word. If your patch is touching multiple components you may separate prefixes with "/"-es. 
Here are some good examples of subject lines from git log: ``` criu-ns: Convert to python3 style print() syntax compel: Calculate sh_addr if not provided by linker style: Enforce kernel style -Wstrict-prototypes rpc/libcriu: Add lsm-profile option ``` You may refer to [How to Write a Git Commit Message](https://chris.beams.io/posts/git-commit/) article for recommendations for good commit message. ## Separate your changes Separate each **logical change** into a separate commit. For example, if your changes include both bug fixes and performance enhancements for a single driver, separate those changes into two or more commits. If your changes include an API update, and a new driver which uses that new API, separate those into two commits. On the other hand, if you make a single change to numerous files, group those changes into a single commit. Thus a single logical change is contained within a single commit. The point to remember is that each commit should make an easily understood change that can be verified by reviewers. Each commit should be justifiable on its own merits. When dividing your change into a series of commits, take special care to ensure that CRIU builds and runs properly after each commit in the series. Developers using `git bisect` to track down a problem can end up splitting your patch series at any point; they will not thank you if you introduce bugs in the middle. ## Sign your work To improve tracking of who did what, we ask you to sign off the commits in your fork of CRIU or the patches that are to be emailed. The sign-off is a simple line at the end of the explanation for the patch, which certifies that you wrote it or otherwise have the right to pass it on as an open-source patch. 
The rules are pretty simple: if you can certify the below: ### Developer's Certificate of Origin 1.1 By making a contribution to this project, I certify that: (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it. (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved. then you just add a line saying ``` Signed-off-by: Random J Developer ``` using your real name (please, no pseudonyms or anonymous contributions if it possible). Hint: you can use `git commit -s` to add Signed-off-by line to your commit message. To append such line to a commit you already made, use `git commit --amend -s`. ``` From: Random J Developer Subject: [PATCH] component: Short patch description Long patch description (could be skipped if patch is trivial enough) Signed-off-by: Random J Developer --- Patch body here ``` ## Submit your work upstream We accept GitHub pull requests and this is the preferred way to contribute to CRIU. 
For that you should push your work to your fork of CRIU at [GitHub](https://github.com) and create a [pull request](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) ### Pull request guidelines Pull request comment should contain description of the problem your changes solve and a brief outline of the changes included in the pull request. Please avoid pushing fixup commits to an existent pull request. Each commit should be self contained and there should not be fixup commits in a patch series. Pull requests that contain one commit which breaks something and another commit which fixes it, will be rejected. Please merge the fixup commits into the commits that has introduced the problem before creating a pull request. It may happen that the reviewers were not completely happy with your changes and requested changes to your patches. After you updated your changes please close the old pull request and create a new one that contains the following: * Description of the problem your changes solve and a brief outline of the changes * Link to the previous version of the pull request * Brief description of the changes between old and new versions of the pull request. If there were more than one previous pull request, all the revisions should be listed. For example: ``` v3: rebase on the current criu-dev v2: add commit to foo() and update bar() coding style ``` If there are only minor updates to the commits in a pull request, it is possible to force-push them into an existing pull request. This only applies to small changes and should be used with care. If you update an existing pull request, remember to add the description of the changes from the previous version. ### Mailing list submission Historically, CRIU worked with mailing lists and patches so if you still prefer this way continue reading till the end of this section. 
### Make a patch To create a patch, run ``` git format-patch --signoff origin/criu-dev ``` You might need to read GIT documentation on how to prepare patches for mail submission. Take a look at http://book.git-scm.com/ and/or http://git-scm.com/documentation for details. It should not be hard at all. We recommend to post patches using `git send-email` ``` git send-email --cover-letter --no-chain-reply-to --annotate \ --confirm=always --to=criu@openvz.org criu-dev ``` Note that the `git send-email` subcommand may not be in the main git package and using it may require installation of a separate package, for example the "git-email" package in Fedora and Debian. If this is your first time using git send-email, you might need to configure it to point it to your SMTP server with something like: ``` git config --global sendemail.smtpServer stmp.example.net ``` If you get tired of typing `--to=criu@openvz.org` all the time, you can configure that to be automatically handled as well: ``` git config sendemail.to criu@openvz.org ``` If a developer is sending another version of the patch (e.g. to address review comments), they are advised to note differences to previous versions after the `---` line in the patch so that it helps reviewers but doesn't become part of git history. Moreover, such patch needs to be prefixed correctly with `--subject-prefix=PATCHv2` appended to `git send-email` (substitute `v2` with the correct version if needed though). ### Mail patches The patches should be sent to CRIU development mailing list, `criu AT openvz.org`. Note that you need to be subscribed first in order to post. The list web interface is available at https://openvz.org/mailman/listinfo/criu; you can also use standard mailman aliases to work with it. Please make sure the email client you're using doesn't screw your patch (line wrapping and so on). 
> **Note:** When sending a patch set that consists of more than one patch, please, push your changes in your local repo and provide the URL of the branch in the cover-letter ### Wait for response Be patient. Most CRIU developers are pretty busy people so if there is no immediate response on your patch — don't be surprised, sometimes a patch may fly around a week before it gets reviewed. ## Continuous integration Wiki article: [Continuous integration](https://criu.org/Continuous_integration) CRIU tests are run for each series sent to the mailing list. If you get a message from our patchwork that patches failed to pass the tests, you have to investigate what is wrong. We also recommend you to [enable Travis CI for your repo](https://criu.org/Continuous_integration#Enable_Travis_CI_for_your_repo) to check patches in your git branch, before sending them to the mailing list. crac-criu-1.5.0/COPYING000066400000000000000000001305421471504326700144360ustar00rootroot00000000000000This software is licensed under the GNU GENERAL PUBLIC LICENCE Version 2. Except that any software in the lib/ directory is for the creation of a linkable library to the tools and is licensed under the GNU LESSER GENERAL PUBLIC LICENCE Version 2.1. Contributing Authors agree that their code is submitted under the licence appropriate for its location within the source tree (GPL except for LGPL in lib/) and agree that any future patches, provided they are accepted into the project, may change the licence of their code from GPL to LGPL by moving pieces of it into lib/ or LGPL to GPL by moving pieces of it out of lib/ Note that the only valid version of the GPL is THIS particular version of the license (ie v2, not v2.2 or v3.x or whatever), unless explicitly otherwise stated. ---------------------------------------- GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 
51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. 
Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. 
You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. 
If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. 
(This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. 
Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. 
Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. 
Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. 
Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. 
If this is what you want to do, use the GNU Library General Public License instead of this License. --------------------------------------- GNU LESSER GENERAL PUBLIC LICENSE Version 2.1, February 1999 Copyright (C) 1991, 1999 Free Software Foundation, Inc. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. [This is the first released version of the Lesser GPL. It also counts as the successor of the GNU Library Public License, version 2, hence the version number 2.1.] Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public Licenses are intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This license, the Lesser General Public License, applies to some specially designated software packages--typically libraries--of the Free Software Foundation and other authors who decide to use it. You can use it too, but we suggest you first think carefully about whether this license or the ordinary General Public License is the better strategy to use in any particular case, based on the explanations below. When we speak of free software, we are referring to freedom of use, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish); that you receive source code or can get it if you want it; that you can change the software and use pieces of it in new free programs; and that you are informed that you can do these things. To protect your rights, we need to make restrictions that forbid distributors to deny you these rights or to ask you to surrender these rights. These restrictions translate to certain responsibilities for you if you distribute copies of the library or if you modify it. 
For example, if you distribute copies of the library, whether gratis or for a fee, you must give the recipients all the rights that we gave you. You must make sure that they, too, receive or can get the source code. If you link other code with the library, you must provide complete object files to the recipients, so that they can relink them with the library after making changes to the library and recompiling it. And you must show them these terms so they know their rights. We protect your rights with a two-step method: (1) we copyright the library, and (2) we offer you this license, which gives you legal permission to copy, distribute and/or modify the library. To protect each distributor, we want to make it very clear that there is no warranty for the free library. Also, if the library is modified by someone else and passed on, the recipients should know that what they have is not the original version, so that the original author's reputation will not be affected by problems that might be introduced by others. Finally, software patents pose a constant threat to the existence of any free program. We wish to make sure that a company cannot effectively restrict the users of a free program by obtaining a restrictive license from a patent holder. Therefore, we insist that any patent license obtained for a version of the library must be consistent with the full freedom of use specified in this license. Most GNU software, including some libraries, is covered by the ordinary GNU General Public License. This license, the GNU Lesser General Public License, applies to certain designated libraries, and is quite different from the ordinary General Public License. We use this license for certain libraries in order to permit linking those libraries into non-free programs. When a program is linked with a library, whether statically or using a shared library, the combination of the two is legally speaking a combined work, a derivative of the original library. 
The ordinary General Public License therefore permits such linking only if the entire combination fits its criteria of freedom. The Lesser General Public License permits more lax criteria for linking other code with the library. We call this license the "Lesser" General Public License because it does Less to protect the user's freedom than the ordinary General Public License. It also provides other free software developers Less of an advantage over competing non-free programs. These disadvantages are the reason we use the ordinary General Public License for many libraries. However, the Lesser license provides advantages in certain special circumstances. For example, on rare occasions, there may be a special need to encourage the widest possible use of a certain library, so that it becomes a de-facto standard. To achieve this, non-free programs must be allowed to use the library. A more frequent case is that a free library does the same job as widely used non-free libraries. In this case, there is little to gain by limiting the free library to free software only, so we use the Lesser General Public License. In other cases, permission to use a particular library in non-free programs enables a greater number of people to use a large body of free software. For example, permission to use the GNU C Library in non-free programs enables many more people to use the whole GNU operating system, as well as its variant, the GNU/Linux operating system. Although the Lesser General Public License is Less protective of the users' freedom, it does ensure that the user of a program that is linked with the Library has the freedom and the wherewithal to run that program using a modified version of the Library. The precise terms and conditions for copying, distribution and modification follow. Pay close attention to the difference between a "work based on the library" and a "work that uses the library". 
The former contains code derived from the library, whereas the latter must be combined with the library in order to run. GNU LESSER GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License Agreement applies to any software library or other program which contains a notice placed by the copyright holder or other authorized party saying it may be distributed under the terms of this Lesser General Public License (also called "this License"). Each licensee is addressed as "you". A "library" means a collection of software functions and/or data prepared so as to be conveniently linked with application programs (which use some of those functions and data) to form executables. The "Library", below, refers to any such software library or work which has been distributed under these terms. A "work based on the Library" means either the Library or any derivative work under copyright law: that is to say, a work containing the Library or a portion of it, either verbatim or with modifications and/or translated straightforwardly into another language. (Hereinafter, translation is included without limitation in the term "modification".) "Source code" for a work means the preferred form of the work for making modifications to it. For a library, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the library. Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running a program using the Library is not restricted, and output from such a program is covered only if its contents constitute a work based on the Library (independent of the use of the Library in a tool for writing it). Whether that is true depends on what the Library does and what the program that uses the Library does. 1. 
You may copy and distribute verbatim copies of the Library's complete source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and distribute a copy of this License along with the Library. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Library or any portion of it, thus forming a work based on the Library, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) The modified work must itself be a software library. b) You must cause the files modified to carry prominent notices stating that you changed the files and the date of any change. c) You must cause the whole of the work to be licensed at no charge to all third parties under the terms of this License. d) If a facility in the modified Library refers to a function or a table of data to be supplied by an application program that uses the facility, other than as an argument passed when the facility is invoked, then you must make a good faith effort to ensure that, in the event an application does not supply such function or table, the facility still operates, and performs whatever part of its purpose remains meaningful. (For example, a function in a library to compute square roots has a purpose that is entirely well-defined independent of the application. Therefore, Subsection 2d requires that any application-supplied function or table used by this function must be optional: if the application does not supply it, the square root function must still compute square roots.) These requirements apply to the modified work as a whole. 
If identifiable sections of that work are not derived from the Library, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Library, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Library. In addition, mere aggregation of another work not based on the Library with the Library (or with a work based on the Library) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may opt to apply the terms of the ordinary GNU General Public License instead of this License to a given copy of the Library. To do this, you must alter all the notices that refer to this License, so that they refer to the ordinary GNU General Public License, version 2, instead of to this License. (If a newer version than version 2 of the ordinary GNU General Public License has appeared, then you can specify that version instead if you wish.) Do not make any other change in these notices. Once this change is made in a given copy, it is irreversible for that copy, so the ordinary GNU General Public License applies to all subsequent copies and derivative works made from that copy. This option is useful when you wish to copy part of the code of the Library into a program that is not a library. 4. 
You may copy and distribute the Library (or a portion or derivative of it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange. If distribution of object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place satisfies the requirement to distribute the source code, even though third parties are not compelled to copy the source along with the object code. 5. A program that contains no derivative of any portion of the Library, but is designed to work with the Library by being compiled or linked with it, is called a "work that uses the Library". Such a work, in isolation, is not a derivative work of the Library, and therefore falls outside the scope of this License. However, linking a "work that uses the Library" with the Library creates an executable that is a derivative of the Library (because it contains portions of the Library), rather than a "work that uses the library". The executable is therefore covered by this License. Section 6 states terms for distribution of such executables. When a "work that uses the Library" uses material from a header file that is part of the Library, the object code for the work may be a derivative work of the Library even though the source code is not. Whether this is true is especially significant if the work can be linked without the Library, or if the work is itself a library. The threshold for this to be true is not precisely defined by law. 
If such an object file uses only numerical parameters, data structure layouts and accessors, and small macros and small inline functions (ten lines or less in length), then the use of the object file is unrestricted, regardless of whether it is legally a derivative work. (Executables containing this object code plus portions of the Library will still fall under Section 6.) Otherwise, if the work is a derivative of the Library, you may distribute the object code for the work under the terms of Section 6. Any executables containing that work also fall under Section 6, whether or not they are linked directly with the Library itself. 6. As an exception to the Sections above, you may also combine or link a "work that uses the Library" with the Library to produce a work containing portions of the Library, and distribute that work under terms of your choice, provided that the terms permit modification of the work for the customer's own use and reverse engineering for debugging such modifications. You must give prominent notice with each copy of the work that the Library is used in it and that the Library and its use are covered by this License. You must supply a copy of this License. If the work during execution displays copyright notices, you must include the copyright notice for the Library among them, as well as a reference directing the user to the copy of this License. Also, you must do one of these things: a) Accompany the work with the complete corresponding machine-readable source code for the Library including whatever changes were used in the work (which must be distributed under Sections 1 and 2 above); and, if the work is an executable linked with the Library, with the complete machine-readable "work that uses the Library", as object code and/or source code, so that the user can modify the Library and then relink to produce a modified executable containing the modified Library. 
(It is understood that the user who changes the contents of definitions files in the Library will not necessarily be able to recompile the application to use the modified definitions.) b) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (1) uses at run time a copy of the library already present on the user's computer system, rather than copying library functions into the executable, and (2) will operate properly with a modified version of the library, if the user installs one, as long as the modified version is interface-compatible with the version that the work was made with. c) Accompany the work with a written offer, valid for at least three years, to give the same user the materials specified in Subsection 6a, above, for a charge no more than the cost of performing this distribution. d) If distribution of the work is made by offering access to copy from a designated place, offer equivalent access to copy the above specified materials from the same place. e) Verify that the user has already received a copy of these materials or that you have already sent this user a copy. For an executable, the required form of the "work that uses the Library" must include any data and utility programs needed for reproducing the executable from it. However, as a special exception, the materials to be distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. It may happen that this requirement contradicts the license restrictions of other proprietary libraries that do not normally accompany the operating system. Such a contradiction means you cannot use both them and the Library together in an executable that you distribute. 7. 
You may place library facilities that are a work based on the Library side-by-side in a single library together with other library facilities not covered by this License, and distribute such a combined library, provided that the separate distribution of the work based on the Library and of the other library facilities is otherwise permitted, and provided that you do these two things: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities. This must be distributed under the terms of the Sections above. b) Give prominent notice with the combined library of the fact that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 8. You may not copy, modify, sublicense, link with, or distribute the Library except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense, link with, or distribute the Library is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 9. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Library or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Library (or any work based on the Library), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Library or works based on it. 10. Each time you redistribute the Library (or any work based on the Library), the recipient automatically receives a license from the original licensor to copy, distribute, link with or modify the Library subject to these terms and conditions. 
You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties with this License. 11. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Library at all. For example, if a patent license would not permit royalty-free redistribution of the Library by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Library. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply, and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 12. 
If the distribution and/or use of the Library is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Library under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 13. The Free Software Foundation may publish revised and/or new versions of the Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Library does not specify a license version number, you may choose any version ever published by the Free Software Foundation. 14. If you wish to incorporate parts of the Library into other free programs whose distribution conditions are incompatible with these, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Libraries If you develop a new library, and you want it to be of the greatest possible use to the public, we recommend making it free software that everyone can redistribute and change. You can do so by permitting redistribution under these terms (or, alternatively, under the terms of the ordinary General Public License). To apply these terms, attach the following notices to the library. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. 
Copyright (C) This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Also add information on how to contact you by electronic and paper mail. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the library, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the library `Frob' (a library for tweaking knobs) written by James Random Hacker. , 1 April 1990 Ty Coon, President of Vice That's all there is to it! crac-criu-1.5.0/CREDITS000066400000000000000000000004701471504326700144170ustar00rootroot00000000000000The following people provided invaluable help to CRIU project (in alphabetical order) ------------------------------------------------------------------- Andrew Morton David Miller Eric Dumazet Eric W. Biederman H. 
Peter Anvin Kees Cook KOSAKI Motohiro Li Yu Linus Torvalds Oleg Nesterov Serge Hallyn Tejun Heo crac-criu-1.5.0/Documentation/000077500000000000000000000000001471504326700162075ustar00rootroot00000000000000crac-criu-1.5.0/Documentation/.gitattributes000066400000000000000000000000211471504326700210730ustar00rootroot00000000000000*.txt whitespace crac-criu-1.5.0/Documentation/.gitignore000066400000000000000000000000531471504326700201750ustar00rootroot00000000000000*.xml *.html *.[1-8] *.pdf *.ps footer.txt crac-criu-1.5.0/Documentation/HOWTO.cross-compile000066400000000000000000000036561471504326700216220ustar00rootroot00000000000000How to cross-compile CRIU on x86: Use the Dockerfile provided: scripts/build/Dockerfile.armv7-cross Historical guide how-to do it without docker container: [Unsupported, may not work anymore!] 1. Download the protobuf sources. 2. Apply the patch http://16918.selcdn.ru/crtools/aarch64/0001-protobuf-added-the-support-for-the-acrchitecture-AAr.patch 3. Configure protobuf to be compiled for the target architecture: ./configure --prefix=$X86_PREFIX --disable-shared --enable-static 4. Compile protobuf. 5. Download protobuf-c sources. 6. Configure protobuf-c for the architecture x86: export PATH=$PATH:$X86_PREFIX/bin export PKG_CONFIG_PATH=$X86_PREFIX/lib/pkgconfig CPPFLAGS=`pkg-config --cflags protobuf` LDFLAGS=`pkg-config --libs protobuf` ./configure --prefix=$X86_PREFIX --disable-shared --enable-static 7. Compile and install protobuf-c. 8. Configure protobuf to be compiled for the target architecture: ./configure --prefix=$ARCH_PREFIX --disable-shared --enable-static --with-protoc=protoc --host=$TARGET 9. Compile and install protobuf. 10. Let PKG_CONFIG_PATH=$ARCH_PREFIX/lib/pkgconfig. 11. Configure protobuf-c to be compiled for the target architecture: CPPFLAGS=`pkg-config --cflags protobuf` LDFLAGS=`pkg-config --libs protobuf` ./configure --prefix=$ARCH_PREFIX --disable-shared --enable-static --disable-protoc --host=$TARGET 12. 
Compile and install protobuf-c. 13. Compile CRIU: ARCH= CROSS_COMPILE=$TARGET- CFLAGS=`pkg-config --cflags libprotobuf-c` LDFLAGS="`pkg-config --libs libprotobuf-c`" make Special notes for Android NDK cross compile: 1, Android NDK doesn't have some headers required by CRIU build, they are , 2, Android NDK doesn't have some function required by CRIU build, they are aio*, fanotify_init, fanotify_mark, povit_root, index. 3, in order to pass build with Android NDK, you implement them yourself, and link them to CRIU. crac-criu-1.5.0/Documentation/Makefile000066400000000000000000000050161471504326700176510ustar00rootroot00000000000000__nmk_dir ?= ../scripts/nmk/scripts/ include $(__nmk_dir)include.mk include $(__nmk_dir)macro.mk ifneq ($(USE_ASCIIDOCTOR),) ASCIIDOC := asciidoctor XMLTO := else ASCIIDOC := asciidoc XMLTO := xmlto endif FOOTER := footer.txt SRC1 += crit.txt SRC1 += criu-ns.txt SRC1 += compel.txt SRC1 += criu-amdgpu-plugin.txt SRC8 += criu.txt SRC := $(SRC1) $(SRC8) XMLS := $(patsubst %.txt,%.xml,$(SRC)) MAN1S := $(patsubst %.txt,%.1,$(SRC1)) MAN8S := $(patsubst %.txt,%.8,$(SRC8)) MANS := $(MAN1S) $(MAN8S) MAN1DIR := $(MANDIR)/man1 MAN8DIR := $(MANDIR)/man8 GROFF :=groff PAPER :=$(shell paperconf 2>/dev/null || echo letter) GROFF_OPTS := -Tps -t -dpaper=$(PAPER) -P-p$(PAPER) -man -msafer -rC1 -rD1 -rS11 PSS := $(patsubst %,%.ps,$(basename $(MANS))) PDFS := $(patsubst %,%.pdf,$(basename $(MANS))) all: check $(MANS) ps: $(PSS) pdf: $(PDFS) .PHONY: all ps pdf check check: $(Q) for B in $(ASCIIDOC) $(XMLTO); do \ $$B --version > /dev/null || exit 1; \ done ifeq ($(CRIU_VERSION),) include ../Makefile.versions endif $(FOOTER): ../Makefile.versions $(call msg-gen, $@) $(Q) echo ":doctype: manpage" > $@ $(Q) echo ":man source: criu" >> $@ $(Q) echo ":man version: $(CRIU_VERSION)" >> $@ $(Q) echo ":man manual: CRIU Manual" >> $@ %.1: %.txt $(FOOTER) custom.xsl $(call msg-gen, $@) ifneq ($(USE_ASCIIDOCTOR),) $(Q) $(ASCIIDOC) -b manpage -d manpage -o $@ $< else $(Q) 
$(ASCIIDOC) -b docbook -d manpage -o $(patsubst %.1,%.xml,$@) $< $(Q) $(XMLTO) man -m custom.xsl $(patsubst %.1,%.xml,$@) endif %.8: %.txt $(FOOTER) custom.xsl $(call msg-gen, $@) ifneq ($(USE_ASCIIDOCTOR),) $(Q) $(ASCIIDOC) -b manpage -d manpage -o $@ $< else $(Q) $(ASCIIDOC) -b docbook -d manpage -o $(patsubst %.8,%.xml,$@) $< $(Q) $(XMLTO) man -m custom.xsl $(patsubst %.8,%.xml,$@) endif %.ps: %.1 $(call msg-gen, $@) $(Q) $(GROFF) $(GROFF_OPTS) $^ > $@ %.ps: %.8 $(call msg-gen, $@) $(Q) $(GROFF) $(GROFF_OPTS) $^ > $@ %.pdf: %.ps $(call msg-gen, $@) $(Q) ps2pdf $< $@ clean: $(call msg-clean, "Documentation") $(Q) rm -f $(XMLS) $(MANS) $(PSS) $(PDFS) $(FOOTER) install: check $(MANS) $(E) " INSTALL " $(MAN8S) $(Q) mkdir -p $(DESTDIR)$(MAN8DIR) $(Q) install -m 644 $(MAN8S) $(DESTDIR)$(MAN8DIR) $(E) " INSTALL " $(MAN1S) $(Q) mkdir -p $(DESTDIR)$(MAN1DIR) $(Q) install -m 644 $(MAN1S) $(DESTDIR)$(MAN1DIR) uninstall: $(E) " UNINSTALL" $(MAN1S) $(Q) $(RM) $(addprefix $(DESTDIR)$(MAN1DIR)/,$(MAN1S)) $(E) " UNINSTALL" $(MAN8S) $(Q) $(RM) $(addprefix $(DESTDIR)$(MAN8DIR)/,$(MAN8S)) .PHONY: clean install uninstall crac-criu-1.5.0/Documentation/asciidoc.conf000066400000000000000000000000011471504326700206230ustar00rootroot00000000000000 crac-criu-1.5.0/Documentation/compel.txt000066400000000000000000000065741471504326700202430ustar00rootroot00000000000000COMPEL(1) ========== include::footer.txt[] NAME ---- compel - Execute parasitic code within another process. SYNOPSIS -------- *compel* 'hgen' ['option' ...] *compel* 'plugins' ['PLUGIN_NAME' ...] *compel* ['--compat'] 'includes' | 'cflags' | 'ldflags' *compel* ['--compat'] ['--static'] 'libs' DESCRIPTION ------------ *compel* is a utility to execute arbitrary code, also called parasite code, in the context of a foreign process. The parasitic code, once compiled with compel flags and packed, can be executed in the context of other tasks. 
Currently there is only one way to load the parasitic blob into victim task using libcompel.a, called c-header. ARGUMENTS ---------- Positional Arguments ~~~~~~~~~~~~~~~~~~~~ *hgen*:: create a header from the .po file, which is the parasite binary. *plugins*:: prints the plugins available. *ldflags*:: prints the ldflags available to compel during linking of parasite code. *cflags*:: prints the compel cflags to be used during compilation of parasitic code. *includes*:: prints list of standard include directories. *libs*:: prints list of static or dynamic libraries that compel can link with. OPTIONS -------- *-f*, *--file* 'FILE':: Path to the binary file, 'FILE', which *compel* must turn into a header *-o*, *--output* 'FILE':: Path to the header file, 'FILE', where compel must write the resulting header. *-p*, *--prefix* 'NAME':: Specify prefix for var names *-l*, *--log-level* 'NUM':: Default log level of compel. *-h*, *--help*:: Prints usage and exits. *-V*, *--version*:: Prints version number of compel. SOURCE EXAMPLES ---------------- Parasitic Code ~~~~~~~~~~~~~~ *#include * *int parasite_trap_cmd(int cmd, void *args);* //gets called by compel_run_in_thread() *int parasite_daemon_cmd(int cmd, void *arg);* // gets called by compel_rpc_call() and compel_rpc_call_sync() *void parasite_cleanup(void);* //gets called on parasite unload by compel_cure() Infecting code ~~~~~~~~~~~~~~ The parasitic code is compiled and converted to a header using *compel*, and included here. 
*#include * *#include "parasite.h"* Following steps are performed to infect the victim process: - stop the task: *int compel_stop_task(int pid);* - prepare infection handler: *struct parasite_ctl *compel_prepare(int pid);* - execute system call: *int compel_syscall(ctl, int syscall_nr, long *ret, int arg ...);* - infect victim: *int compel_infect(ctl, nr_thread, size_of_args_area);* - cure the victim: *int compel_cure(ctl);* //ctl pointer is freed by this call - Resume victim: *int compel_resume_task(pid, orig_state, state)* or *int compel_resume_task_sig(pid, orig_state, state, stop_signo).* //compel_resume_task_sig() could be used in case when victim is in stopped state. stop_signo could be read by calling compel_parse_stop_signo(). *ctl* must be configured with blob information by calling *PREFIX_setup_c_header()*, with ctl as its argument. *PREFIX* is the argument given to *-p* when calling hgen, else it is deduced from file name. EXAMPLES --------- To generate a header file(.h) from a parasite binary file(.po) use: ---------- compel hgen -f parasite.po -o parasite.h ---------- 'parasite.po' file is obtained by compiling the parasite source with compel flags and linking it with the compel plugins. AUTHOR ------ The CRIU team. crac-criu-1.5.0/Documentation/crit.txt000066400000000000000000000014601471504326700177120ustar00rootroot00000000000000CRIT(1) ======= include::footer.txt[] NAME ---- crit - CRiu Image Tool SYNOPSIS -------- *crit* 'decode' [-h] [-i IN] [-o OUT] [--pretty] *crit* 'encode' [-h] [-i IN] [-o OUT] *crit* 'info' [-h] in *crit* 'x' [-h] dir {ps,fds,mems} *crit* 'show' [-h] in DESCRIPTION ----------- *crit* is a feature-rich replacement for existing *criu* show. 
ARGUMENTS --------- Positional Arguments ~~~~~~~~~~~~~~~~~~~~ *decode*:: convert *criu* image from binary type JSON *encode*:: convert *criu* image from JSON type to binary *info*:: show info about image *x*:: explore image directory *show*:: convert *criu* image from binary to human-readable JSON Optional Arguments ~~~~~~~~~~~~~~~~~~ *-h*, *--help*:: Print some help and exit SEE ALSO -------- criu(8) AUTHOR ------ The CRIU team crac-criu-1.5.0/Documentation/criu-amdgpu-plugin.txt000066400000000000000000000063341471504326700224670ustar00rootroot00000000000000ROCM Support(1) =============== NAME ---- criu-amdgpu-plugin - A plugin extension to CRIU to support checkpoint/restore in userspace for AMD GPUs. CURRENT SUPPORT --------------- Single and Multi GPU systems (Gfx9) Checkpoint / Restore on different system Checkpoint / Restore inside a docker container Pytorch Tensorflow Using CRIU Image Streamer DESCRIPTION ----------- Though *criu* is a great tool for checkpointing and restoring running applications, it has certain limitations such as it cannot handle applications that have device files open. In order to support *ROCm* based workloads with *criu* we need to augment criu's core functionality with a plugin based extension mechanism. *criu-amdgpu-plugin* provides the necessary support to criu to allow Checkpoint / Restore with ROCm. Dependencies ~~~~~~~~~~~~~~ *amdkfd support*:: In order to snapshot the *VRAM* and other *GPU* device states, we require an updated version of amdkfd(amdgpu) driver. The kernel patches are under review currently. *criu 3.16*:: This work is rebased on latest criu release available at this time. OPTIONS ------- Optional parameters can be passed in as environment variables before executing criu command. *KFD_FW_VER_CHECK*:: Enable or disable firmware version check. If enabled, firmware version on restored gpu needs to be greater than or equal firmware version on checkpointed GPU. 
Default:Enabled E.g: KFD_FW_VER_CHECK=0 *KFD_SDMA_FW_VER_CHECK*:: Enable or disable SDMA firmware version check. If enabled, SDMA firmware version on restored gpu needs to be greater than or equal firmware version on checkpointed GPU. Default:Enabled E.g: KFD_SDMA_FW_VER_CHECK=0 *KFD_CACHES_COUNT_CHECK*:: Enable or disable caches count check. If enabled, the caches count on restored GPU needs to be greater than or equal caches count on checkpointed GPU. Default:Enabled E.g: KFD_CACHES_COUNT_CHECK=0 *KFD_NUM_GWS_CHECK*:: Enable or disable num_gws check. If enabled, the num_gws on restored GPU needs to be greater than or equal num_gws on checkpointed GPU. Default:Enabled E.g: KFD_NUM_GWS_CHECK=0 *KFD_VRAM_SIZE_CHECK*:: Enable or disable VRAM size check. If enabled, the VRAM size on restored GPU needs to be greater than or equal VRAM size on checkpointed GPU. Default:Enabled E.g: KFD_VRAM_SIZE_CHECK=0 *KFD_NUMA_CHECK*:: Enable or disable NUMA CPU region check. If enabled, the plugin will restore GPUs that belong to one CPU NUMA region to the same CPU NUMA region. Default:Enabled E.g: KFD_NUMA_CHECK=1 *KFD_CAPABILITY_CHECK*:: Enable or disable capability check. If enabled, the capability on restored GPU needs to be equal to the capability on the checkpointed GPU. Default:Enabled E.g: KFD_CAPABILITY_CHECK=1 *KFD_MAX_BUFFER_SIZE*:: On some systems, VRAM sizes may exceed RAM sizes, and so buffers for dumping and restoring VRAM may be unable to fit. Set to a nonzero value (in bytes) to set a limit on the plugin's memory usage. Default:0 (Disabled) E.g: KFD_MAX_BUFFER_SIZE="2G" AUTHOR ------ The AMDKFD team. COPYRIGHT --------- Copyright \(C) 2020-2021, Advanced Micro Devices, Inc. 
(AMD) crac-criu-1.5.0/Documentation/criu-ns.txt000066400000000000000000000011451471504326700203310ustar00rootroot00000000000000CRIU-NS(1) ========== include::footer.txt[] NAME ---- criu-ns - run criu in different namespaces SYNOPSIS -------- *criu-ns* 'dump' -t PID [] *criu-ns* 'pre-dump' -t PID [] *criu-ns* 'restore' [] *criu-ns* 'check' [] DESCRIPTION ----------- The *criu-ns* command executes 'criu' in a new PID and mount namespace. The purpose of this wrapper script is to enable restoring a process tree that might require a specific PID that is already used on the system; so called "PID mismatch" problem. SEE ALSO -------- nsenter(1) namespaces(7) criu(8) AUTHOR ------ The CRIU team crac-criu-1.5.0/Documentation/criu.txt000066400000000000000000001106161471504326700177170ustar00rootroot00000000000000CRIU(8) ======= include::footer.txt[] NAME ---- criu - checkpoint/restore in userspace SYNOPSIS -------- *criu* 'command' ['option' ...] DESCRIPTION ----------- *criu* is a tool for checkpointing and restoring running applications. It does this by saving their state as a collection of files (see the *dump* command) and creating equivalent processes from those files (see the *restore* command). The restore operation can be performed at a later time, on a different system, or both. OPTIONS ------- Most of the long flags can be prefixed with *no-* to negate the option (example: *--display-stats* and *--no-display-stats*). Common options ~~~~~~~~~~~~~~ Common options are applicable to any 'command'. *-v*[*v*...], *--verbosity*:: Increase verbosity up from the default level. In case of short option, multiple *v* can be used, each increasing verbosity by one. **-v**__num__, **--verbosity=**__num__:: Set verbosity level to _num_. The higher the level, the more output is produced. 
+ The following levels are available: * *-v0* no output; * *-v1* only errors; * *-v2* above plus warnings (this is the default level); * *-v3* above plus information messages and timestamps; * *-v4* above plus lots of debug. *--config* 'file':: Pass a specific configuration file to criu. *--no-default-config*:: Disable parsing of default configuration files. *--pidfile* 'file':: Write root task, service or page-server pid into a 'file'. *-o*, *--log-file* 'file':: Write logging messages to a 'file'. *--display-stats*:: During dump, as well as during restore, *criu* collects some statistics, like the time required to dump or restore the process, or the number of pages dumped or restored. This information is always saved to the *stats-dump* and *stats-restore* files, and can be shown using *crit*(1). The option *--display-stats* prints out this information on the console at the end of a dump or restore operation. *-D*, *--images-dir* 'path':: Use 'path' as a base directory where to look for sets of image files. *--stream*:: dump/restore images using criu-image-streamer. See https://github.com/checkpoint-restore/criu-image-streamer for detailed usage. *--prev-images-dir* 'path':: Use 'path' as a parent directory where to look for sets of image files. This option makes sense in case of incremental dumps. *-W*, *--work-dir* 'dir':: Use directory 'dir' for putting logs, pidfiles and statistics. If not specified, 'path' from *-D* option is taken. *--close* 'fd':: Close file descriptor 'fd' before performing any actions. *-L*, *--libdir* 'path':: Path to plugins directory. *--enable-fs* ['fs'[,'fs'...]]:: Specify a comma-separated list of filesystem names that should be auto-detected. The value 'all' enables auto-detection for all filesystems. + Note: This option is not safe, use at your own risk. Auto-detecting a filesystem mount assumes that the mountpoint can be restored with *mount(src, mountpoint, flags, options)*. 
When used, *dump* is expected to always succeed if a mountpoint is to be auto-detected, however *restore* may fail (or do something wrong) if the assumption for restore logic is incorrect. This option is not compatible with *--external* *dev*. *--action-script* 'script':: Add an external action script to be executed at certain stages. The environment variable *CRTOOLS_SCRIPT_ACTION* is available to the script to find out which action is being executed, and its value can be one of the following: *pre-dump*::: run prior to beginning a *dump* *post-dump*::: run upon *dump* completion *pre-restore*::: run prior to beginning a *restore* *post-restore*::: run upon *restore* completion *pre-resume*::: run when all processes and resources are restored but tasks are stopped waiting for final kick to run. Must not fail. *post-resume*::: called at the very end, when everything is restored and processes were resumed *network-lock*::: run to lock network in a target network namespace *network-unlock*::: run to unlock network in a target network namespace *setup-namespaces*::: run once root task has just been created with required namespaces. Note it is an early stage of restore, when nothing is restored yet, except for namespaces themselves *post-setup-namespaces*::: called after the namespaces are configured *orphan-pts-master*::: called after master pty is opened and unlocked. This hook can be used only in the RPC mode, and the notification message contains a file descriptor for the master pty *query-ext-files*::: called after the process tree is stopped and network is locked. This hook is used only in the RPC mode. The notification reply contains file ids to be added to external file list (may be empty). *--unprivileged*:: This option tells *criu* to accept the limitations when running as non-root. Running as non-root requires *criu* at least to have *CAP_SYS_ADMIN* or *CAP_CHECKPOINT_RESTORE*. 
For details about running *criu* as non-root please consult the *NON-ROOT* section. *-V*, *--version*:: Print program version and exit. *-h*, *--help*:: Print some help and exit. *pre-dump* ~~~~~~~~~~ Performs the pre-dump procedure, during which *criu* creates a snapshot of memory changes since the previous *pre-dump*. Note that during this *criu* also creates the fsnotify cache which speeds up the *restore* procedure. *pre-dump* requires at least *-t* option (see *dump* below). In addition, *page-server* options may be specified. *--track-mem*:: Turn on memory changes tracker in the kernel. If the option is not passed the memory tracker get turned on implicitly. *--pre-dump-mode*='mode':: There are two 'mode' to operate pre-dump algorithm. The 'splice' mode is parasite based, whereas 'read' mode is based on process_vm_readv syscall. The 'read' mode incurs reduced frozen time and reduced memory pressure as compared to 'splice' mode. Default is 'splice' mode. *dump* ~~~~~~ Performs a checkpoint procedure. *-t*, *--tree* 'pid':: Checkpoint the whole process tree starting from 'pid'. *-R*, *--leave-running*:: Leave tasks in running state after checkpoint, instead of killing. This option is pretty dangerous and should be used only if you understand what you are doing. + Note if task is about to run after been checkpointed, it can modify TCP connections, delete files and do other dangerous actions. Therefore, *criu* can not guarantee that the next *restore* action will succeed. Most likely if this option is used, at least the file system snapshot must be made with the help of *post-dump* action script. + In other words, do not use it unless really needed. *-s*, *--leave-stopped*:: Leave tasks in stopped state after checkpoint, instead of killing. *--external* __type__**[**__id__**]:**__value__:: Dump an instance of an external resource. 
The generic syntax is 'type' of resource, followed by resource 'id' (enclosed in literal square brackets), and optional 'value' (prepended by a literal colon). The following resource types are currently supported: *mnt*, *dev*, *file*, *tty*, *unix*. Syntax depends on type. Note to restore external resources, either *--external* or *--inherit-fd* is used, depending on resource type. *--external* **mnt[**__mountpoint__**]:**__name__:: Dump an external bind mount referenced by 'mountpoint', saving it to image under the identifier 'name'. *--external* **mnt[]:**__flags__:: Dump all external bind mounts, autodetecting those. Optional 'flags' can contain *m* to also dump external master mounts, *s* to also dump external shared mounts (default behavior is to abort dumping if such mounts are found). If 'flags' are not provided, colon is optional. *--external* **dev[**__major__**/**__minor__**]:**__name__:: Allow to dump a mount namespace having a real block device mounted. A block device is identified by its 'major' and 'minor' numbers, and *criu* saves its information to image under the identifier 'name'. *--external* **file[**__mnt_id__**:**__inode__**]**:: Dump an external file, i.e. an opened file that is can not be resolved from the current mount namespace, which can not be dumped without using this option. The file is identified by 'mnt_id' (a field obtained from **/proc/**__pid__**/fdinfo/**__N__) and 'inode' (as returned by *stat*(2)). *--external* **tty[**__rdev__**:**__dev__**]**:: Dump an external TTY, identified by *st_rdev* and *st_dev* fields returned by *stat*(2). *--external* **unix[**__id__**]**:: Tell *criu* that one end of a pair of UNIX sockets (created by *socketpair*(2)) with the given _id_ is OK to be disconnected. *--external* **net[**__inode__**]:**__name__:: Mark a network namespace as external and do not include it in the checkpoint. 
The label 'name' can be used with *--inherit-fd* during restore to specify a file descriptor to a preconfigured network namespace. *--external* **pid[**__inode__**]:**__name__:: Mark a PID namespace as external. This can be later used to restore a process into an existing PID namespace. The label 'name' can be used to assign another PID namespace during restore with the help of *--inherit-fd*. *--freeze-cgroup*:: Use cgroup freezer to collect processes. *--manage-cgroups*:: Collect cgroups into the image thus they gonna be restored then. Without this option, *criu* will not save cgroups configuration associated with a task. *--cgroup-props* 'spec':: Specify controllers and their properties to be saved into the image file. *criu* predefines specifications for common controllers, but since the kernel can add new controllers and modify their properties, there should be a way to specify ones matched the kernel. + 'spec' argument describes the controller and properties specification in a simplified YAML form: + ---------- "c1": - "strategy": "merge" - "properties": ["a", "b"] "c2": - "strategy": "replace" - "properties": ["c", "d"] ---------- + where 'c1' and 'c2' are controllers names, and 'a', 'b', 'c', 'd' are their properties. + Note the format: double quotes, spaces and new lines are required. The 'strategy' specifies what to do if a controller specified already exists as a built-in one: *criu* can either *merge* or *replace* such. + For example, the command line for the above example should look like this: + ---------- --cgroup-props "\"c1\":\n - \"strategy\": \"merge\"\n - \"properties\": [\"a\", \"b\"]\n \"c2\":\n - \"strategy\": \"replace\"\n - \"properties\": [\"c\", \"d\"]" ---------- *--cgroup-props-file* 'file':: Same as *--cgroup-props*, except the specification is read from the 'file'. *--cgroup-dump-controller* 'name':: Dump a controller with 'name' only, skipping anything else that was discovered automatically (usually via */proc*). 
This option is useful when one needs *criu* to skip some controllers. *--cgroup-yard* 'path':: Instead of trying to mount cgroups in CRIU, provide a path to a directory with already created cgroup yard. Useful if you don't want to grant CAP_SYS_ADMIN to CRIU. For every cgroup mount there should be exactly one directory. If there is only one controller in this mount, the dir's name should be just the name of the controller. If there are multiple controllers comounted, the directory name should have them be separated by a comma. + For example, if */proc/cgroups* looks like this: + ---------- #subsys_name hierarchy num_cgroups enabled cpu 1 1 1 devices 2 2 1 freezer 2 2 1 ---------- + then you can create the cgroup yard by the following commands: + ---------- mkdir private_yard cd private_yard mkdir cpu mount -t cgroup -o cpu none cpu mkdir devices,freezer mount -t cgroup -o devices,freezer none devices,freezer ---------- *--tcp-established*:: Checkpoint established TCP connections. *--tcp-close*:: Don't dump the state of, or block, established tcp connections (including the connection is once established but now closed). This is useful when tcp connections are not going to be restored. *--skip-in-flight*:: This option skips in-flight TCP connections. If any TCP connections that are not yet completely established are found, *criu* ignores these connections, rather than errors out. The TCP stack on the client side is expected to handle the re-connect gracefully. *--evasive-devices*:: Use any path to a device file if the original one is inaccessible. *--page-server*:: Send pages to a page server (see the *page-server* command). *--force-irmap*:: Force resolving names for inotify and fsnotify watches. *--auto-dedup*:: Deduplicate "old" data in pages images of previous *dump*. This option implies incremental *dump* mode (see the *pre-dump* command). *-l*, *--file-locks*:: Dump file locks. 
It is necessary to make sure that all file lock users are taken into dump, so it is only safe to use this for enclosed containers where locks are not held by any processes outside of dumped process tree. *--link-remap*:: Allows to link unlinked files back, if possible (modifies filesystem during *restore*). *--timeout* 'number':: Set a time limit in seconds for collecting tasks during the dump operation. The timeout is 10 seconds by default. *--ghost-limit* 'size':: Set the maximum size of deleted file to be carried inside image. By default, up to 1M file is allowed. Using this option allows to not put big deleted files inside images. Argument 'size' may be postfixed with a *K*, *M* or *G*, which stands for kilo-, mega, and gigabytes, accordingly. *--ghost-fiemap*:: Enable an optimization based on fiemap ioctl that can reduce the number of system calls used when checkpointing highly sparse ghost files. This option is enabled by default, and it can be disabled with *--no-ghost-fiemap*. An automatic fallback to SEEK_HOLE/SEEK_DATA is used when fiemap is not supported. *-j*, *--shell-job*:: Allow one to dump shell jobs. This implies the restored task will inherit session and process group ID from the *criu* itself. This option also allows to migrate a single external tty connection, to migrate applications like *top*. If used with *dump* command, it must be specified with *restore* as well. *--cpu-cap* ['cap'[,'cap'...]]:: Specify CPU capabilities to write to an image file. The argument is a comma-separated list of: + - *none* to ignore capabilities at all; the image will not be produced on dump, neither any check performed on restore; - *fpu* to check if FPU module is compatible; - *ins* to check if CPU supports all instructions required; - *cpu* to check if CPU capabilities are exactly matching; - *all* for all above set. + By default the option is set to *fpu* and *ins*. 
*--cgroup-root* ['controller':]/'newroot':: Change the root for the controller that will be dumped. By default, *criu* simply dumps everything below where any of the tasks live. However, if a container moves all of its tasks into a cgroup directory below the container engine's default directory for tasks, permissions will not be preserved on the upper directories with no tasks in them, which may cause problems. *--lazy-pages*:: Perform the dump procedure without writing memory pages into the image files and prepare to service page requests over the network. When *dump* runs in this mode it presumes that *lazy-pages* daemon will connect to it and fetch memory pages to lazily inject them into the restored process address space. This option is intended for post-copy (lazy) migration and should be used in conjunction with *restore* with appropriate options. *--file-validation* ['mode']:: Set the method to be used to validate open files. Validation is done to ensure that the version of the file being restored is the same version when it was dumped. + The 'mode' may be one of the following: *filesize*::: To explicitly use only the file size check all the time. This is the fastest and least intensive check. *buildid*::: To validate ELF files with their build-ID. If the build-ID cannot be obtained, 'chksm-first' method will be used. This is the default if mode is unspecified. *--network-lock* ['mode']:: Set the method to be used for network locking/unlocking. Locking is done to ensure that tcp packets are dropped between dump and restore. This is done to avoid the kernel sending RST when a packet arrives destined for the dumped process. + The 'mode' may be one of the following: *iptables*::: Use iptables rules to drop the packets. This is the default if 'mode' is not specified. *nftables*::: Use nftables rules to drop the packets. *skip*::: Don't lock the network. If *--tcp-close* is not used, the network must be locked externally to allow CRIU to dump TCP connections. 
*restore* ~~~~~~~~~ Restores previously checkpointed processes. *--inherit-fd* **fd[**__N__**]:**__resource__:: Inherit a file descriptor. This option lets *criu* use an already opened file descriptor 'N' for restoring a file identified by 'resource'. This option can be used to restore an external resource dumped with the help of *--external* *file*, *tty*, *pid* and *unix* options. + The 'resource' argument can be one of the following: + - **tty[**__rdev__**:**__dev__**]** - **pipe[**__inode__**]** - **socket[**__inode__*]* - **file[**__mnt_id__**:**__inode__**]** - 'path/to/file' + Note that square brackets used in this option arguments are literals and usually need to be escaped from shell. *-d*, *--restore-detached*:: Detach *criu* itself once restore is complete. *-s*, *--leave-stopped*:: Leave tasks in stopped state after restore (rather than resuming their execution). *-S*, *--restore-sibling*:: Restore root task as a sibling (makes sense only with *--restore-detached*). *--log-pid*:: Write separate logging files per each pid. *-r*, *--root* 'path':: Change the root filesystem to 'path' (when run in a mount namespace). This option is required to restore a mount namespace. The directory 'path' must be a mount point and its parent must not be overmounted. *--external* __type__**[**__id__**]:**__value__:: Restore an instance of an external resource. The generic syntax is 'type' of resource, followed by resource 'id' (enclosed in literal square brackets), and optional 'value' (prepended by a literal colon). The following resource types are currently supported: *mnt*, *dev*, *veth*, *macvlan*. Syntax depends on type. Note to restore external resources dealing with opened file descriptors (such as dumped with the help of *--external* *file*, *tty*, and *unix* options), option *--inherit-fd* should be used. 
*--external* **mnt[**__name__**]:**__mountpoint__:: Restore an external bind mount referenced in the image by 'name', bind-mounting it from the host 'mountpoint' to a proper mount point. *--external mnt[]*:: Restore all external bind mounts (dumped with the help of *--external mnt[]* auto-detection). *--external* **dev[**__name__**]:**__/dev/path__:: Restore an external mount device, identified in the image by 'name', using the existing block device '/dev/path'. *--external* **veth[**__inner_dev__**]:**__outer_dev__**@**__bridge__:: Set the outer VETH device name (corresponding to 'inner_dev' being restored) to 'outer_dev'. If optional **@**_bridge_ is specified, 'outer_dev' is added to that bridge. If the option is not used, 'outer_dev' will be autogenerated by the kernel. *--external* **macvlan[**__inner_dev__**]:**__outer_dev__:: When restoring an image that have a MacVLAN device in it, this option must be used to specify to which 'outer_dev' (an existing network device in CRIU namespace) the restored 'inner_dev' should be bound to. *-J*, *--join-ns* **NS**:{**PID**|**NS_FILE**}[,**EXTRA_OPTS**]:: Restore process tree inside an existing namespace. The namespace can be specified in 'PID' or 'NS_FILE' path format (example: *--join-ns net:12345* or *--join-ns net:/foo/bar*). Currently supported values for **NS** are: *ipc*, *net*, *time*, *user*, and *uts*. This option doesn't support joining a PID namespace, however, this is possible using *--external* and *--inheritfd*. 'EXTRA_OPTS' is optional and can be used to specify UID and GID for user namespace (e.g., *--join-ns user:PID,UID,GID*). *--manage-cgroups* ['mode']:: Restore cgroups configuration associated with a task from the image. Controllers are always restored in an optimistic way -- if already present in system, *criu* reuses it, otherwise it will be created. 
+ The 'mode' may be one of the following: *none*::: Do not restore cgroup properties but require cgroup to pre-exist at the moment of *restore* procedure. *props*::: Restore cgroup properties and require cgroup to pre-exist. *soft*::: Restore cgroup properties if only cgroup has been created by *criu*, otherwise do not restore properties. This is the default if mode is unspecified. *full*::: Always restore all cgroups and their properties. *strict*::: Restore all cgroups and their properties from the scratch, requiring them to not present in the system. *ignore*::: Don't deal with cgroups and pretend that they don't exist. *--cgroup-yard* 'path':: Instead of trying to mount cgroups in CRIU, provide a path to a directory with already created cgroup yard. For more information look in the *dump* section. *--cgroup-root* ['controller'*:*]/'newroot':: Change the root cgroup the controller will be installed into. No controller means that root is the default for all controllers not specified. *--tcp-established*:: Restore previously dumped established TCP connections. This implies that the network has been locked between *dump* and *restore* phases so other side of a connection simply notice a kind of lag. *--tcp-close*:: Restore connected TCP sockets in closed state. *--veth-pair* __IN__**=**__OUT__:: Correspondence between outside and inside names of veth devices. *-l*, *--file-locks*:: Restore file locks from the image. *--lsm-profile* __type__**:**__name__:: Specify an LSM profile to be used during restore. The _type_ can be either *apparmor* or *selinux*. *--lsm-mount-context* 'context':: Specify a new mount context to be used during restore. + This option will only replace existing mount context information with the one specified with this option. Mounts without the 'context=' option will not be changed. 
+ If a mountpoint has been checkpointed with an option like context="system_u:object_r:container_file_t:s0:c82,c137" + it is possible to change this option using --lsm-mount-context "system_u:object_r:container_file_t:s0:c204,c495" + which will result that the mountpoint will be restored with the new 'context='. + This option is useful if using *selinux* and if the *selinux* labels need to be changed on restore like if a container is restored into an existing Pod. *--auto-dedup*:: As soon as a page is restored it get punched out from image. *-j*, *--shell-job*:: Restore shell jobs, in other words inherit session and process group ID from the criu itself. *--cpu-cap* ['cap'[,'cap'...]]:: Specify CPU capabilities to be present on the CPU the process is restoring. To inverse a capability, prefix it with *^*. This option implies that *--cpu-cap* has been passed on *dump* as well, except *fpu* option case. The 'cap' argument can be the following (or a set of comma-separated values): *all*::: Require all capabilities. This is *default* mode if *--cpu-cap* is passed without arguments. Most safe mode. *cpu*::: Require the CPU to have all capabilities in image to match runtime CPU. *fpu*::: Require the CPU to have compatible FPU. For example the process might be dumped with xsave capability but attempted to restore without it present on target CPU. In such case we refuse to proceed. This is *default* mode if *--cpu-cap* is not present in command line. Note this argument might be passed even if on the *dump* no *--cpu-cap* have been specified because FPU frames are always encoded into images. *ins*::: Require CPU compatibility on instructions level. *none*::: Ignore capabilities. Most dangerous mode. The behaviour is implementation dependent. Try to not use it until really required. + For example, this option can be used in case *--cpu-cap=cpu* was used during *dump*, and images are migrated to a less capable CPU and are to be restored. 
By default, *criu* shows an error that CPU capabilities are not adequate, but this can be suppressed by using *--cpu-cap=none*. *--weak-sysctls*:: Silently skip restoring sysctls that are not available. This allows to restore on an older kernel, or a kernel configured without some options. *--lazy-pages*:: Restore the processes without filling out the entire memory contents. When this option is used, *restore* sets up the infrastructure required to fill memory pages either on demand when the process accesses them or in the background without stopping the restored process. This option requires running *lazy-pages* daemon. *--file-validation* ['mode']:: Set the method to be used to validate open files. Validation is done to ensure that the version of the file being restored is the same version when it was dumped. + The 'mode' may be one of the following: *filesize*::: To explicitly use only the file size check all the time. This is the fastest and least intensive check. *buildid*::: To validate ELF files with their build-ID. If the build-ID cannot be obtained, 'chksm-first' method will be used. This is the default if mode is unspecified. *--skip-file-rwx-check*:: Skip checking file permissions (r/w/x for u/g/o) on restore. *check* ~~~~~~~ Checks whether the kernel supports the features needed by *criu* to dump and restore a process tree. There are three categories of kernel support, as described below. *criu check* always checks Category 1 features unless *--feature* is specified which only checks a specified feature. *Category 1*::: Absolutely required. These are features like support for */proc/PID/map_files*, *NETLINK_SOCK_DIAG* socket monitoring, */proc/sys/kernel/ns_last_pid* etc. *Category 2*::: Required only for specific cases. These are features like AIO remap, */dev/net/tun* and others that are only required if a process being dumped or restored is using those. *Category 3*::: Experimental. 
These are features like *task-diag* that are used for experimental purposes (mostly during development). If there are no errors or warnings, *criu* prints "Looks good." and its exit code is 0. A missing Category 1 feature causes *criu* to print "Does not look good." and its exit code is non-zero. Missing Category 2 and 3 features cause *criu* to print "Looks good but ..." and its exit code is be non-zero. Without any options, *criu check* checks Category 1 features. This behavior can be changed by using the following options: *--extra*:: Check kernel support for Category 2 features. *--experimental*:: Check kernel support for Category 3 features. *--all*:: Check kernel support for Category 1, 2, and 3 features. *--feature* 'name':: Check a specific feature. If 'name' is *list*, a list of valid kernel feature names that can be checked will be printed. *page-server* ~~~~~~~~~~~~~ Launches *criu* in page server mode. *--daemon*:: Runs page server as a daemon (background process). *--status-fd*:: Write \0 to the FD and close it once page-server is ready to handle requests. The status-fd allows to not daemonize a process and get its exit code at the end. It isn't supposed to use --daemon and --status-fd together. *--address* 'address':: Page server IP address or hostname. *--port* 'number':: Page server port number. *--ps-socket* 'fd':: Use provided file descriptor as socket for incoming connection. In this case --address and --port are ignored. Useful for intercepting page-server traffic e.g. to add encryption or authentication. *--lazy-pages*:: Serve local memory dump to a remote *lazy-pages* daemon. In this mode the *page-server* reads local memory dump and allows the remote *lazy-pages* daemon to request memory pages in random order. *--tls-cacert* 'file':: Specifies the path to a trusted Certificate Authority (CA) certificate file to be used for verification of a client or server certificate. The 'file' must be in PEM format. 
When this option is used only the specified CA is used for verification. Otherwise, the system's trusted CAs and, if present, '/etc/pki/CA/cacert.pem' will be used. *--tls-cacrl* 'file':: Specifies a path to a Certificate Revocation List (CRL) 'file' which contains a list of revoked certificates that should no longer be trusted. The 'file' must be in PEM format. When this option is not specified, the file, if present, '/etc/pki/CA/cacrl.pem' will be used. *--tls-cert* 'file':: Specifies a path to a file that contains a X.509 certificate to present to the remote entity. The 'file' must be in PEM format. When this option is not specified, the default location ('/etc/pki/criu/cert.pem') will be used. *--tls-key* 'file':: Specifies a path to a file that contains TLS private key. The 'file' must be in PEM format. When this option is not the default location ('/etc/pki/criu/private/key.pem') will be used. *--tls*:: Use TLS to secure remote connections. *lazy-pages* ~~~~~~~~~~~~ Launches *criu* in lazy-pages daemon mode. The *lazy-pages* daemon is responsible for managing user-level demand paging for the restored processes. It gets information required to fill the process memory pages from the *restore* and from the checkpoint directory. When a restored process access certain memory page for the first time, the *lazy-pages* daemon injects its contents into the process address space. The memory pages that are not yet requested by the restored processes are injected in the background. *exec* ~~~~~~ Executes a system call inside a destination task\'s context. This functionality is deprecated; please use *Compel* instead. *service* ~~~~~~~~~ Launches *criu* in RPC daemon mode, where *criu* is listening for RPC commands over socket to perform. This is convenient for a case where daemon itself is running in a privileged (superuser) mode but clients are not. 
dedup ~~~~~ Starts pagemap data deduplication procedure, where *criu* scans over all pagemap files and tries to minimize the number of pagemap entries by obtaining the references from a parent pagemap image. cpuinfo dump ~~~~~~~~~~~~ Fetches current CPU features and write them into an image file. cpuinfo check ~~~~~~~~~~~~~ Fetches current CPU features (i.e. CPU the *criu* is running on) and test if they are compatible with the ones present in an image file. CONFIGURATION FILES ------------------- *Criu* supports usage of configuration files to avoid the need of writing every option on command line, which is useful especially with repeated usage of same options. A specific configuration file can be passed with the "*--config* 'file'" option. If no file is passed, the default configuration files '/etc/criu/default.conf' and '$HOME/.criu/default.conf' are parsed (if present on the system). If the environment variable CRIU_CONFIG_FILE is set, it will also be parsed. The options passed to CRIU via CLI, RPC or configuration file are evaluated in the following order: - apply_config(/etc/criu/default.conf) - apply_config($HOME/.criu/default.conf) - apply_config(CRIU_CONFIG_FILE) - apply_config(*--config* 'file') - apply_config(CLI) or apply_config(RPC) - apply_config(RPC configuration file) (only for RPC mode) Default configuration file parsing can be deactivated with "*--no-default-config*" if needed. Parsed configuration files are merged with command line options, which allows overriding boolean options. Configuration file syntax ~~~~~~~~~~~~~~~~~~~~~~~~~ Comments are supported using \'#' sign. The rest of the line is ignored. Options are the same as command line options without the \'--' prefix, use one option per line (with corresponding argument if applicable, divided by whitespaces). If needed, the argument can be provided in double quotes (this should be needed only if the argument contains whitespaces). 
In case this type of argument contains a literal double quote as well, it can be escaped using the \'\' sign. Usage of commands is disallowed and all other escape sequences are interpreted literally. Example of configuration file to illustrate syntax: --------------- $ cat ~/.criu/default.conf tcp-established work-dir "/home/USERNAME/criu/my \"work\" directory" #this is a comment no-restore-sibling # this is another comment --------------- Configuration files in RPC mode ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Not only does *criu* evaluate configuration files in CLI mode, it also evaluates configuration files in RPC mode. Just as in CLI mode the configuration file values are evaluated first. This means that any option set via RPC will overwrite the configuration file setting. The user can thus change *criu*'s default behavior but it is not possible to change settings which are explicitly set by the RPC client. The RPC client can, however, specify an additional configuration file which will be evaluated after the RPC options (see above for option evaluation order). The RPC client can specify this additional configuration file via "req.opts.config_file = '/path/to/file'". The values from this configuration file will overwrite all other configuration file settings or RPC options. *This can lead to undesired behavior of criu and should only be used carefully.* NON-ROOT -------- *criu* can be used as non-root with either the *CAP_SYS_ADMIN* capability or with the *CAP_CHECKPOINT_RESTORE* capability introduces in Linux kernel 5.9. *CAP_CHECKPOINT_RESTORE* is the minimum that is required. *criu* also needs either *CAP_SYS_PTRACE* or a value of 0 in */proc/sys/kernel/yama/ptrace_scope* (see *ptrace*(2)) to be able to interrupt the process for dumping. Running *criu* as non-root has many limitations and depending on the process to checkpoint and restore it may not be possible. 
In addition to *CAP_CHECKPOINT_RESTORE* it is possible to give *criu* additional capabilities to enable additional features in non-root mode. Currently *criu* can benefit from the following additional capabilities: - *CAP_NET_ADMIN* - *CAP_SYS_CHROOT* - *CAP_SETUID* - *CAP_SYS_RESOURCE* Note that for some operations, having a capability in a namespace other than the init namespace (i.e. the default/root namespace) is not sufficient. For example, in order to read symlinks in proc/[pid]/map_files CRIU requires CAP_CHECKPOINT_RESTORE in the init namespace; having CAP_CHECKPOINT_RESTORE while running in another user namespace (e.g. in a container) does not allow CRIU to read symlinks in /proc/[pid]/map_files. Without access to /proc/[pid]/map_files checkpointing/restoring processes that have mapped deleted files may not be possible. Independent of the capabilities it is always necessary to use "*--unprivileged*" to accept *criu*'s limitation in non-root mode. EXAMPLES -------- To checkpoint a program with pid of *1234* and write all image files into directory *checkpoint*: ---------- criu dump -D checkpoint -t 1234 ---------- To restore this program detaching criu itself: ---------- criu restore -d -D checkpoint ---------- AUTHOR ------ The CRIU team. COPYRIGHT --------- Copyright \(C) 2011-2016, Parallels Holdings, Inc. 
crac-criu-1.5.0/Documentation/custom.xsl000066400000000000000000000004241471504326700202510ustar00rootroot00000000000000 1 1 1 crac-criu-1.5.0/INSTALL.md000066400000000000000000000023231471504326700150260ustar00rootroot00000000000000## Installing CRIU from source code Once CRIU is built one can easily setup the complete CRIU package (which includes executable itself, CRIT tool, libraries, manual and etc) simply typing make install this command accepts the following variables: * **DESTDIR**, to specify global root where all components will be placed under (empty by default); * **PREFIX**, to specify additional prefix for path of every component installed (`/usr/local` by default); * **BINDIR**, to specify where to put CRIT tool (`$(PREFIX)/bin` by default); * **SBINDIR**, to specify where to put CRIU executable (`$(PREFIX)/sbin` by default); * **MANDIR**, to specify directory for manual pages (`$(PREFIX)/share/man` by default); * **LIBDIR**, to specify directory where to put libraries (guess the correct path by default). Thus one can type make DESTDIR=/some/new/place install and get everything installed under `/some/new/place`. ## Uninstalling CRIU To clean up previously installed CRIU instance one can type make uninstall and everything should be removed. Note though that if some variable (**DESTDIR**, **BINDIR** and such) has been used during installation procedure, the same *must* be passed with uninstall action. crac-criu-1.5.0/MAINTAINERS000066400000000000000000000005011471504326700150670ustar00rootroot00000000000000Pavel Emelyanov (chief) Andrey Vagin Mike Rapoport Dmitry Safonov <0x7f454c46@gmail.com> Adrian Reber Pavel Tikhomirov Radostin Stoyanov Alexander Mikhalitsyn crac-criu-1.5.0/MAINTAINERS_GUIDE.md000066400000000000000000000144071471504326700164150ustar00rootroot00000000000000## Introduction Dear maintainer. Thank you for investing the time and energy to help make CRIU as useful as possible. 
Maintaining a project is difficult, sometimes unrewarding work. Sure, you will contribute cool features to the project, but most of your time will be spent reviewing patches, cleaning things up, documenting, answering questions, justifying design decisions - while everyone else will just have fun! But remember -- the quality of the maintainers work is what distinguishes the good projects from the great. So please be proud of your work, even the unglamorous parts, and encourage a culture of appreciation and respect for *every* aspect of improving the project -- not just the hot new features. Being a maintainer is a time consuming commitment and should not be taken lightly. This document is a manual for maintainers old and new. It explains what is expected of maintainers, how they should work, and what tools are available to them. This is a living document - if you see something out of date or missing, speak up! ## What are a maintainer's responsibility? Part of a healthy project is to have active maintainers to support the community in contributions and perform tasks to keep the project running. It is every maintainer's responsibility to: * Keep the community a friendly place * Deliver prompt feedback and decisions on pull requests and mailing list threads * Encourage other members to help each other, especially in cases the maintainer is overloaded or feels the lack of needed expertise * Make sure the changes made respects the philosophy, design and roadmap of the project ## How are decisions made? CRIU is an open-source project with an open design philosophy. This means that the repository is the source of truth for EVERY aspect of the project. *If it's part of the project, it's in the repo. It's in the repo, it's part of the project.* All decisions affecting CRIU, big and small, follow the same 3 steps: * Submit a change. Anyone can do this * Discuss it. Anyone can and is encouraged to do this * Accept or decline it. 
Only maintainers do this *I'm a maintainer, should I make pull requests / send patches too?* Yes. Nobody should ever push to the repository directly. All changes should be made through submitting (and accepting) the change. ### Two-steps decision making ### Since CRIU is extremely complex piece of software we try double hard not to make mistakes, that would be hard to fix in the future. In order to facilitate this, the "final" decision is made in two stages: * We definitely want to try something out * We think that the attempt was successful Respectively, new features get accepted first into the *criu-dev* branch and after they have been validated they are merged into the *master* branch. Yet, urgent bug fixes may land directly in the master branch. If a change in the criu-dev branch is considered to be bad (whatever it means), then it can be reverted without propagation to the master branch. Reverting from the master branch is expected not to happen at all, but if such an extraordinary case occurs, the impact of this step, especially the question of backward compatibility, should be considered in the most careful manner. ## Who decides what? All decisions can be expressed as changes to the repository (either in the form of pull requests, or patches sent to the mailing list), and maintainers make decisions by merging or rejecting them. Review and approval or disagreement can be done by anyone and is denoted by adding a respective comment in the pull request. However, merging the change into either branch only happens after approvals from maintainers. In order for a patch to be merged into the criu-dev branch at least two maintainers should accept it. In order for a patch to be merged into the master branch the majority of maintainers should decide that (then prepare a pull request, submit it, etc.). Overall the maintainer system works because of mutual respect across the maintainers of the project. 
The maintainers trust one another to make decisions in the best interests of the project. Sometimes maintainers can disagree and this is part of a healthy project to represent the point of views of various people. In the case where maintainers cannot find agreement on a specific change the role of a Chief Maintainer comes into play. ### Chief maintainer The chief maintainer for the project is responsible for overall architecture of the project to maintain conceptual integrity. Large decisions and architecture changes should be reviewed by the chief maintainer. Also the chief maintainer has the veto power on any change submitted to any branch. Naturally, a change in the criu-dev branch can be reverted after a chief maintainer veto, a change in the master branch must be carefully reviewed by the chief maintainer and vetoed in advance. ### How are maintainers added (and removed)? The best maintainers have a vested interest in the project. Maintainers are first and foremost contributors that have shown they are committed to the long term success of the project. Contributors wanting to become maintainers are expected to be deeply involved in contributing code, patches review, and paying needed attention to the issues in the project. Just contributing does not make you a maintainer, it is about building trust with the current maintainers of the project and being a person that they can rely on and trust to make decisions in the best interest of the project. When a contributor wants to become a maintainer or nominate someone as a maintainer, one can submit a "nomination", which technically is the respective modification to the `MAINTAINERS` file. When a maintainer feels they is unable to perform the required duties, or someone else wants to draw the community attention to this fact, one can submit a "(self-)removing" change. 
The final vote to add or to remove a maintainer is to be approved by the majority of current maintainers (with the chief maintainer having veto power on that too). One might have noticed, that the chief maintainer (re-)assignment is not regulated by this document. That's true :) However, this can be done. If the community decides that the chief maintainer needs to be changed the respective "decision making rules" are to be prepared, submitted and accepted into this file first. Good luck! crac-criu-1.5.0/Makefile000066400000000000000000000333401471504326700150410ustar00rootroot00000000000000__nmk_dir=$(CURDIR)/scripts/nmk/scripts/ export __nmk_dir # # No need to try to remake our Makefiles Makefile: ; Makefile.%: ; scripts/%.mak: ; $(__nmk_dir)%.mk: ; # # Import the build engine include $(__nmk_dir)include.mk include $(__nmk_dir)macro.mk ifeq ($(origin HOSTCFLAGS), undefined) HOSTCFLAGS := $(CFLAGS) $(USERCFLAGS) endif # # Supported Architectures ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips loongarch64,$(ARCH)),) $(error "The architecture $(ARCH) isn't supported") endif # The PowerPC 64 bits architecture could be big or little endian. # They are handled in the same way. ifeq ($(SUBARCH),ppc64) error := $(error ppc64 big endian is not yet supported) endif # # Architecture specific options. ifeq ($(ARCH),arm) ARMV := $(shell echo $(SUBARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') ifeq ($(ARMV),6) ARCHCFLAGS += -march=armv6 endif ifeq ($(ARMV),7) ARCHCFLAGS += -march=armv7-a+fp endif ifeq ($(ARMV),8) # Running 'setarch linux32 uname -m' returns armv8l on travis aarch64. # This tells CRIU to handle armv8l just as armv7hf. Right now this is # only used for compile testing. No further verification of armv8l exists. ARCHCFLAGS += -march=armv7-a ARMV := 7 endif DEFINES := -DCONFIG_ARMV$(ARMV) -DCONFIG_VDSO_32 PROTOUFIX := y # For simplicity - compile code in Arm mode without interwork. 
# We could choose Thumb mode as default instead - but a dirty # experiment shows that with 90Kb PIEs Thumb code doesn't save # even one page. So, let's stick so far to Arm mode as it's more # universal around all different Arm variations, until someone # will find any use for Thumb mode. -dima CFLAGS_PIE := -marm endif ifeq ($(ARCH),aarch64) DEFINES := -DCONFIG_AARCH64 endif ifeq ($(ARCH),ppc64) LDARCH := powerpc:common64 DEFINES := -DCONFIG_PPC64 -D__SANE_USERSPACE_TYPES__ endif ifeq ($(ARCH),x86) LDARCH := i386:x86-64 DEFINES := -DCONFIG_X86_64 endif ifeq ($(ARCH),mips) DEFINES := -DCONFIG_MIPS endif ifeq ($(ARCH),loongarch64) DEFINES := -DCONFIG_LOONGARCH64 endif # # CFLAGS_PIE: # # Ensure with -fno-optimize-sibling-calls that we don't create GOT # (Global Offset Table) relocations with gcc compilers that don't have # commit "S/390: Fix 64 bit sibcall". ifeq ($(ARCH),s390) ARCH := s390 DEFINES := -DCONFIG_S390 CFLAGS_PIE := -fno-optimize-sibling-calls endif CFLAGS_PIE += -DCR_NOGLIBC export CFLAGS_PIE LDARCH ?= $(ARCH) export LDARCH export PROTOUFIX DEFINES # # Independent options for all tools. DEFINES += -D_FILE_OFFSET_BITS=64 DEFINES += -D_GNU_SOURCE WARNINGS := -Wall -Wformat-security -Wdeclaration-after-statement -Wstrict-prototypes # -Wdangling-pointer results in false warning when we add a list element to # local list head variable. It is false positive because before leaving the # function we always check that local list head variable is empty, thus # insuring that pointer to it is not dangling anywhere, but gcc can't # understand it. 
# Note: There is similar problem with kernel list, where this warning is also # disabled: https://github.com/torvalds/linux/commit/49beadbd47c2 WARNINGS += -Wno-dangling-pointer -Wno-unknown-warning-option CFLAGS-GCOV := --coverage -fno-exceptions -fno-inline -fprofile-update=atomic export CFLAGS-GCOV ifeq ($(ARCH),mips) WARNINGS := -rdynamic endif ifeq ($(ARCH),loongarch64) WARNINGS := -Wno-implicit-function-declaration endif ifneq ($(GCOV),) LDFLAGS += -lgcov CFLAGS += $(CFLAGS-GCOV) endif ifeq ($(ASAN),1) CFLAGS-ASAN := -fsanitize=address export CFLAGS-ASAN CFLAGS += $(CFLAGS-ASAN) endif ifneq ($(WERROR),0) WARNINGS += -Werror endif ifeq ($(DEBUG),1) DEFINES += -DCR_DEBUG CFLAGS += -O0 -ggdb3 else CFLAGS += -O2 -g endif ifeq ($(GMON),1) CFLAGS += -pg GMONLDOPT += -pg export GMON GMONLDOPT endif AFLAGS += -D__ASSEMBLY__ CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target all: criu lib crit .PHONY: all # # Version headers. 
include Makefile.versions VERSION_HEADER := criu/include/version.h GITID_FILE := .gitid GITID := $(shell if [ -d ".git" ]; then git describe --always; fi) # Git repository wasn't inited in CRIU folder ifeq ($(GITID),) GITID := 0 else GITID_FILE_VALUE := $(shell if [ -f '$(GITID_FILE)' ]; then if [ `cat '$(GITID_FILE)'` = $(GITID) ]; then echo y; fi; fi) ifneq ($(GITID_FILE_VALUE),y) .PHONY: $(GITID_FILE) endif endif $(GITID_FILE): $(call msg-gen, $@) $(Q) echo "$(GITID)" > $(GITID_FILE) $(VERSION_HEADER): Makefile.versions $(GITID_FILE) $(call msg-gen, $@) $(Q) echo "/* Autogenerated, do not edit */" > $@ $(Q) echo "#ifndef __CR_VERSION_H__" >> $@ $(Q) echo "#define __CR_VERSION_H__" >> $@ $(Q) echo "#define CRIU_VERSION \"$(CRIU_VERSION)\"" >> $@ $(Q) echo "#define CRIU_VERSION_MAJOR " $(CRIU_VERSION_MAJOR) >> $@ $(Q) echo "#define CRIU_VERSION_MINOR " $(CRIU_VERSION_MINOR) >> $@ ifneq ($(CRIU_VERSION_SUBLEVEL),) $(Q) echo "#define CRIU_VERSION_SUBLEVEL " $(CRIU_VERSION_SUBLEVEL) >> $@ endif ifneq ($(CRIU_VERSION_EXTRA),) $(Q) echo "#define CRIU_VERSION_EXTRA " $(CRIU_VERSION_EXTRA) >> $@ endif $(Q) echo "#define CRIU_GITID \"$(GITID)\"" >> $@ $(Q) echo "#endif /* __CR_VERSION_H__ */" >> $@ criu-deps += $(VERSION_HEADER) # # Setup proper link for asm headers in common code. include/common/asm: include/common/arch/$(ARCH)/asm $(call msg-gen, $@) $(Q) ln -s ./arch/$(ARCH)/asm $@ criu-deps += include/common/asm # # Configure variables. export CONFIG_HEADER := include/common/config.h ifeq ($(filter tags etags cscope clean lint indent fetch-clang-format help mrproper,$(MAKECMDGOALS)),) include Makefile.config else # To clean all files, enable make/build options here export CONFIG_COMPAT := y export CONFIG_GNUTLS := y export CONFIG_HAS_LIBBPF := y endif # # Protobuf images first, they are not depending # on anything else. 
$(eval $(call gen-built-in,images)) criu-deps += images/built-in.o # # Compel get used by CRIU, build it earlier include Makefile.compel # # Next the socket CR library # SOCCR_A := soccr/libsoccr.a soccr/Makefile: ; soccr/%: $(CONFIG_HEADER) .FORCE $(Q) $(MAKE) $(build)=soccr $@ soccr/built-in.o: $(CONFIG_HEADER) .FORCE $(Q) $(MAKE) $(build)=soccr all $(SOCCR_A): |soccr/built-in.o criu-deps += $(SOCCR_A) # # LZ4 library # LZ4_OBJS = lz4/lib/liblz4.a criu/liblz4io.a $(LZ4_OBJS) : git submodule init git submodule update $(Q) env -i PATH="$$PATH" make CC=$(CC) CFLAGS="$(CFLAGS)" -C lz4 lib lz4 $(Q) $(AR) rcs criu/liblz4io.a lz4/programs/lz4io.o criu-deps += $(LZ4_OBJS) CFLAGS += -I. # # CRIU building done in own directory # with slightly different rules so we # can't use nmk engine directly (we # build syscalls library and such). # # But note that we're already included # the nmk so we can reuse it there. criu/Makefile: ; criu/Makefile.packages: ; criu/Makefile.crtools: ; criu/%: $(criu-deps) .FORCE $(Q) $(MAKE) $(build)=criu $@ criu: $(criu-deps) $(Q) $(MAKE) $(build)=criu all .PHONY: criu unittest: $(criu-deps) $(Q) $(MAKE) $(build)=criu unittest .PHONY: unittest # # Libraries next once criu is ready # (we might generate headers and such # when building criu itself). 
lib/Makefile: ; lib/%: criu .FORCE $(Q) $(MAKE) $(build)=lib $@ lib: criu $(Q) $(MAKE) $(build)=lib all .PHONY: lib clean mrproper: $(Q) $(MAKE) $(build)=images $@ $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=soccr $@ $(Q) $(MAKE) $(build)=lib $@ $(Q) $(MAKE) $(build)=crit $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ $(Q) $(MAKE) -C lz4 clean rm -rf $(LZ4_OBJS) .PHONY: clean mrproper clean-amdgpu_plugin: $(Q) $(MAKE) -C plugins/amdgpu clean .PHONY: clean-amdgpu_plugin clean-top: $(Q) $(MAKE) -C Documentation clean $(Q) $(MAKE) $(build)=test/compel clean $(Q) $(RM) .gitid .PHONY: clean-top clean: clean-top clean-amdgpu_plugin mrproper-top: clean-top clean-amdgpu_plugin $(Q) $(RM) $(CONFIG_HEADER) $(Q) $(RM) $(VERSION_HEADER) $(Q) $(RM) $(COMPEL_VERSION_HEADER) $(Q) $(RM) include/common/asm $(Q) $(RM) compel/include/asm $(Q) $(RM) cscope.* $(Q) $(RM) tags TAGS .PHONY: mrproper-top mrproper: mrproper-top # # Non-CRIU stuff. # docs: $(Q) $(MAKE) -s -C Documentation all .PHONY: docs zdtm: all $(Q) $(MAKE) -C test/zdtm all .PHONY: zdtm test: zdtm $(Q) $(MAKE) -C test .PHONY: test amdgpu_plugin: criu $(Q) $(MAKE) -C plugins/amdgpu all .PHONY: amdgpu_plugin crit: lib $(Q) $(MAKE) -C crit .PHONY: crit # # Generating tar requires tag matched CRIU_VERSION. # If not found then simply use GIT's describe with # "v" prefix stripped. head-name := $(shell git tag -l v$(CRIU_VERSION)) ifeq ($(head-name),) head-name := $(shell git describe 2>/dev/null) endif # If no git tag could describe current commit, # use pre-defined CRIU_VERSION with GITID (if any). ifeq ($(head-name),) ifneq ($(GITID),) head-name := $(CRIU_VERSION)-$(GITID) else head-name := $(CRIU_VERSION) endif endif tar-name := $(shell echo $(head-name) | sed -e 's/^v//g') criu-$(tar-name).tar.bz2: git archive --format tar --prefix 'criu-$(tar-name)/' $(head-name) | bzip2 > $@ dist tar: criu-$(tar-name).tar.bz2 ; .PHONY: dist tar TAGS_FILES_REGEXP := . -name '*.[hcS]' ! -path './.*' \( ! 
-path './test/*' -o -path './test/zdtm/lib/*' \) tags: $(call msg-gen, $@) $(Q) $(RM) tags $(Q) $(FIND) $(TAGS_FILES_REGEXP) -print | xargs $(CTAGS) -a .PHONY: tags etags: $(call msg-gen, $@) $(Q) $(RM) TAGS $(Q) $(FIND) $(TAGS_FILES_REGEXP) -print | xargs $(ETAGS) -a .PHONY: etags cscope: $(call msg-gen, $@) $(Q) $(FIND) $(TAGS_FILES_REGEXP) ! -type l -print > cscope.files $(Q) $(CSCOPE) -bkqu .PHONY: cscope gcov: $(E) " GCOV" $(Q) test -d gcov || mkdir gcov && \ geninfo --output-filename gcov/criu.info --no-recursion criu/ && \ cd gcov && \ genhtml --rc lcov_branch_coverage=1 --output-directory html criu.info @echo "Code coverage report is in `pwd`/gcov/html/ directory." .PHONY: gcov docker-build: $(MAKE) -C scripts/build/ x86_64 .PHONY: docker-build docker-test: docker run --rm --privileged -v /lib/modules:/lib/modules --network=host --cgroupns=host criu-x86_64 \ ./test/zdtm.py run -a --keep-going --ignore-taint .PHONY: docker-test help: @echo ' Targets:' @echo ' all - Build all [*] targets' @echo ' * criu - Build criu' @echo ' * crit - Build crit' @echo ' zdtm - Build zdtm test-suite' @echo ' docs - Build documentation' @echo ' install - Install CRIU (see INSTALL.md)' @echo ' uninstall - Uninstall CRIU' @echo ' dist - Create a source tarball' @echo ' clean - Clean most, but leave enough to navigate' @echo ' mrproper - Delete all compiled/generated files' @echo ' tags - Generate tags file (ctags)' @echo ' etags - Generate TAGS file (etags)' @echo ' cscope - Generate cscope database' @echo ' test - Run zdtm test-suite' @echo ' gcov - Make code coverage report' @echo ' unittest - Run unit tests' @echo ' lint - Run code linters' @echo ' indent - Indent C code' @echo ' amdgpu_plugin - Make AMD GPU plugin' .PHONY: help lint: flake8 --version flake8 --config=scripts/flake8.cfg test/zdtm.py flake8 --config=scripts/flake8.cfg test/inhfd/*.py flake8 --config=scripts/flake8.cfg test/others/rpc/config_file.py flake8 --config=scripts/flake8.cfg lib/pycriu/images/pb2dict.py 
flake8 --config=scripts/flake8.cfg lib/pycriu/images/images.py flake8 --config=scripts/flake8.cfg scripts/criu-ns flake8 --config=scripts/flake8.cfg test/others/criu-ns/run.py flake8 --config=scripts/flake8.cfg crit/*.py flake8 --config=scripts/flake8.cfg crit/crit/*.py flake8 --config=scripts/flake8.cfg scripts/uninstall_module.py flake8 --config=scripts/flake8.cfg coredump/ coredump/coredump flake8 --config=scripts/flake8.cfg scripts/github-indent-warnings.py shellcheck --version shellcheck scripts/*.sh shellcheck scripts/ci/*.sh scripts/ci/apt-install shellcheck -x test/others/crit/*.sh shellcheck -x test/others/libcriu/*.sh shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh shellcheck -x test/others/config-file/*.sh shellcheck -x test/others/action-script/*.sh codespell -S tags # Do not append \n to pr_perror, pr_pwarn or fail ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>.*\\n"' # Do not use %m with pr_* or fail ! git --no-pager grep -E '^\s*\<(pr_(err|perror|warn|pwarn|debug|info|msg)|fail)\>.*%m' # Do not use errno with pr_perror, pr_pwarn or fail ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>\(".*".*errno' # End pr_(err|warn|msg|info|debug) with \n ! git --no-pager grep -En '^\s*\.*);$$' | grep -v '\\n' # No EOL whitespace for C files ! git --no-pager grep -E '\s+$$' \*.c \*.h .PHONY: lint codecov: SHELL := $(shell which bash) codecov: curl -Os https://uploader.codecov.io/latest/linux/codecov chmod +x codecov ./codecov .PHONY: codecov fetch-clang-format: .FORCE $(E) ".clang-format" $(Q) scripts/fetch-clang-format.sh BASE ?= "HEAD~1" OPTS ?= "--quiet" indent: git clang-format --style file --extensions c,h $(OPTS) $(BASE) .PHONY: indent include Makefile.install .DEFAULT_GOAL := all # Disable implicit rules in _this_ Makefile. .SUFFIXES: # # Optional local include. 
-include Makefile.local crac-criu-1.5.0/Makefile.compel000066400000000000000000000054541471504326700163240ustar00rootroot00000000000000COMPEL_BIN := ./compel/compel-host export COMPEL_BIN COMPEL_VERSION_HEADER := compel/include/version.h $(COMPEL_VERSION_HEADER): Makefile.versions $(call msg-gen, $(COMPEL_VERSION_HEADER)) $(Q) echo "/* Autogenerated, do not edit */" > $(COMPEL_VERSION_HEADER) $(Q) echo "#ifndef COMPEL_SO_VERSION_H__" >> $(COMPEL_VERSION_HEADER) $(Q) echo "#define COMPEL_SO_VERSION_H__" >> $(COMPEL_VERSION_HEADER) $(Q) echo "#define COMPEL_SO_VERSION \"$(COMPEL_SO_VERSION)\"" >> $(COMPEL_VERSION_HEADER) $(Q) echo "#define COMPEL_SO_VERSION_MAJOR " $(COMPEL_SO_VERSION_MAJOR) >> $(COMPEL_VERSION_HEADER) $(Q) echo "#define COMPEL_SO_VERSION_MINOR " $(COMPEL_SO_VERSION_MINOR) >> $(COMPEL_VERSION_HEADER) $(Q) echo "#define COMPEL_SO_VERSION_SUBLEVEL " $(COMPEL_SO_VERSION_SUBLEVEL) >> $(COMPEL_VERSION_HEADER) $(Q) echo "#endif /* COMPEL_SO_VERSION_H__ */" >> $(COMPEL_VERSION_HEADER) compel/include/asm: $(call msg-gen, $@) $(Q) ln -s ../arch/$(ARCH)/src/lib/include $@ compel-deps += compel/include/asm compel-deps += $(COMPEL_VERSION_HEADER) compel-deps += $(CONFIG_HEADER) compel-deps += include/common/asm compel-plugins += compel/plugins/std.lib.a compel/plugins/fds.lib.a LIBCOMPEL_SO := libcrac-compel.so LIBCOMPEL_A := libcrac-compel.a export LIBCOMPEL_SO LIBCOMPEL_A # # Compel itself. compel/Makefile: ; compel/%: $(compel-deps) $(compel-plugins) .FORCE $(Q) $(MAKE) $(build)=compel $@ criu-deps += compel/compel-host-bin # # Make sure the host program is ready after the # library and plugins are built. 
compel/compel-host-bin: | compel/$(LIBCOMPEL_A) $(compel-plugins) $(COMPEL_BIN): compel/compel-host-bin # # Plugins compel/plugins/Makefile: ; compel/plugins/%: $(compel-deps) .FORCE $(Q) $(MAKE) $(build)=compel/plugins $@ # # GNU make 4.x supports targets matching via wide # match targeting, where GNU make 3.x series (used on # Travis) is not, so we have to write them here explicitly. compel/plugins/std.lib.a: $(compel-deps) .FORCE $(Q) $(MAKE) $(build)=compel/plugins $@ compel/plugins/shmem.lib.a: $(compel-deps) compel/plugins/std.lib.a .FORCE $(Q) $(MAKE) $(build)=compel/plugins $@ compel/plugins/fds.lib.a: $(compel-deps) compel/plugins/std.lib.a .FORCE $(Q) $(MAKE) $(build)=compel/plugins $@ compel/compel: compel/built-in.o compel/$(LIBCOMPEL_A) | $(compel-deps) $(call msg-link, $@) $(Q) $(CC) $(CFLAGS) $^ $(WRAPFLAGS) $(LDFLAGS) -rdynamic -o $@ # # And compel library. LIBCOMPEL_SO_CFLAGS += $(CFLAGS) -rdynamic -Wl,-soname,$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR) compel/$(LIBCOMPEL_SO): compel/$(LIBCOMPEL_A) $(call msg-link, $@) $(Q) $(CC) -shared $(LIBCOMPEL_SO_CFLAGS) -o $@ -Wl,--whole-archive $^ -Wl,--no-whole-archive $(LDFLAGS) compel-install-targets += compel/$(LIBCOMPEL_SO) compel-install-targets += compel/compel compel-install-targets += $(compel-plugins) crac-criu-1.5.0/Makefile.config000066400000000000000000000075721471504326700163150ustar00rootroot00000000000000include $(__nmk_dir)utils.mk include $(__nmk_dir)msg.mk include scripts/feature-tests.mak # Make the binary distributable without depending on build system shared libraries availability. #ifeq ($(call try-cc,$(FEATURE_TEST_LIBBSD_DEV),-lbsd),true) # LIBS_FEATURES += -lbsd # FEATURE_DEFINES += -DCONFIG_HAS_LIBBSD #else # $(info Note: Building without setproctitle() and strlcpy() support.) # $(info $(info) To enable these features, please install libbsd-devel (RPM) / libbsd-dev (DEB).) 
#endif ifeq ($(call pkg-config-check,libselinux),y) LIBS_FEATURES += -lselinux FEATURE_DEFINES += -DCONFIG_HAS_SELINUX endif # Make the binary distributable without depending on build system shared libraries availability. #ifeq ($(call pkg-config-check,libbpf),y) # LIBS_FEATURES += -lbpf # FEATURE_DEFINES += -DCONFIG_HAS_LIBBPF # export CONFIG_HAS_LIBBPF := y #endif ifeq ($(call pkg-config-check,libdrm),y) export CONFIG_AMDGPU := y $(info Note: Building criu with amdgpu_plugin.) else $(info Note: Building criu without amdgpu_plugin.) $(info Note: libdrm and libdrm_amdgpu are required to build amdgpu_plugin.) endif ifeq ($(NO_GNUTLS)x$(call pkg-config-check,gnutls),xy) LIBS_FEATURES += -lgnutls export CONFIG_GNUTLS := y FEATURE_DEFINES += -DCONFIG_GNUTLS else $(info Note: Building without GnuTLS support) endif # Make the binary distributable without depending on build system shared libraries availability. #ifeq ($(call pkg-config-check,libnftables),y) # LIB_NFTABLES := $(shell $(PKG_CONFIG) --libs libnftables) # ifeq ($(call try-cc,$(FEATURE_TEST_NFTABLES_LIB_API_0),$(LIB_NFTABLES)),true) # LIBS_FEATURES += $(LIB_NFTABLES) # FEATURE_DEFINES += -DCONFIG_HAS_NFTABLES_LIB_API_0 # else ifeq ($(call try-cc,$(FEATURE_TEST_NFTABLES_LIB_API_1),$(LIB_NFTABLES)),true) # LIBS_FEATURES += $(LIB_NFTABLES) # FEATURE_DEFINES += -DCONFIG_HAS_NFTABLES_LIB_API_1 # else # $(warning Warn: you have libnftables installed but it has incompatible API) # $(warning Warn: Building without nftables support) # endif #else # $(warning Warn: you have no libnftables installed) # $(warning Warn: Building without nftables support) #endif export LIBS += $(LIBS_FEATURES) CONFIG_FILE = .config $(CONFIG_FILE): touch $(CONFIG_FILE) ifeq ($(ARCH),x86) # CONFIG_COMPAT is only for x86 now, no need for compile-test other archs ifeq ($(call try-asm,$(FEATURE_TEST_X86_COMPAT)),true) export CONFIG_COMPAT := y FEATURE_DEFINES += -DCONFIG_COMPAT else $(info Note: Building without ia32 C/R, missed ia32 support in 
gcc) $(info $(info) That may be related to missing gcc-multilib in your) $(info $(info) distribution or you may have Debian with buggy toolchain) $(info $(info) (issue https://github.com/checkpoint-restore/criu/issues/315)) endif endif export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ SETPROCTITLE_INIT TCP_REPAIR_WINDOW MEMFD_CREATE \ OPENAT2 NO_LIBC_RSEQ_DEFS # $1 - config name define gen-feature-test ifeq ($$(call try-cc,$$(FEATURE_TEST_$(1)),$$(LIBS_FEATURES),$$(DEFINES)),true) $(Q) echo '#define CONFIG_HAS_$(1)' >> $$@ else $(Q) echo '// CONFIG_HAS_$(1) is not set' >> $$@ endif endef define config-header-rule $(CONFIG_HEADER): scripts/feature-tests.mak $(CONFIG_FILE) $(call msg-gen, $$@) $(Q) echo '#ifndef __CR_CONFIG_H__' > $$@ $(Q) echo '#define __CR_CONFIG_H__' >> $$@ $(Q) echo '' >> $$@ $(call map,gen-feature-test,$(FEATURES_LIST)) $(Q) cat $(CONFIG_FILE) | sed -n -e '/^[^#]/s/^/#define CONFIG_/p' >> $$@ $(Q) echo '#endif /* __CR_CONFIG_H__ */' >> $$@ endef $(eval $(config-header-rule)) crac-criu-1.5.0/Makefile.install000066400000000000000000000032631471504326700165070ustar00rootroot00000000000000# # Installation paths. PREFIX ?= /usr/local BINDIR ?= $(PREFIX)/bin SBINDIR ?= $(PREFIX)/sbin MANDIR ?= $(PREFIX)/share/man INCLUDEDIR ?= $(PREFIX)/include LIBEXECDIR ?= $(PREFIX)/libexec RUNDIR ?= /run PLUGINDIR ?= $(PREFIX)/lib/criu # # For recent Debian/Ubuntu with multiarch support. DEB_HOST_MULTIARCH := $(shell dpkg-architecture -qDEB_HOST_MULTIARCH 2>/dev/null) ifneq "$(DEB_HOST_MULTIARCH)" "" LIBDIR ?= $(PREFIX)/lib/$(DEB_HOST_MULTIARCH) else # # For most other systems ifeq "$(shell uname -m)" "x86_64" LIBDIR ?= $(PREFIX)/lib64 endif endif # # LIBDIR falls back to the standard path. 
LIBDIR ?= $(PREFIX)/lib export PREFIX BINDIR SBINDIR MANDIR RUNDIR export LIBDIR INCLUDEDIR LIBEXECDIR PLUGINDIR install-man: $(Q) $(MAKE) -C Documentation install .PHONY: install-man install-lib: lib $(Q) $(MAKE) $(build)=lib install .PHONY: install-lib install-crit: lib $(Q) $(MAKE) $(build)=crit install .PHONY: install-crit install-criu: criu $(Q) $(MAKE) $(build)=criu install .PHONY: install-criu install-amdgpu_plugin: amdgpu_plugin $(Q) $(MAKE) -C plugins/amdgpu install .PHONY: install-amdgpu_plugin install-compel: $(compel-install-targets) $(Q) $(MAKE) $(build)=compel install $(Q) $(MAKE) $(build)=compel/plugins install .PHONY: install-compel install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin ; .PHONY: install uninstall: $(Q) $(MAKE) -C Documentation $@ $(Q) $(MAKE) $(build)=lib $@ $(Q) $(MAKE) $(build)=crit $@ $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ $(Q) $(MAKE) -C plugins/amdgpu $@ .PHONY: uninstall crac-criu-1.5.0/Makefile.versions000066400000000000000000000016251471504326700167110ustar00rootroot00000000000000# # CRIU version. CRIU_VERSION_MAJOR := 3 CRIU_VERSION_MINOR := 19 CRIU_VERSION_SUBLEVEL := CRIU_VERSION_EXTRA := CRIU_VERSION_NAME := Bronze Peacock CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA))-crac export CRIU_VERSION_MAJOR CRIU_VERSION_MINOR CRIU_VERSION_SUBLEVEL export CRIU_VERSION_EXTRA CRIU_VERSION_NAME CRIU_VERSION # # C library for CRIU. CRIU_SO_VERSION_MAJOR := 2 CRIU_SO_VERSION_MINOR := 0 export CRIU_SO_VERSION_MAJOR CRIU_SO_VERSION_MINOR # # SOCCR library. 
SOCCR_SO_VERSION_MAJOR := 1 SOCCR_SO_VERSION_MINOR := 0 export SOCCR_SO_VERSION_MAJOR SOCCR_SO_VERSION_MINOR COMPEL_SO_VERSION_MAJOR := 1 COMPEL_SO_VERSION_MINOR := 0 COMPEL_SO_VERSION_SUBLEVEL := 0 export COMPEL_SO_VERSION_MAJOR COMPEL_SO_VERSION_MINOR COMPEL_SO_VERSION_SUBLEVEL crac-criu-1.5.0/README.md000066400000000000000000000106201471504326700146540ustar00rootroot00000000000000[![X86_64 GCC Test](https://github.com/checkpoint-restore/criu/workflows/X86_64%20GCC%20Test/badge.svg)]( https://github.com/checkpoint-restore/criu/actions/workflows/x86-64-gcc-test.yml) [![Docker Test](https://github.com/checkpoint-restore/criu/actions/workflows/docker-test.yml/badge.svg)]( https://github.com/checkpoint-restore/criu/actions/workflows/docker-test.yml) [![Podman Test](https://github.com/checkpoint-restore/criu/actions/workflows/podman-test.yml/badge.svg)]( https://github.com/checkpoint-restore/criu/actions/workflows/podman-test.yml) [![CircleCI](https://circleci.com/gh/checkpoint-restore/criu.svg?style=svg)]( https://circleci.com/gh/checkpoint-restore/criu)

## CRIU -- A project to implement checkpoint/restore functionality for Linux CRIU (stands for Checkpoint and Restore in Userspace) is a utility to checkpoint/restore Linux tasks. Using this tool, you can freeze a running application (or part of it) and checkpoint it to a hard drive as a collection of files. You can then use the files to restore and run the application from the point it was frozen at. The distinctive feature of the CRIU project is that it is mainly implemented in user space. There are some more projects doing C/R for Linux, and so far CRIU [appears to be](https://criu.org/Comparison_to_other_CR_projects) the most feature-rich and up-to-date with the kernel. CRIU project is (almost) the never-ending story, because we have to always keep up with the Linux kernel supporting checkpoint and restore for all the features it provides. Thus we're looking for contributors of all kinds -- feedback, bug reports, testing, coding, writing, etc. Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) if you would like to get involved. The project [started](https://criu.org/History) as the way to do live migration for OpenVZ Linux containers, but later grew to more sophisticated and flexible tool. It is currently used by (integrated into) OpenVZ, LXC/LXD, Docker, and other software, project gets tremendous help from the community, and its packages are included into many Linux distributions. The project home is at http://criu.org. This wiki contains all the knowledge base for CRIU we have. Pages worth starting with are: - [Installation instructions](http://criu.org/Installation) - [A simple example of usage](http://criu.org/Simple_loop) - [Examples of more advanced usage](https://criu.org/Category:HOWTO) - Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/FAQ) ### Checkpoint and restore of simple loop process

## Advanced features As main usage for CRIU is live migration, there's a library for it called P.Haul. Also the project exposes two cool core features as standalone libraries. These are libcompel for parasite code injection and libsoccr for TCP connections checkpoint-restore. ### Live migration True [live migration](https://criu.org/Live_migration) using CRIU is possible, but doing all the steps by hands might be complicated. The [phaul sub-project](https://criu.org/P.Haul) provides a Go library that encapsulates most of the complexity. This library and the Go bindings for CRIU are stored in the [go-criu](https://github.com/checkpoint-restore/go-criu) repository. ### Parasite code injection In order to get state of the running process CRIU needs to make this process execute some code, that would fetch the required information. To make this happen without killing the application itself, CRIU uses the [parasite code injection](https://criu.org/Parasite_code) technique, which is also available as a standalone library called [libcompel](https://criu.org/Compel). ### TCP sockets checkpoint-restore One of the CRIU features is the ability to save and restore state of a TCP socket without breaking the connection. This functionality is considered to be useful by itself, and we have it available as the [libsoccr library](https://criu.org/Libsoccr). ## Licence The project is licensed under GPLv2 (though files sitting in the lib/ directory are LGPLv2.1). All files in the images/ directory are licensed under the Expat license (so-called MIT). See the images/LICENSE file. 
crac-criu-1.5.0/compel/000077500000000000000000000000001471504326700146555ustar00rootroot00000000000000crac-criu-1.5.0/compel/.gitignore000066400000000000000000000010401471504326700166400ustar00rootroot00000000000000arch/x86/plugins/std/sys-exec-tbl-64.c arch/x86/plugins/std/syscalls-64.S arch/arm/plugins/std/syscalls/syscalls.S arch/aarch64/plugins/std/syscalls/syscalls.S arch/s390/plugins/std/syscalls/syscalls.S arch/ppc64/plugins/std/syscalls/syscalls.S include/version.h plugins/include/uapi/std/asm/syscall-types.h plugins/include/uapi/std/syscall-64.h plugins/include/uapi/std/syscall-codes-64.h plugins/include/uapi/std/syscall-codes.h plugins/include/uapi/std/syscall.h plugins/include/uapi/std/syscall-aux.h plugins/include/uapi/std/syscall-aux.S crac-criu-1.5.0/compel/Makefile000066400000000000000000000067611471504326700163270ustar00rootroot00000000000000include Makefile.versions COMPEL_SO_VERSION := $(COMPEL_SO_VERSION_MAJOR)$(if $(COMPEL_SO_VERSION_MINOR),.$(COMPEL_SO_VERSION_MINOR))$(if $(COMPEL_SO_VERSION_SUBLEVEL),.$(COMPEL_SO_VERSION_SUBLEVEL)) COMPEL_SO_VERSION_CODE := $(shell expr $(COMPEL_SO_VERSION_MAJOR) \* 65536 \+ $(COMPEL_SO_VERSION_MINOR) \* 256 \+ $(COMPEL_SO_VERSION_SUBLEVEL)) ccflags-y += -DINCLUDEDIR=\"$(INCLUDEDIR)\" ccflags-y += -DLIBEXECDIR=\"$(LIBEXECDIR)\" ccflags-y += -DLIBDIR=\"$(LIBDIR)\" ccflags-y += -DSTATIC_LIB=\"$(LIBCOMPEL_A)\" ccflags-y += -DDYN_LIB=\"$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR)\" ccflags-y += -iquote compel/arch/$(ARCH)/src/lib/include ccflags-y += -iquote compel/include ccflags-y += -fno-strict-aliasing ccflags-y += -fPIC ldflags-y += -r # # UAPI inclusion, referred as ccflags-y += -I compel/include/uapi lib-name := $(LIBCOMPEL_A) lib-y += src/lib/log.o host-lib-y += src/lib/log.o lib-y += arch/$(ARCH)/src/lib/cpu.o lib-y += arch/$(ARCH)/src/lib/infect.o lib-y += src/lib/infect-rpc.o lib-y += src/lib/infect-util.o lib-y += src/lib/infect.o lib-y += src/lib/ptrace.o ifeq ($(ARCH),x86) lib-y += 
arch/$(ARCH)/src/lib/thread_area.o endif # handle_elf() has no support of ELF relocations on ARM (yet?) ifneq ($(filter arm aarch64 loongarch64,$(ARCH)),) CFLAGS += -DNO_RELOCS HOSTCFLAGS += -DNO_RELOCS endif obj-y += src/main.o obj-y += arch/$(ARCH)/src/lib/handle-elf.o obj-y += src/lib/handle-elf.o host-ccflags-y += $(ccflags-y) hostprogs-y += compel-host-bin compel-host-bin-objs := $(patsubst %.o,%-host.o,$(obj-y) $(host-lib-y)) cleanup-y += compel/compel cleanup-y += compel/compel-host-bin cleanup-y += compel/libcompel.so install: compel/compel compel/$(LIBCOMPEL_SO) compel/$(LIBCOMPEL_A) $(E) " INSTALL " compel $(Q) mkdir -p $(DESTDIR)$(BINDIR) $(Q) install -m 755 compel/compel $(DESTDIR)$(BINDIR)/crac-compel $(E) " INSTALL " $(LIBCOMPEL_SO) $(Q) mkdir -p $(DESTDIR)$(LIBDIR) $(Q) install -m 0644 compel/$(LIBCOMPEL_SO) $(DESTDIR)$(LIBDIR) $(Q) install -m 755 compel/$(LIBCOMPEL_SO) $(DESTDIR)$(LIBDIR)/$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR).$(COMPEL_SO_VERSION_MINOR) $(Q) ln -fns $(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR).$(COMPEL_SO_VERSION_MINOR) $(DESTDIR)$(LIBDIR)/$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR) $(Q) ln -fns $(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR).$(COMPEL_SO_VERSION_MINOR) $(DESTDIR)$(LIBDIR)/$(LIBCOMPEL_SO) $(E) " INSTALL " $(LIBCOMPEL_A) $(Q) install -m 0644 compel/$(LIBCOMPEL_A) $(DESTDIR)$(LIBDIR) $(E) " INSTALL " compel uapi $(Q) mkdir -p $(DESTDIR)$(INCLUDEDIR)/crac-criu/compel/asm $(Q) cp compel/include/uapi/*.h $(DESTDIR)$(INCLUDEDIR)/crac-criu/compel/ $(Q) cp compel/include/uapi/asm/*.h $(DESTDIR)$(INCLUDEDIR)/crac-criu/compel/asm/ $(Q) mkdir -p $(DESTDIR)$(INCLUDEDIR)/crac-criu/compel/common/asm $(Q) cp include/common/compiler.h $(DESTDIR)$(INCLUDEDIR)/crac-criu/compel/common/ .PHONY: install uninstall: $(E) " UNINSTALL" compel $(Q) $(RM) $(addprefix $(DESTDIR)$(BINDIR)/,crac-compel) $(E) " UNINSTALL" $(LIBCOMPEL_SO) $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(LIBCOMPEL_SO)) $(Q) $(RM) $(addprefix 
$(DESTDIR)$(LIBDIR)/,$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR)) $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR).$(COMPEL_SO_VERSION_MINOR)) $(E) " UNINSTALL" $(LIBCOMPEL_A) $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(LIBCOMPEL_A)) $(E) " UNINSTALL" compel uapi $(Q) $(RM) -rf $(addprefix $(DESTDIR)$(INCLUDEDIR)/,crac-criu/compel/*) .PHONY: uninstall crac-criu-1.5.0/compel/arch/000077500000000000000000000000001471504326700155725ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/000077500000000000000000000000001471504326700170225ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/plugins/000077500000000000000000000000001471504326700205035ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/plugins/include/000077500000000000000000000000001471504326700221265ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/plugins/include/asm/000077500000000000000000000000001471504326700227065ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/plugins/include/asm/prologue.h000077700000000000000000000000001471504326700354302../../../../../arch/x86/plugins/include/asm/prologue.hustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/plugins/include/asm/syscall-types.h000066400000000000000000000011701471504326700256720ustar00rootroot00000000000000#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ #define COMPEL_ARCH_SYSCALL_TYPES_H__ #define SA_RESTORER 0x04000000 typedef void rt_signalfn_t(int, siginfo_t *, void *); typedef rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; #define _KNSIG 64 #define _NSIG_BPW 64 #define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) typedef struct { unsigned long sig[_KNSIG_WORDS]; } k_rtsigset_t; typedef struct { rt_sighandler_t rt_sa_handler; unsigned long rt_sa_flags; rt_sigrestore_t rt_sa_restorer; k_rtsigset_t rt_sa_mask; } rt_sigaction_t; #endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ 
crac-criu-1.5.0/compel/arch/aarch64/plugins/include/features.h000066400000000000000000000001511471504326700241120ustar00rootroot00000000000000#ifndef __COMPEL_ARCH_FEATURES_H #define __COMPEL_ARCH_FEATURES_H #endif /* __COMPEL_ARCH_FEATURES_H */ crac-criu-1.5.0/compel/arch/aarch64/plugins/std/000077500000000000000000000000001471504326700212755ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/plugins/std/parasite-head.S000066400000000000000000000003431471504326700241300ustar00rootroot00000000000000#include "common/asm/linkage.h" .section .head.text, "ax" ENTRY(__export_parasite_head_start) bl parasite_service brk #0 // the instruction BRK #0 generates the signal SIGTRAP in Linux END(__export_parasite_head_start) crac-criu-1.5.0/compel/arch/aarch64/plugins/std/syscalls/000077500000000000000000000000001471504326700231325ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/plugins/std/syscalls/Makefile.syscalls000077700000000000000000000000001471504326700376602../../../../arm/plugins/std/syscalls/Makefile.syscallsustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/plugins/std/syscalls/gen-sys-exec-tbl.pl000077700000000000000000000000001471504326700401362../../../../arm/plugins/std/syscalls/gen-sys-exec-tbl.plustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/plugins/std/syscalls/gen-syscalls.pl000077700000000000000000000000001471504326700367722../../../../arm/plugins/std/syscalls/gen-syscalls.plustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/plugins/std/syscalls/syscall-aux.S000066400000000000000000000010311471504326700255160ustar00rootroot00000000000000/** * This source contains emulation of syscalls * that are not implemented in the AArch64 Linux kernel */ ENTRY(sys_open) mov x3, x2 mov x2, x1 mov x1, x0 mov x0, #-100 b sys_openat END(sys_open) ENTRY(sys_mkdir) mov x3, x2 mov x2, x1 mov x1, x0 mov x0, #-100 b sys_mkdirat END(sys_mkdir) ENTRY(sys_rmdir) mov x2, #0x200 // flags = AT_REMOVEDIR mov 
x1, x0 mov x0, #-100 b sys_unlinkat END(sys_rmdir) ENTRY(sys_unlink) mov x2, #0 // flags = 0 mov x1, x0 mov x0, #-100 b sys_unlinkat END(sys_unlink) crac-criu-1.5.0/compel/arch/aarch64/plugins/std/syscalls/syscall-aux.h000066400000000000000000000000621471504326700255460ustar00rootroot00000000000000#ifndef __NR_openat #define __NR_openat 56 #endif crac-criu-1.5.0/compel/arch/aarch64/plugins/std/syscalls/syscall-common.S000066400000000000000000000003521471504326700262160ustar00rootroot00000000000000#include "common/asm/linkage.h" syscall_common: svc #0 ret .macro syscall name, nr ENTRY(\name) mov x8, \nr b syscall_common END(\name) .endm ENTRY(__cr_restore_rt) mov x8, __NR_rt_sigreturn svc #0 END(__cr_restore_rt) crac-criu-1.5.0/compel/arch/aarch64/plugins/std/syscalls/syscall.def000077700000000000000000000000001471504326700353542../../../../arm/plugins/std/syscalls/syscall.defustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/scripts/000077500000000000000000000000001471504326700205115ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/scripts/compel-pack.lds.S000066400000000000000000000007531471504326700236160ustar00rootroot00000000000000OUTPUT_ARCH(aarch64) EXTERN(__export_parasite_head_start) SECTIONS { .crblob 0x0 : { *(.head.text) ASSERT(DEFINED(__export_parasite_head_start), "Symbol __export_parasite_head_start is missing"); *(.text*) . = ALIGN(32); *(.data*) . = ALIGN(32); *(.rodata*) . = ALIGN(32); *(.bss*) . = ALIGN(32); *(.got*) . = ALIGN(32); *(.toc*) . 
= ALIGN(32); } =0x00000000, /DISCARD/ : { *(.debug*) *(.comment*) *(.note*) *(.group*) *(.eh_frame*) *(*) } } crac-criu-1.5.0/compel/arch/aarch64/src/000077500000000000000000000000001471504326700176115ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/src/lib/000077500000000000000000000000001471504326700203575ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/src/lib/cpu.c000066400000000000000000000025631471504326700213200ustar00rootroot00000000000000#include #include #include "compel-cpu.h" #include "common/bitops.h" #include "log.h" #undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_info; static void fetch_rt_cpuinfo(void) { static bool rt_info_done = false; if (!rt_info_done) { compel_cpuid(&rt_info); rt_info_done = true; } } void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { } void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { } int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { return 0; } int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) { return 0; } int compel_cpuid(compel_cpuinfo_t *info) { return 0; } bool compel_cpu_has_feature(unsigned int feature) { fetch_rt_cpuinfo(); return compel_test_cpu_cap(&rt_info, feature); } bool compel_fpu_has_feature(unsigned int feature) { fetch_rt_cpuinfo(); return compel_test_fpu_cap(&rt_info, feature); } uint32_t compel_fpu_feature_size(unsigned int feature) { fetch_rt_cpuinfo(); return 0; } uint32_t compel_fpu_feature_offset(unsigned int feature) { fetch_rt_cpuinfo(); return 0; } void compel_cpu_clear_feature(unsigned int feature) { fetch_rt_cpuinfo(); return compel_clear_cpu_cap(&rt_info, feature); } void compel_cpu_copy_cpuinfo(compel_cpuinfo_t *c) { fetch_rt_cpuinfo(); memcpy(c, &rt_info, sizeof(rt_info)); } 
crac-criu-1.5.0/compel/arch/aarch64/src/lib/handle-elf-host.c000077700000000000000000000000001471504326700256332handle-elf.custar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/src/lib/handle-elf.c000066400000000000000000000014771471504326700225330ustar00rootroot00000000000000#include #include #include "handle-elf.h" #include "piegen.h" #include "log.h" static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; static const unsigned char __maybe_unused elf_ident_64_be[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; int handle_binary(void *mem, size_t size) { const unsigned char *elf_ident = #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ elf_ident_64_le; #else elf_ident_64_be; #endif if (memcmp(mem, elf_ident, sizeof(elf_ident_64_le)) == 0) return handle_elf_aarch64(mem, size); pr_err("Unsupported Elf format detected\n"); return -EINVAL; } crac-criu-1.5.0/compel/arch/aarch64/src/lib/include/000077500000000000000000000000001471504326700220025ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/src/lib/include/cpu.h000066400000000000000000000000001471504326700227300ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/src/lib/include/handle-elf.h000066400000000000000000000004511471504326700241520ustar00rootroot00000000000000#ifndef COMPEL_HANDLE_ELF_H__ #define COMPEL_HANDLE_ELF_H__ #include "elf64-types.h" #define __handle_elf handle_elf_aarch64 #define arch_is_machine_supported(e_machine) (e_machine == EM_AARCH64) extern int handle_elf_aarch64(void *mem, size_t size); #endif /* COMPEL_HANDLE_ELF_H__ */ crac-criu-1.5.0/compel/arch/aarch64/src/lib/include/syscall.h000066400000000000000000000002521471504326700236240ustar00rootroot00000000000000#ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ #define __NR(syscall, 
compat) \ ({ \ (void)compat; \ __NR_##syscall; \ }) #endif crac-criu-1.5.0/compel/arch/aarch64/src/lib/include/uapi/000077500000000000000000000000001471504326700227405ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/src/lib/include/uapi/asm/000077500000000000000000000000001471504326700235205ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/src/lib/include/uapi/asm/.gitignore000066400000000000000000000000001471504326700254760ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h000066400000000000000000000016641471504326700262210ustar00rootroot00000000000000#ifndef __COMPEL_BREAKPOINTS_H__ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT #include #include struct hwbp_cap { char arch; char bp_count; }; /* copied from `linux/arch/arm64/include/asm/hw_breakpoint.h` */ /* Lengths */ #define ARM_BREAKPOINT_LEN_1 0x1 #define ARM_BREAKPOINT_LEN_2 0x3 #define ARM_BREAKPOINT_LEN_3 0x7 #define ARM_BREAKPOINT_LEN_4 0xf #define ARM_BREAKPOINT_LEN_5 0x1f #define ARM_BREAKPOINT_LEN_6 0x3f #define ARM_BREAKPOINT_LEN_7 0x7f #define ARM_BREAKPOINT_LEN_8 0xff /* Privilege Levels */ #define AARCH64_BREAKPOINT_EL1 1 #define AARCH64_BREAKPOINT_EL0 2 /* Breakpoint */ #define ARM_BREAKPOINT_EXECUTE 0 /* Watchpoints */ #define ARM_BREAKPOINT_LOAD 1 #define ARM_BREAKPOINT_STORE 2 #define AARCH64_ESR_ACCESS_MASK (1 << 6) #define DISABLE_HBP 0 #define ENABLE_HBP 1 int ptrace_set_breakpoint(pid_t pid, void *addr); int ptrace_flush_breakpoints(pid_t pid); #endif crac-criu-1.5.0/compel/arch/aarch64/src/lib/include/uapi/asm/cpu.h000066400000000000000000000002141471504326700244550ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_CPU_H__ #define UAPI_COMPEL_ASM_CPU_H__ typedef struct { } compel_cpuinfo_t; #endif /* UAPI_COMPEL_ASM_CPU_H__ */ crac-criu-1.5.0/compel/arch/aarch64/src/lib/include/uapi/asm/fpu.h000066400000000000000000000001211471504326700244550ustar00rootroot00000000000000#ifndef 
__CR_ASM_FPU_H__ #define __CR_ASM_FPU_H__ #endif /* __CR_ASM_FPU_H__ */ crac-criu-1.5.0/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h000066400000000000000000000020271471504326700263040ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_TYPES_H__ #define UAPI_COMPEL_ASM_TYPES_H__ #include #include #include #include #define SIGMAX 64 #define SIGMAX_OLD 31 /* * Copied from the Linux kernel header arch/arm64/include/uapi/asm/ptrace.h * * A thread ARM CPU context */ typedef struct user_pt_regs user_regs_struct_t; typedef struct user_fpsimd_state user_fpregs_struct_t; #define __compel_arch_fetch_thread_area(tid, th) 0 #define compel_arch_fetch_thread_area(tctl) 0 #define compel_arch_get_tls_task(ctl, tls) #define compel_arch_get_tls_thread(tctl, tls) #define REG_RES(r) ((uint64_t)(r).regs[0]) #define REG_IP(r) ((uint64_t)(r).pc) #define SET_REG_IP(r, val) ((r).pc = (val)) #define REG_SP(r) ((uint64_t)((r).sp)) #define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[8]) #define user_regs_native(pregs) true #define ARCH_SI_TRAP TRAP_BRKPT #define __NR(syscall, compat) \ ({ \ (void)compat; \ __NR_##syscall; \ }) #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ crac-criu-1.5.0/compel/arch/aarch64/src/lib/include/uapi/asm/processor-flags.h000066400000000000000000000002121471504326700267750ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ #define UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ #endif /* UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ */ crac-criu-1.5.0/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h000066400000000000000000000041561471504326700254740ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ #include #include #include /* Copied from the kernel header arch/arm64/include/uapi/asm/sigcontext.h */ #define FPSIMD_MAGIC 0x46508001 typedef struct fpsimd_context fpu_state_t; struct aux_context { struct fpsimd_context fpsimd; /* additional context to be added before "end" */ struct _aarch64_ctx end; }; // 
XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include /* Copied from the kernel source arch/arm64/kernel/signal.c */ struct rt_sigframe { siginfo_t info; ucontext_t uc; uint64_t fp; uint64_t lr; }; /* clang-format off */ #define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ asm volatile( \ "mov sp, %0 \n" \ "mov x8, #"__stringify(__NR_rt_sigreturn)" \n" \ "svc #0 \n" \ : \ : "r"(new_sp) \ : "x8", "memory") /* clang-format on */ /* cr_sigcontext is copied from arch/arm64/include/uapi/asm/sigcontext.h */ struct cr_sigcontext { __u64 fault_address; /* AArch64 registers */ __u64 regs[31]; __u64 sp; __u64 pc; __u64 pstate; /* 4K reserved for FP/SIMD state and future expansion */ __u8 __reserved[4096] __attribute__((__aligned__(16))); }; #define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) #define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.pc) #define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) #define RT_SIGFRAME_SIGCONTEXT(rt_sigframe) ((struct cr_sigcontext *)&(rt_sigframe)->uc.uc_mcontext) #define RT_SIGFRAME_AUX_CONTEXT(rt_sigframe) ((struct aux_context *)&(RT_SIGFRAME_SIGCONTEXT(rt_sigframe)->__reserved)) #define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->fpsimd) #define RT_SIGFRAME_OFFSET(rt_sigframe) 0 #define rt_sigframe_erase_sigset(sigframe) memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) #define rt_sigframe_copy_sigset(sigframe, from) memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) #endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ crac-criu-1.5.0/compel/arch/aarch64/src/lib/infect.c000066400000000000000000000161221471504326700217750ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "common/page.h" #include "uapi/compel/asm/infect-types.h" #include "log.h" #include "errno.h" #include "infect.h" #include "infect-priv.h" #include "asm/breakpoints.h" unsigned __page_size = 0; 
unsigned __page_shift = 0; /* * Injected syscall instruction */ const char code_syscall[] = { 0x01, 0x00, 0x00, 0xd4, /* SVC #0 */ 0x00, 0x00, 0x20, 0xd4 /* BRK #0 */ }; static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); static inline void __always_unused __check_code_syscall(void) { BUILD_BUG_ON(code_syscall_aligned != BUILTIN_SYSCALL_SIZE); BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); } int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { struct fpsimd_context *fpsimd = RT_SIGFRAME_FPU(sigframe); memcpy(sigframe->uc.uc_mcontext.regs, regs->regs, sizeof(regs->regs)); sigframe->uc.uc_mcontext.sp = regs->sp; sigframe->uc.uc_mcontext.pc = regs->pc; sigframe->uc.uc_mcontext.pstate = regs->pstate; memcpy(fpsimd->vregs, fpregs->vregs, 32 * sizeof(__uint128_t)); fpsimd->fpsr = fpregs->fpsr; fpsimd->fpcr = fpregs->fpcr; fpsimd->head.magic = FPSIMD_MAGIC; fpsimd->head.size = sizeof(*fpsimd); return 0; } int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { user_fpregs_struct_t tmp, *fpsimd = ext_regs ? 
ext_regs : &tmp; struct iovec iov; int ret; pr_info("Dumping GP/FPU registers for %d\n", pid); iov.iov_base = regs; iov.iov_len = sizeof(user_regs_struct_t); if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))) { pr_perror("Failed to obtain CPU registers for %d", pid); goto err; } iov.iov_base = fpsimd; iov.iov_len = sizeof(*fpsimd); if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { pr_perror("Failed to obtain FPU registers for %d", pid); goto err; } ret = save(arg, regs, fpsimd); err: return ret; } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) { struct iovec iov; pr_info("Restoring GP/FPU registers for %d\n", pid); iov.iov_base = ext_regs; iov.iov_len = sizeof(*ext_regs); if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { pr_perror("Failed to set FPU registers for %d", pid); return -1; } return 0; } int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { user_regs_struct_t regs = ctl->orig.regs; int err; regs.regs[8] = (unsigned long)nr; regs.regs[0] = arg1; regs.regs[1] = arg2; regs.regs[2] = arg3; regs.regs[3] = arg4; regs.regs[4] = arg5; regs.regs[5] = arg6; regs.regs[6] = 0; regs.regs[7] = 0; err = compel_execute_syscall(ctl, ®s, code_syscall); *ret = regs.regs[0]; return err; } void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { long map; int err; err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset); if (err < 0 || (long)map < 0) map = 0; return (void *)map; } void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { regs->pc = new_ip; if (stack) regs->sp = (unsigned long)stack; } bool arch_can_dump_task(struct parasite_ctl *ctl) { /* * TODO: Add proper check here */ return true; } int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe 
*s) { long ret; int err; err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->uc.uc_stack, 0, 0, 0, 0); return err ? err : ret; } /* * Range for task size calculated from the following Linux kernel files: * arch/arm64/include/asm/memory.h * arch/arm64/Kconfig * * TODO: handle 32 bit tasks */ #define TASK_SIZE_MIN (1UL << 39) #define TASK_SIZE_MAX (1UL << 48) unsigned long compel_task_size(void) { unsigned long task_size; for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size <<= 1) if (munmap((void *)task_size, page_size())) break; return task_size; } static struct hwbp_cap *ptrace_get_hwbp_cap(pid_t pid) { static struct hwbp_cap info; static int available = -1; if (available == -1) { unsigned int val; struct iovec iovec = { .iov_base = &val, .iov_len = sizeof(val), }; if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_HW_BREAK, &iovec) < 0) available = 0; else { info.arch = (char)((val >> 8) & 0xff); info.bp_count = (char)(val & 0xff); available = (info.arch != 0); } } return available == 1 ? &info : NULL; } int ptrace_set_breakpoint(pid_t pid, void *addr) { k_rtsigset_t block; struct hwbp_cap *info = ptrace_get_hwbp_cap(pid); struct user_hwdebug_state regs = {}; unsigned int ctrl = 0; struct iovec iovec; if (info == NULL || info->bp_count == 0) return 0; /* * The struct is copied from `arch/arm64/include/asm/hw_breakpoint.h` in * linux kernel: * struct arch_hw_breakpoint_ctrl { * __u32 __reserved : 19, * len : 8, * type : 2, * privilege : 2, * enabled : 1; * }; * * The part of `struct arch_hw_breakpoint_ctrl` bits meaning is defined * in <>, * D13.3.2 DBGBCR_EL1, Debug Breakpoint Control Registers. 
*/ ctrl = ARM_BREAKPOINT_LEN_4; ctrl = (ctrl << 2) | ARM_BREAKPOINT_EXECUTE; ctrl = (ctrl << 2) | AARCH64_BREAKPOINT_EL0; ctrl = (ctrl << 1) | ENABLE_HBP; regs.dbg_regs[0].addr = (__u64)addr; regs.dbg_regs[0].ctrl = ctrl; iovec.iov_base = ®s; iovec.iov_len = (offsetof(struct user_hwdebug_state, dbg_regs) + sizeof(regs.dbg_regs[0])); if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_BREAK, &iovec)) return -1; /* * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler * will be reset to the default one. */ ksigfillset(&block); ksigdelset(&block, SIGTRAP); if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { pr_perror("Can't block signals for %d", pid); return -1; } if (ptrace(PTRACE_CONT, pid, NULL, NULL) != 0) { pr_perror("Unable to restart the stopped tracee process %d", pid); return -1; } return 1; } int ptrace_flush_breakpoints(pid_t pid) { struct hwbp_cap *info = ptrace_get_hwbp_cap(pid); struct user_hwdebug_state regs = {}; unsigned int ctrl = 0; struct iovec iovec; if (info == NULL || info->bp_count == 0) return 0; ctrl = ARM_BREAKPOINT_LEN_4; ctrl = (ctrl << 2) | ARM_BREAKPOINT_EXECUTE; ctrl = (ctrl << 2) | AARCH64_BREAKPOINT_EL0; ctrl = (ctrl << 1) | DISABLE_HBP; regs.dbg_regs[0].addr = 0ul; regs.dbg_regs[0].ctrl = ctrl; iovec.iov_base = ®s; iovec.iov_len = (offsetof(struct user_hwdebug_state, dbg_regs) + sizeof(regs.dbg_regs[0])); if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_BREAK, &iovec)) return -1; return 0; } 
crac-criu-1.5.0/compel/arch/arm/000077500000000000000000000000001471504326700163515ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/plugins/000077500000000000000000000000001471504326700200325ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/plugins/include/000077500000000000000000000000001471504326700214555ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/plugins/include/asm/000077500000000000000000000000001471504326700222355ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/plugins/include/asm/prologue.h000077700000000000000000000000001471504326700347572../../../../../arch/x86/plugins/include/asm/prologue.hustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/plugins/include/asm/syscall-types.h000066400000000000000000000011701471504326700252210ustar00rootroot00000000000000#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ #define COMPEL_ARCH_SYSCALL_TYPES_H__ #define SA_RESTORER 0x04000000 typedef void rt_signalfn_t(int, siginfo_t *, void *); typedef rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; #define _KNSIG 64 #define _NSIG_BPW 32 #define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) typedef struct { unsigned long sig[_KNSIG_WORDS]; } k_rtsigset_t; typedef struct { rt_sighandler_t rt_sa_handler; unsigned long rt_sa_flags; rt_sigrestore_t rt_sa_restorer; k_rtsigset_t rt_sa_mask; } rt_sigaction_t; #endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ crac-criu-1.5.0/compel/arch/arm/plugins/include/features.h000066400000000000000000000001511471504326700234410ustar00rootroot00000000000000#ifndef __COMPEL_ARCH_FEATURES_H #define __COMPEL_ARCH_FEATURES_H #endif /* __COMPEL_ARCH_FEATURES_H */ crac-criu-1.5.0/compel/arch/arm/plugins/std/000077500000000000000000000000001471504326700206245ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/plugins/std/parasite-head.S000066400000000000000000000003711471504326700234600ustar00rootroot00000000000000#include "common/asm/linkage.h" .section 
.head.text, "ax" ENTRY(__export_parasite_head_start) bl parasite_service .byte 0xf0, 0x01, 0xf0, 0xe7 @ the instruction UDF #32 generates the signal SIGTRAP in Linux END(__export_parasite_head_start) crac-criu-1.5.0/compel/arch/arm/plugins/std/syscalls/000077500000000000000000000000001471504326700224615ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/plugins/std/syscalls/Makefile.syscalls000066400000000000000000000036721471504326700257650ustar00rootroot00000000000000ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ asflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ sys-types := $(obj)/include/uapi/std/syscall-types.h sys-codes := $(obj)/include/uapi/std/syscall-codes.h sys-proto := $(obj)/include/uapi/std/syscall.h sys-def := $(PLUGIN_ARCH_DIR)/std/syscalls/syscall.def sys-asm-common-name := std/syscalls/syscall-common.S sys-asm-common := $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl.c sys-gen := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-syscalls.pl sys-gen-tbl := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-sys-exec-tbl.pl sys-asm := ./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls.S std-lib-y += $(sys-asm:.S=).o ifeq ($(ARCH),arm) arch_bits := 32 else arch_bits := 64 endif sys-exec-tbl := sys-exec-tbl.c $(sys-asm) $(sys-types) $(sys-codes) $(sys-proto): $(sys-gen) $(sys-def) $(sys-asm-common) $(sys-asm-types) $(E) " GEN " $@ $(Q) perl \ $(sys-gen) \ $(sys-def) \ $(sys-codes) \ $(sys-proto) \ $(sys-asm) \ $(sys-asm-common-name) \ $(sys-types) \ $(arch_bits) $(sys-asm:.S=).o: $(sys-asm) $(sys-exec-tbl): $(sys-gen-tbl) $(sys-def) $(E) " GEN " $@ $(Q) perl \ $(sys-gen-tbl) \ $(sys-def) \ $(sys-exec-tbl) \ $(arch_bits) $(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(call msg-gen, $@) $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.S 
$(obj)/include/uapi/std/syscall-aux.S $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.h $(obj)/include/uapi/std/syscall-aux.h std-headers-deps += $(sys-asm) $(sys-codes) $(sys-proto) $(sys-asm-types) mrproper-y += $(std-headers-deps) mrproper-y += $(obj)/include/uapi/std/syscall-aux.S mrproper-y += $(obj)/include/uapi/std/syscall-aux.h crac-criu-1.5.0/compel/arch/arm/plugins/std/syscalls/gen-sys-exec-tbl.pl000077500000000000000000000015741471504326700261160ustar00rootroot00000000000000#!/usr/bin/perl use strict; use warnings; my $in = $ARGV[0]; my $tblout = $ARGV[1]; my $bits = $ARGV[2]; my $code = "code$bits"; open TBLOUT, ">", $tblout or die $!; open IN, "<", $in or die $!; print TBLOUT "/* Autogenerated, don't edit */\n"; print TBLOUT "static struct syscall_exec_desc sc_exec_table[] = {\n"; for () { if ($_ =~ /\#/) { next; } my $sys_name; my $sys_num; if (/(?\S+)\s+(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { $sys_name = $+{alias}; } elsif (/(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { $sys_name = $+{name}; } else { unlink $tblout; die "Invalid syscall definition file: invalid entry $_\n"; } $sys_num = $+{$code}; if ($sys_num ne "!") { print TBLOUT "SYSCALL($sys_name, $sys_num)\n"; } } print TBLOUT " { }, /* terminator */"; print TBLOUT "};" crac-criu-1.5.0/compel/arch/arm/plugins/std/syscalls/gen-syscalls.pl000077500000000000000000000044621471504326700254330ustar00rootroot00000000000000#!/usr/bin/perl use strict; use warnings; my $in = $ARGV[0]; my $codesout = $ARGV[1]; my $codes = $ARGV[1]; $codes =~ s/.*include\/uapi\//compel\/plugins\//g; my $protosout = $ARGV[2]; my $protos = $ARGV[2]; $protos =~ s/.*include\/uapi\//compel\/plugins\//g; my $asmout = $ARGV[3]; my $asmcommon = $ARGV[4]; my $prototypes = $ARGV[5]; $prototypes =~ s/.*include\/uapi\//compel\/plugins\//g; my $bits = $ARGV[6]; my $codesdef = $codes; $codesdef =~ tr/.\-\//_/; my $protosdef = $protos; $protosdef =~ tr/.\-\//_/; my $code = "code$bits"; my $need_aux = 
0; unlink $codesout; unlink $protosout; unlink $asmout; open CODESOUT, ">", $codesout or die $!; open PROTOSOUT, ">", $protosout or die $!; open ASMOUT, ">", $asmout or die $!; open IN, "<", $in or die $!; print CODESOUT <<"END"; /* Autogenerated, don't edit */ #ifndef $codesdef #define $codesdef END print PROTOSOUT <<"END"; /* Autogenerated, don't edit */ #ifndef $protosdef #define $protosdef #include <$prototypes> #include <$codes> END print ASMOUT <<"END"; /* Autogenerated, don't edit */ #include <$codes> #include "$asmcommon" END for () { if ($_ =~ /\#/) { next; } my $code_macro; my $sys_macro; my $sys_name; if (/(?\S+)\s+(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { $code_macro = "__NR_$+{name}"; $sys_macro = "SYS_$+{name}"; $sys_name = "sys_$+{alias}"; } elsif (/(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { $code_macro = "__NR_$+{name}"; $sys_macro = "SYS_$+{name}"; $sys_name = "sys_$+{name}"; } else { unlink $codesout; unlink $protosout; unlink $asmout; die "Invalid syscall definition file: invalid entry $_\n"; } if ($+{$code} ne "!") { print CODESOUT "#ifndef $code_macro\n#define $code_macro $+{$code}\n#endif\n"; print CODESOUT "#ifndef $sys_macro\n#define $sys_macro $code_macro\n#endif\n"; print ASMOUT "syscall $sys_name, $code_macro\n"; } else { $need_aux = 1; } print PROTOSOUT "extern long $sys_name($+{args});\n"; } if ($need_aux == 1) { print ASMOUT "#include \n"; print CODESOUT "#include \n"; } print CODESOUT "#endif /* $codesdef */"; print PROTOSOUT "#endif /* $protosdef */"; crac-criu-1.5.0/compel/arch/arm/plugins/std/syscalls/syscall-aux.S000066400000000000000000000003211471504326700250460ustar00rootroot00000000000000nr_sys_mmap: .long 192 ENTRY(sys_mmap) push {r4, r5, r7, lr} ldr r4, [sp, #16] ldr r5, [sp, #20] lsr r5, #12 adr r7, nr_sys_mmap ldr r7, [r7] svc 0x00000000 pop {r4, r5, r7, pc} END(sys_mmap) 
crac-criu-1.5.0/compel/arch/arm/plugins/std/syscalls/syscall-aux.h000066400000000000000000000007701471504326700251030ustar00rootroot00000000000000#ifndef __NR_mmap2 #define __NR_mmap2 192 #endif #ifndef __ARM_NR_BASE #define __ARM_NR_BASE 0x0f0000 #endif #ifndef __ARM_NR_breakpoint #define __ARM_NR_breakpoint (__ARM_NR_BASE + 1) #endif #ifndef __ARM_NR_cacheflush #define __ARM_NR_cacheflush (__ARM_NR_BASE + 2) #endif #ifndef __ARM_NR_usr26 #define __ARM_NR_usr26 (__ARM_NR_BASE + 3) #endif #ifndef __ARM_NR_usr32 #define __ARM_NR_usr32 (__ARM_NR_BASE + 4) #endif #ifndef __ARM_NR_set_tls #define __ARM_NR_set_tls (__ARM_NR_BASE + 5) #endif crac-criu-1.5.0/compel/arch/arm/plugins/std/syscalls/syscall-common.S000066400000000000000000000013121471504326700255420ustar00rootroot00000000000000#include "common/asm/linkage.h" @ We use the register R8 unlike libc that uses R12. @ This avoids corruption of the register by the stub @ for the syscall sys_munmap() when syscalls are hooked @ by ptrace(). However we have to make sure that @ the compiler doesn't use the register on the route @ between parasite_service() and sys_munmap(). syscall_common: ldr r7, [r7] add r8, sp, #24 ldm r8, {r4, r5, r6} svc 0x00000000 pop {r4, r5, r6, r7, r8, pc} .macro syscall name, nr .nr_\name : .long \nr ENTRY(\name) push {r4, r5, r6, r7, r8, lr} adr r7, .nr_\name b syscall_common END(\name) .endm ENTRY(__cr_restore_rt) adr r7, .nr_sys_rt_sigreturn ldr r7, [r7] svc #0 END(__cr_restore_rt) crac-criu-1.5.0/compel/arch/arm/plugins/std/syscalls/syscall.def000066400000000000000000000176111471504326700246210ustar00rootroot00000000000000# # System calls table, please make sure the table consists of only the syscalls # really used somewhere in the project. # # The template is (name and arguments are optional if you need only __NR_x # defined, but no real entry point in syscalls lib). 
# # name/alias code64 code32 arguments # ----------------------------------------------------------------------- # read 63 3 (int fd, void *buf, unsigned long count) write 64 4 (int fd, const void *buf, unsigned long count) open ! 5 (const char *filename, unsigned long flags, unsigned long mode) close 57 6 (int fd) lseek 62 19 (int fd, unsigned long offset, unsigned long origin) mmap 222 ! (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) mprotect 226 125 (const void *addr, unsigned long len, unsigned long prot) munmap 215 91 (void *addr, unsigned long len) brk 214 45 (void *addr) rt_sigaction sigaction 134 174 (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) rt_sigprocmask sigprocmask 135 175 (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) rt_sigreturn 139 173 (void) ioctl 29 54 (unsigned int fd, unsigned int cmd, unsigned long arg) pread64 67 180 (unsigned int fd, char *buf, size_t count, loff_t pos) ptrace 117 26 (long request, pid_t pid, void *addr, void *data) mremap 216 163 (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flag, unsigned long new_addr) mincore 232 219 (void *addr, unsigned long size, unsigned char *vec) madvise 233 220 (unsigned long start, size_t len, int behavior) shmat 196 305 (int shmid, void *shmaddr, int shmflag) pause 1061 29 (void) nanosleep 101 162 (struct timespec *req, struct timespec *rem) getitimer 102 105 (int which, const struct itimerval *val) setitimer 103 104 (int which, const struct itimerval *val, struct itimerval *old) getpid 172 20 (void) socket 198 281 (int domain, int type, int protocol) connect 203 283 (int sockfd, struct sockaddr *addr, int addrlen) sendto 206 290 (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) 
sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags) recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags) shutdown 210 293 (int sockfd, int how) bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen) setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen) getsockopt 209 295 (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) exit 93 1 (unsigned long error_code) wait4 260 114 (int pid, int *status, int options, struct rusage *ru) waitid 95 280 (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) kill 129 37 (long pid, int sig) fcntl 25 55 (int fd, int type, long arg) flock 32 143 (int fd, unsigned long cmd) mkdir ! 39 (const char *name, int mode) rmdir ! 40 (const char *name) unlink ! 10 (char *pathname) readlinkat 78 332 (int fd, const char *path, char *buf, int bufsize) umask 166 60 (int mask) getgroups 158 205 (int gsize, unsigned int *groups) setgroups 159 206 (int gsize, unsigned int *groups) setresuid 147 164 (int uid, int euid, int suid) getresuid 148 165 (int *uid, int *euid, int *suid) setresgid 149 170 (int gid, int egid, int sgid) getresgid 150 171 (int *gid, int *egid, int *sgid) getpgid 155 132 (pid_t pid) setfsuid 151 138 (int fsuid) setfsgid 152 139 (int fsgid) getsid 156 147 (void) capget 90 184 (struct cap_header *h, struct cap_data *d) capset 91 185 (struct cap_header *h, struct cap_data *d) rt_sigqueueinfo 138 178 (pid_t pid, int sig, siginfo_t *info) setpriority 140 97 (int which, int who, int nice) sched_setscheduler 119 156 (int pid, int policy, struct sched_param *p) sigaltstack 132 186 (const void *uss, void *uoss) personality 92 136 (unsigned int personality) prctl 167 172 (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) arch_prctl ! 
17 (int option, unsigned long addr) setrlimit 164 75 (int resource, struct krlimit *rlim) mount 40 21 (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) umount2 39 52 (char *name, int flags) gettid 178 224 (void) futex 98 240 (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) set_tid_address 96 256 (int *tid_addr) restart_syscall 128 0 (void) timer_create 107 257 (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) timer_gettime 108 259 (int timer_id, const struct itimerspec *setting) timer_getoverrun 109 260 (int timer_id) timer_delete 111 261 (kernel_timer_t timer_id) clock_gettime 113 263 (const clockid_t which_clock, const struct timespec *tp) exit_group 94 248 (int error_code) set_robust_list 99 338 (struct robust_list_head *head, size_t len) get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) signalfd4 74 355 (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) rt_tgsigqueueinfo 240 363 (pid_t tgid, pid_t pid, int sig, siginfo_t *info) vmsplice 75 343 (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) timerfd_settime 86 353 (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) fanotify_init 262 367 (unsigned int flags, unsigned int event_f_flags) fanotify_mark 263 368 (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) open_by_handle_at 265 371 (int mountdirfd, struct file_handle *handle, int flags) setns 268 375 (int fd, int nstype) kcmp 272 378 (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) openat 56 322 (int dirfd, const char *pathname, int flags, mode_t mode) mkdirat 34 323 (int dirfd, const char *pathname, mode_t mode) unlinkat 35 328 (int dirfd, const char *pathname, int 
flags) memfd_create 279 385 (const char *name, unsigned int flags) io_setup 0 243 (unsigned nr_events, aio_context_t *ctx) io_submit 2 246 (aio_context_t ctx_id, long nr, struct iocb **iocbpp) io_getevents 4 245 (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) seccomp 277 383 (unsigned int op, unsigned int flags, const char *uargs) gettimeofday 169 78 (struct timeval *tv, struct timezone *tz) preadv_raw 69 361 (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) userfaultfd 282 388 (int flags) fallocate 47 352 (int fd, int mode, loff_t offset, loff_t len) cacheflush ! 983042 (void *start, void *end, int flags) ppoll 73 336 (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) open_tree 428 428 (int dirfd, const char *pathname, unsigned int flags) move_mount 429 429 (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) fsopen 430 430 (char *fsname, unsigned int flags) fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) clone3 435 435 (struct clone_args *uargs, size_t size) pidfd_open 434 434 (pid_t pid, unsigned int flags) openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) rseq 293 398 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) membarrier 283 389 (int cmd, unsigned int flags, int cpu_id) crac-criu-1.5.0/compel/arch/arm/scripts/000077500000000000000000000000001471504326700200405ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/scripts/compel-pack.lds.S000066400000000000000000000007471471504326700231500ustar00rootroot00000000000000OUTPUT_ARCH(arm) EXTERN(__export_parasite_head_start) SECTIONS { .crblob 0x0 : { *(.head.text) ASSERT(DEFINED(__export_parasite_head_start), "Symbol 
__export_parasite_head_start is missing"); *(.text*) . = ALIGN(32); *(.data*) . = ALIGN(32); *(.rodata*) . = ALIGN(32); *(.bss*) . = ALIGN(32); *(.got*) . = ALIGN(32); *(.toc*) . = ALIGN(32); } =0x00000000, /DISCARD/ : { *(.debug*) *(.comment*) *(.note*) *(.group*) *(.eh_frame*) *(*) } } crac-criu-1.5.0/compel/arch/arm/src/000077500000000000000000000000001471504326700171405ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/src/lib/000077500000000000000000000000001471504326700177065ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/src/lib/cpu.c000077700000000000000000000000001471504326700252152../../../aarch64/src/lib/cpu.custar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/src/lib/handle-elf-host.c000077700000000000000000000000001471504326700251622handle-elf.custar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/src/lib/handle-elf.c000066400000000000000000000007621471504326700220560ustar00rootroot00000000000000#include #include #include "handle-elf.h" #include "piegen.h" #include "log.h" static const unsigned char __maybe_unused elf_ident_32[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x01, 0x01, 0x01, 0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; int handle_binary(void *mem, size_t size) { if (memcmp(mem, elf_ident_32, sizeof(elf_ident_32)) == 0) return handle_elf_arm(mem, size); pr_err("Unsupported Elf format detected\n"); return -EINVAL; } crac-criu-1.5.0/compel/arch/arm/src/lib/include/000077500000000000000000000000001471504326700213315ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/src/lib/include/cpu.h000066400000000000000000000000001471504326700222570ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/src/lib/include/handle-elf.h000066400000000000000000000004351471504326700235030ustar00rootroot00000000000000#ifndef COMPEL_HANDLE_ELF_H__ #define COMPEL_HANDLE_ELF_H__ #include "elf32-types.h" #define __handle_elf handle_elf_arm #define arch_is_machine_supported(e_machine) 
(e_machine == EM_ARM) extern int handle_elf_arm(void *mem, size_t size); #endif /* COMPEL_HANDLE_ELF_H__ */ crac-criu-1.5.0/compel/arch/arm/src/lib/include/syscall.h000066400000000000000000000002521471504326700231530ustar00rootroot00000000000000#ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ #define __NR(syscall, compat) \ ({ \ (void)compat; \ __NR_##syscall; \ }) #endif crac-criu-1.5.0/compel/arch/arm/src/lib/include/uapi/000077500000000000000000000000001471504326700222675ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/src/lib/include/uapi/asm/000077500000000000000000000000001471504326700230475ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/src/lib/include/uapi/asm/.gitignore000066400000000000000000000000001471504326700250250ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/arm/src/lib/include/uapi/asm/breakpoints.h000066400000000000000000000003771471504326700255500ustar00rootroot00000000000000#ifndef __COMPEL_BREAKPOINTS_H__ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT static inline int ptrace_set_breakpoint(pid_t pid, void *addr) { return 0; } static inline int ptrace_flush_breakpoints(pid_t pid) { return 0; } #endif crac-criu-1.5.0/compel/arch/arm/src/lib/include/uapi/asm/cpu.h000066400000000000000000000002141471504326700240040ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_CPU_H__ #define UAPI_COMPEL_ASM_CPU_H__ typedef struct { } compel_cpuinfo_t; #endif /* UAPI_COMPEL_ASM_CPU_H__ */ crac-criu-1.5.0/compel/arch/arm/src/lib/include/uapi/asm/fpu.h000066400000000000000000000001211471504326700240040ustar00rootroot00000000000000#ifndef __CR_ASM_FPU_H__ #define __CR_ASM_FPU_H__ #endif /* __CR_ASM_FPU_H__ */ crac-criu-1.5.0/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h000066400000000000000000000033111471504326700256300ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_TYPES_H__ #define UAPI_COMPEL_ASM_TYPES_H__ #include #include #define SIGMAX 64 #define SIGMAX_OLD 31 /* * Copied from 
the Linux kernel header arch/arm/include/asm/ptrace.h * * A thread ARM CPU context */ typedef struct { long uregs[18]; } user_regs_struct_t; #define __compel_arch_fetch_thread_area(tid, th) 0 #define compel_arch_fetch_thread_area(tctl) 0 #define compel_arch_get_tls_task(ctl, tls) #define compel_arch_get_tls_thread(tctl, tls) typedef struct user_vfp user_fpregs_struct_t; #define ARM_cpsr uregs[16] #define ARM_pc uregs[15] #define ARM_lr uregs[14] #define ARM_sp uregs[13] #define ARM_ip uregs[12] #define ARM_fp uregs[11] #define ARM_r10 uregs[10] #define ARM_r9 uregs[9] #define ARM_r8 uregs[8] #define ARM_r7 uregs[7] #define ARM_r6 uregs[6] #define ARM_r5 uregs[5] #define ARM_r4 uregs[4] #define ARM_r3 uregs[3] #define ARM_r2 uregs[2] #define ARM_r1 uregs[1] #define ARM_r0 uregs[0] #define ARM_ORIG_r0 uregs[17] /* Copied from arch/arm/include/asm/user.h */ struct user_vfp { unsigned long long fpregs[32]; unsigned long fpscr; }; struct user_vfp_exc { unsigned long fpexc; unsigned long fpinst; unsigned long fpinst2; }; #define REG_RES(regs) ((regs).ARM_r0) #define REG_IP(regs) ((regs).ARM_pc) #define SET_REG_IP(regs, val) ((regs).ARM_pc = (val)) #define REG_SP(regs) ((regs).ARM_sp) #define REG_SYSCALL_NR(regs) ((regs).ARM_r7) #define user_regs_native(pregs) true #define ARCH_SI_TRAP TRAP_BRKPT #define __NR(syscall, compat) \ ({ \ (void)compat; \ __NR_##syscall; \ }) #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ crac-criu-1.5.0/compel/arch/arm/src/lib/include/uapi/asm/processor-flags.h000066400000000000000000000021151471504326700263300ustar00rootroot00000000000000#ifndef __CR_PROCESSOR_FLAGS_H__ #define __CR_PROCESSOR_FLAGS_H__ /* Copied from the Linux kernel header arch/arm/include/uapi/asm/ptrace.h */ /* * PSR bits */ #define USR26_MODE 0x00000000 #define FIQ26_MODE 0x00000001 #define IRQ26_MODE 0x00000002 #define SVC26_MODE 0x00000003 #define USR_MODE 0x00000010 #define FIQ_MODE 0x00000011 #define IRQ_MODE 0x00000012 #define SVC_MODE 0x00000013 #define ABT_MODE 0x00000017 
#define UND_MODE 0x0000001b #define SYSTEM_MODE 0x0000001f #define MODE32_BIT 0x00000010 #define MODE_MASK 0x0000001f #define PSR_T_BIT 0x00000020 #define PSR_F_BIT 0x00000040 #define PSR_I_BIT 0x00000080 #define PSR_A_BIT 0x00000100 #define PSR_E_BIT 0x00000200 #define PSR_J_BIT 0x01000000 #define PSR_Q_BIT 0x08000000 #define PSR_V_BIT 0x10000000 #define PSR_C_BIT 0x20000000 #define PSR_Z_BIT 0x40000000 #define PSR_N_BIT 0x80000000 /* * Groups of PSR bits */ #define PSR_f 0xff000000 /* Flags */ #define PSR_s 0x00ff0000 /* Status */ #define PSR_x 0x0000ff00 /* Extension */ #define PSR_c 0x000000ff /* Control */ #endif crac-criu-1.5.0/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h000066400000000000000000000045501471504326700250210ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ #include /* Copied from the Linux kernel header arch/arm/include/asm/sigcontext.h */ struct rt_sigcontext { unsigned long trap_no; unsigned long error_code; unsigned long oldmask; unsigned long arm_r0; unsigned long arm_r1; unsigned long arm_r2; unsigned long arm_r3; unsigned long arm_r4; unsigned long arm_r5; unsigned long arm_r6; unsigned long arm_r7; unsigned long arm_r8; unsigned long arm_r9; unsigned long arm_r10; unsigned long arm_fp; unsigned long arm_ip; unsigned long arm_sp; unsigned long arm_lr; unsigned long arm_pc; unsigned long arm_cpsr; unsigned long fault_address; }; /* Copied from the Linux kernel header arch/arm/include/asm/ucontext.h */ #define VFP_MAGIC 0x56465001 #define VFP_STORAGE_SIZE sizeof(struct vfp_sigframe) struct vfp_sigframe { unsigned long magic; unsigned long size; struct user_vfp ufp; struct user_vfp_exc ufp_exc; }; typedef struct vfp_sigframe fpu_state_t; struct aux_sigframe { /* struct crunch_sigframe crunch; struct iwmmxt_sigframe iwmmxt; */ struct vfp_sigframe vfp; unsigned long end_magic; } __attribute__((aligned(8))); #include struct sigframe { struct rt_ucontext uc; unsigned long retcode[2]; }; 
struct rt_sigframe { struct rt_siginfo info; struct sigframe sig; }; /* clang-format off */ #define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ asm volatile( \ "mov sp, %0 \n" \ "mov r7, #"__stringify(__NR_rt_sigreturn)" \n" \ "svc #0 \n" \ : \ : "r"(new_sp) \ : "memory") /* clang-format on */ #define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->sig.uc) #define RT_SIGFRAME_REGIP(rt_sigframe) (rt_sigframe)->sig.uc.uc_mcontext.arm_ip #define RT_SIGFRAME_HAS_FPU(rt_sigframe) 1 #define RT_SIGFRAME_AUX_SIGFRAME(rt_sigframe) ((struct aux_sigframe *)&(rt_sigframe)->sig.uc.uc_regspace) #define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_SIGFRAME(rt_sigframe)->vfp) #define RT_SIGFRAME_OFFSET(rt_sigframe) 0 #define rt_sigframe_erase_sigset(sigframe) memset(&sigframe->sig.uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) #define rt_sigframe_copy_sigset(sigframe, from) memcpy(&sigframe->sig.uc.uc_sigmask, from, sizeof(k_rtsigset_t)) #endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ crac-criu-1.5.0/compel/arch/arm/src/lib/infect.c000066400000000000000000000123161471504326700213250ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "common/page.h" #include "uapi/compel/asm/infect-types.h" #include "log.h" #include "errno.h" #include "infect.h" #include "infect-priv.h" /* * Injected syscall instruction */ const char code_syscall[] = { 0x00, 0x00, 0x00, 0xef, /* SVC #0 */ 0xf0, 0x01, 0xf0, 0xe7 /* UDF #32 */ }; static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); static inline __always_unused void __check_code_syscall(void) { BUILD_BUG_ON(code_syscall_aligned != BUILTIN_SYSCALL_SIZE); BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); } int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { struct aux_sigframe *aux = (struct aux_sigframe *)(void *)&sigframe->sig.uc.uc_regspace; sigframe->sig.uc.uc_mcontext.arm_r0 = regs->ARM_r0; 
sigframe->sig.uc.uc_mcontext.arm_r1 = regs->ARM_r1; sigframe->sig.uc.uc_mcontext.arm_r2 = regs->ARM_r2; sigframe->sig.uc.uc_mcontext.arm_r3 = regs->ARM_r3; sigframe->sig.uc.uc_mcontext.arm_r4 = regs->ARM_r4; sigframe->sig.uc.uc_mcontext.arm_r5 = regs->ARM_r5; sigframe->sig.uc.uc_mcontext.arm_r6 = regs->ARM_r6; sigframe->sig.uc.uc_mcontext.arm_r7 = regs->ARM_r7; sigframe->sig.uc.uc_mcontext.arm_r8 = regs->ARM_r8; sigframe->sig.uc.uc_mcontext.arm_r9 = regs->ARM_r9; sigframe->sig.uc.uc_mcontext.arm_r10 = regs->ARM_r10; sigframe->sig.uc.uc_mcontext.arm_fp = regs->ARM_fp; sigframe->sig.uc.uc_mcontext.arm_ip = regs->ARM_ip; sigframe->sig.uc.uc_mcontext.arm_sp = regs->ARM_sp; sigframe->sig.uc.uc_mcontext.arm_lr = regs->ARM_lr; sigframe->sig.uc.uc_mcontext.arm_pc = regs->ARM_pc; sigframe->sig.uc.uc_mcontext.arm_cpsr = regs->ARM_cpsr; memcpy(&aux->vfp.ufp.fpregs, &fpregs->fpregs, sizeof(aux->vfp.ufp.fpregs)); aux->vfp.ufp.fpscr = fpregs->fpscr; aux->vfp.magic = VFP_MAGIC; aux->vfp.size = VFP_STORAGE_SIZE; return 0; } int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } #define PTRACE_GETVFPREGS 27 int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { user_fpregs_struct_t tmp, *vfp = ext_regs ? ext_regs : &tmp; int ret = -1; pr_info("Dumping GP/FPU registers for %d\n", pid); if (ptrace(PTRACE_GETVFPREGS, pid, NULL, vfp)) { pr_perror("Can't obtain FPU registers for %d", pid); goto err; } /* Did we come from a system call? 
*/ if ((int)regs->ARM_ORIG_r0 >= 0) { /* Restart the system call */ switch ((long)(int)regs->ARM_r0) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: regs->ARM_r0 = regs->ARM_ORIG_r0; regs->ARM_pc -= 4; break; case -ERESTART_RESTARTBLOCK: pr_warn("Will restore %d with interrupted system call\n", pid); regs->ARM_r0 = -EINTR; break; } } ret = save(arg, regs, vfp); err: return ret; } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) { pr_info("Restoring GP/FPU registers for %d\n", pid); if (ptrace(PTRACE_SETVFPREGS, pid, NULL, ext_regs)) { pr_perror("Can't set FPU registers for %d", pid); return -1; } return 0; } int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { user_regs_struct_t regs = ctl->orig.regs; int err; regs.ARM_r7 = (unsigned long)nr; regs.ARM_r0 = arg1; regs.ARM_r1 = arg2; regs.ARM_r2 = arg3; regs.ARM_r3 = arg4; regs.ARM_r4 = arg5; regs.ARM_r5 = arg6; err = compel_execute_syscall(ctl, ®s, code_syscall); *ret = regs.ARM_r0; return err; } void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { long map; int err; if (offset & ~PAGE_MASK) return 0; err = compel_syscall(ctl, __NR_mmap2, &map, (unsigned long)addr, length, prot, flags, fd, offset >> 12); if (err < 0 || map > ctl->ictx.task_size) map = 0; return (void *)map; } void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { regs->ARM_pc = new_ip; if (stack) regs->ARM_sp = (unsigned long)stack; /* Make sure flags are in known state */ regs->ARM_cpsr &= PSR_f | PSR_s | PSR_x | MODE32_BIT; } bool arch_can_dump_task(struct parasite_ctl *ctl) { /* * TODO: Add proper check here */ return true; } int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) { long ret; int err; err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned 
long)&s->sig.uc.uc_stack, 0, 0, 0, 0); return err ? err : ret; } /* * Range for task size calculated from the following Linux kernel files: * arch/arm/include/asm/memory.h * arch/arm/Kconfig (PAGE_OFFSET values in Memory split section) */ #define TASK_SIZE_MIN 0x3f000000 #define TASK_SIZE_MAX 0xbf000000 #define SZ_1G 0x40000000 unsigned long compel_task_size(void) { unsigned long task_size; for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size += SZ_1G) if (munmap((void *)task_size, page_size())) break; return task_size; } crac-criu-1.5.0/compel/arch/loongarch64/000077500000000000000000000000001471504326700177205ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/loongarch64/plugins/000077500000000000000000000000001471504326700214015ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/loongarch64/plugins/include/000077500000000000000000000000001471504326700230245ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/loongarch64/plugins/include/asm/000077500000000000000000000000001471504326700236045ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/loongarch64/plugins/include/asm/prologue.h000066400000000000000000000012561471504326700256150ustar00rootroot00000000000000#ifndef __ASM_PROLOGUE_H__ #define __ASM_PROLOGUE_H__ #ifndef __ASSEMBLY__ #include #include #include #include #define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) typedef struct prologue_init_args { struct sockaddr_un ctl_sock_addr; unsigned int ctl_sock_addr_len; unsigned int arg_s; void *arg_p; void *sigframe; } prologue_init_args_t; #endif /* __ASSEMBLY__ */ /* * Reserve enough space for sigframe. * * FIXME It is rather should be taken from sigframe header. 
*/ #define PROLOGUE_SGFRAME_SIZE 4096 #define PROLOGUE_INIT_ARGS_SIZE 1024 #endif /* __ASM_PROLOGUE_H__ */ crac-criu-1.5.0/compel/arch/loongarch64/plugins/include/asm/syscall-types.h000066400000000000000000000014171471504326700265740ustar00rootroot00000000000000#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ #define COMPEL_ARCH_SYSCALL_TYPES_H__ #include /* Types for sigaction, sigprocmask syscalls */ typedef void rt_signalfn_t(int, siginfo_t *, void *); typedef rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; /* refer to arch/loongarch/include/uapi/asm/signal.h */ #define _KNSIG 64 #define _NSIG_BPW BITS_PER_LONG #define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) typedef struct { uint64_t sig[_KNSIG_WORDS]; } k_rtsigset_t; typedef struct { rt_sighandler_t rt_sa_handler; unsigned long rt_sa_flags; rt_sigrestore_t rt_sa_restorer; k_rtsigset_t rt_sa_mask; } rt_sigaction_t; #define SA_RESTORER 0x04000000 #endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ crac-criu-1.5.0/compel/arch/loongarch64/plugins/include/features.h000066400000000000000000000001511471504326700250100ustar00rootroot00000000000000#ifndef __COMPEL_ARCH_FEATURES_H #define __COMPEL_ARCH_FEATURES_H #endif /* __COMPEL_ARCH_FEATURES_H */ crac-criu-1.5.0/compel/arch/loongarch64/plugins/std/000077500000000000000000000000001471504326700221735ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/loongarch64/plugins/std/parasite-head.S000066400000000000000000000002441471504326700250260ustar00rootroot00000000000000 #include "common/asm/linkage.h" .section .head.text, "ax" ENTRY(__export_parasite_head_start) bl parasite_service; break 0; END(__export_parasite_head_start) crac-criu-1.5.0/compel/arch/loongarch64/plugins/std/syscalls/000077500000000000000000000000001471504326700240305ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls000066400000000000000000000113451471504326700273300ustar00rootroot00000000000000std-lib-y += 
./$(PLUGIN_ARCH_DIR)/std/syscalls-64.o sys-proto-types := $(obj)/include/uapi/std/syscall-types.h sys-proto-generic := $(obj)/include/uapi/std/syscall.h sys-codes-generic := $(obj)/include/uapi/std/syscall-codes.h sys-codes = $(obj)/include/uapi/std/syscall-codes-$(1).h sys-proto = $(obj)/include/uapi/std/syscall-$(1).h sys-def = $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_$(1).tbl sys-asm = $(PLUGIN_ARCH_DIR)/std/syscalls-$(1).S sys-asm-common-name = std/syscalls/syscall-common-loongarch-$(1).S sys-asm-common = $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl-$(1).c sys-bits := 64 AV := $$$$ define gen-rule-sys-codes $(sys-codes): $(sys-def) $(sys-proto-types) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) echo "#ifndef ASM_SYSCALL_CODES_H_$(1)__" >> $$@ $(Q) echo "#define ASM_SYSCALL_CODES_H_$(1)__" >> $$@ $(Q) cat $$< | awk '/^__NR/{SYSN=$(AV)1; \ sub("^__NR", "SYS", SYSN); \ print "\n#ifndef ", $(AV)1; \ print "#define", $(AV)1, $(AV)2; \ print "#endif"; \ print "\n#ifndef ", SYSN; \ print "#define ", SYSN, $(AV)1; \ print "#endif";}' >> $$@ $(Q) echo "#endif /* ASM_SYSCALL_CODES_H_$(1)__ */" >> $$@ endef define gen-rule-sys-proto $(sys-proto): $(sys-def) $(sys-proto-types) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) echo "#ifndef ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ $(Q) echo "#define ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ $(Q) echo '#include ' >> $$@ $(Q) echo '#include ' >> $$@ ifeq ($(1),32) $(Q) echo '#include "asm/syscall32.h"' >> $$@ endif $(Q) cat $$< | awk '/^__NR/{print "extern long", $(AV)3, \ substr($(AV)0, index($(AV)0,$(AV)4)), ";"}' >> $$@ $(Q) echo "#endif /* ASM_SYSCALL_PROTO_H_$(1)__ */" >> $$@ endef define gen-rule-sys-asm $(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(sys-proto-types) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) echo 
'#include ' >> $$@ $(Q) echo '#include "$(sys-asm-common-name)"' >> $$@ $(Q) cat $$< | awk '/^__NR/{print "SYSCALL(", $(AV)3, ",", $(AV)2, ")"}' >> $$@ endef define gen-rule-sys-exec-tbl $(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(sys-proto-types) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) cat $$< | awk '/^__NR/{print \ "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ endef $(sys-codes-generic): $(sys-proto-types) $(call msg-gen, $@) $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) echo '#include ' >> $@ $(Q) cat $< | awk '/^__NR/{NR32=$$1; \ sub("^__NR", "__NR32", NR32); \ print "\n#ifndef ", NR32; \ print "#define ", NR32, $$2; \ print "#endif";}' >> $@ $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ mrproper-y += $(sys-codes-generic) $(sys-proto-generic): $(strip $(call map,sys-proto,$(sys-bits))) $(sys-proto-types) $(call msg-gen, $@) $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ $(Q) echo "" >> $@ $(Q) echo '#include ' >> $@ $(Q) echo "" >> $@ $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ mrproper-y += $(sys-proto-generic) define gen-rule-sys-exec-tbl $(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) cat $$< | awk '/^__NR/{print \ "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ endef $(eval $(call map,gen-rule-sys-codes,$(sys-bits))) $(eval $(call map,gen-rule-sys-proto,$(sys-bits))) $(eval $(call map,gen-rule-sys-asm,$(sys-bits))) $(eval $(call map,gen-rule-sys-exec-tbl,$(sys-bits))) $(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(call msg-gen, $@) $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h 
$(sys-asm-types) std-headers-deps += $(call sys-codes,$(sys-bits)) std-headers-deps += $(call sys-proto,$(sys-bits)) std-headers-deps += $(call sys-asm,$(sys-bits)) std-headers-deps += $(call sys-exec-tbl,$(sys-bits)) std-headers-deps += $(sys-codes-generic) std-headers-deps += $(sys-proto-generic) std-headers-deps += $(sys-asm-types) mrproper-y += $(std-headers-deps) crac-criu-1.5.0/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S000066400000000000000000000014061471504326700313160ustar00rootroot00000000000000#include "common/asm/linkage.h" #define SYSCALL(name, opcode) \ ENTRY(name); \ addi.d $a7, $zero, opcode; \ syscall 0; \ jirl $r0, $r1, 0; \ END(name) #ifndef AT_FDCWD #define AT_FDCWD -100 #endif #ifndef AT_REMOVEDIR #define AT_REMOVEDIR 0x200 #endif ENTRY(sys_open) or $a3, $zero, $a2 or $a2, $zero, $a1 or $a1, $zero, $a0 addi.d $a0, $zero, AT_FDCWD b sys_openat END(sys_open) ENTRY(sys_mkdir) or $a3, $zero, $a2 or $a2, $zero, $a1 or $a1, $zero, $a0 addi.d $a0, $zero, AT_FDCWD b sys_mkdirat END(sys_mkdir) ENTRY(sys_rmdir) addi.d $a2, $zero, AT_REMOVEDIR or $a1, $zero, $a0 addi.d $a0, $zero, AT_FDCWD b sys_unlinkat END(sys_rmdir) ENTRY(__cr_restore_rt) addi.d $a7, $zero, __NR_rt_sigreturn syscall 0 END(__cr_restore_rt) crac-criu-1.5.0/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl000066400000000000000000000232001471504326700265130ustar00rootroot00000000000000# # System calls table, please make sure the table consist only the syscalls # really used somewhere in project. # from kernel/linux-3.10.84/arch/mips/include/uapi/asm/unistd.h Linux 64-bit syscalls are in the range from 5000 to 5999. 
# # __NR_name code name arguments # ------------------------------------------------------------------------------------------------------------------------------------------------------------- __NR_io_setup 0 sys_io_setup (unsigned nr_events, aio_context_t *ctx) __NR_io_submit 2 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) __NR_io_getevents 4 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) __NR_fcntl 25 sys_fcntl (int fd, int type, long arg) __NR_ioctl 29 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) __NR_flock 32 sys_flock (int fd, unsigned long cmd) __NR_mkdirat 34 sys_mkdirat (int dfd, const char *pathname, int flag) __NR_unlinkat 35 sys_unlinkat (int dfd, const char *pathname, int flag) __NR_umount2 39 sys_umount2 (char *name, int flags) __NR_mount 40 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) __NR_fallocate 47 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) __NR_close 57 sys_close (int fd) __NR_openat 56 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_lseek 62 sys_lseek (int fd, unsigned long offset, unsigned long origin) __NR_read 63 sys_read (int fd, void *buf, unsigned long count) __NR_write 64 sys_write (int fd, const void *buf, unsigned long count) __NR_pread64 67 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) __NR_preadv 69 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) __NR_ppoll 73 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) __NR_signalfd4 74 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) __NR_vmsplice 75 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) __NR_readlinkat 78 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) __NR_timerfd_settime 86 
sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) __NR_capget 90 sys_capget (struct cap_header *h, struct cap_data *d) __NR_capset 91 sys_capset (struct cap_header *h, struct cap_data *d) __NR_personality 92 sys_personality (unsigned int personality) __NR_exit 93 sys_exit (unsigned long error_code) __NR_exit_group 94 sys_exit_group (int error_code) __NR_waitid 95 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_set_tid_address 96 sys_set_tid_address (int *tid_addr) __NR_futex 98 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) __NR_set_robust_list 99 sys_set_robust_list (struct robust_list_head *head, size_t len) __NR_get_robust_list 100 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) __NR_nanosleep 101 sys_nanosleep (struct timespec *req, struct timespec *rem) __NR_getitimer 102 sys_getitimer (int which, const struct itimerval *val) __NR_setitimer 103 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) __NR_sys_timer_create 107 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) __NR_sys_timer_gettime 108 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 109 sys_timer_getoverrun (int timer_id) __NR_sys_timer_settime 110 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) __NR_sys_timer_delete 111 sys_timer_delete (kernel_timer_t timer_id) __NR_clock_gettime 113 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) __NR_sched_setscheduler 119 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) __NR_restart_syscall 128 sys_restart_syscall (void) __NR_kill 129 sys_kill (long pid, int sig) __NR_sigaltstack 132 sys_sigaltstack (const void *uss, void 
*uoss) __NR_rt_sigaction 134 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) __NR_rt_sigprocmask 135 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) __NR_rt_sigqueueinfo 138 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) __NR_rt_sigreturn 139 sys_rt_sigreturn (void) __NR_setpriority 140 sys_setpriority (int which, int who, int nice) __NR_setresuid 147 sys_setresuid (int uid, int euid, int suid) __NR_getresuid 148 sys_getresuid (int *uid, int *euid, int *suid) __NR_setresgid 149 sys_setresgid (int gid, int egid, int sgid) __NR_getresgid 150 sys_getresgid (int *gid, int *egid, int *sgid) __NR_getpgid 155 sys_getpgid (pid_t pid) __NR_setfsuid 151 sys_setfsuid (int fsuid) __NR_setfsgid 152 sys_setfsgid (int fsgid) __NR_getsid 156 sys_getsid (void) __NR_getgroups 158 sys_getgroups (int gsize, unsigned int *groups) __NR_setgroups 159 sys_setgroups (int gsize, unsigned int *groups) __NR_setrlimit 164 sys_setrlimit (int resource, struct krlimit *rlim) __NR_umask 166 sys_umask (int mask) __NR_prctl 167 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) __NR_gettimeofday 169 sys_gettimeofday (struct timeval *tv, struct timezone *tz) __NR_getpid 172 sys_getpid (void) __NR_ptrace 177 sys_ptrace (long request, pid_t pid, void *addr, void *data) __NR_gettid 178 sys_gettid (void) __NR_shmat 196 sys_shmat (int shmid, void *shmaddr, int shmflag) __NR_socket 198 sys_socket (int domain, int type, int protocol) __NR_bind 200 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) __NR_connect 203 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) __NR_sendto 206 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) __NR_recvfrom 207 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) __NR_setsockopt 208 
sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) __NR_getsockopt 209 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) __NR_shutdown 210 sys_shutdown (int sockfd, int how) __NR_sendmsg 211 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) __NR_recvmsg 212 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) __NR_brk 214 sys_brk (void *addr) __NR_munmap 215 sys_munmap (void *addr, unsigned long len) __NR_mremap 216 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) __NR_clone 220 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) __NR_mmap 222 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) __NR_mprotect 226 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) __NR_mincore 232 sys_mincore (void *addr, unsigned long size, unsigned char *vec) __NR_madvise 233 sys_madvise (unsigned long start, size_t len, int behavior) __NR_rt_tgsigqueueinfo 240 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) __NR_wait4 260 sys_wait4 (int pid, int *status, int options, struct rusage *ru) __NR_fanotify_init 262 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) __NR_fanotify_mark 263 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) __NR_open_by_handle_at 265 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) __NR_setns 268 sys_setns (int fd, int nstype) __NR_kcmp 272 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) __NR_seccomp 277 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) __NR_memfd_create 279 sys_memfd_create (const char *name, unsigned int flags) __NR_userfaultfd 282 sys_userfaultfd (int 
flags) __NR_membarrier 283 sys_membarrier (int cmd, unsigned int flags, int cpu_id) __NR_rseq 293 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) __NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) __NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) #__NR_dup2 ! sys_dup2 (int oldfd, int newfd) #__NR_rmdir ! sys_rmdir (const char *name) #__NR_unlink ! sys_unlink (char *pathname) #__NR_cacheflush ! sys_cacheflush (char *addr, int nbytes, int cache) #__NR_set_thread_area ! sys_set_thread_area (unsigned long *addr) #__NR_mkdir ! sys_mkdir (const char *name, int mode) #__NR_open ! sys_open (const char *filename, unsigned long flags, unsigned long mode) crac-criu-1.5.0/compel/arch/loongarch64/scripts/000077500000000000000000000000001471504326700214075ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/loongarch64/scripts/compel-pack.lds.S000066400000000000000000000007551471504326700245160ustar00rootroot00000000000000OUTPUT_ARCH(loongarch) EXTERN(__export_parasite_head_start) SECTIONS { .crblob 0x0 : { *(.head.text) ASSERT(DEFINED(__export_parasite_head_start), "Symbol __export_parasite_head_start is missing"); *(.text*) . = ALIGN(32); *(.data*) . = ALIGN(32); *(.rodata*) . = ALIGN(32); *(.bss*) . = ALIGN(32); *(.got*) . = ALIGN(32); *(.toc*) . 
= ALIGN(32); } =0x00000000, /DISCARD/ : { *(.debug*) *(.comment*) *(.note*) *(.group*) *(.eh_frame*) *(*) } } crac-criu-1.5.0/compel/arch/loongarch64/src/000077500000000000000000000000001471504326700205075ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/loongarch64/src/lib/000077500000000000000000000000001471504326700212555ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/loongarch64/src/lib/cpu.c000066400000000000000000000013121471504326700222050ustar00rootroot00000000000000#include #include #include "compel-cpu.h" #include "common/bitops.h" #include "common/compiler.h" #include "log.h" #undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_info; static bool rt_info_done = false; void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { } void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { } int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { return 0; } int compel_cpuid(compel_cpuinfo_t *c) { return 0; } bool compel_cpu_has_feature(unsigned int feature) { if (!rt_info_done) { compel_cpuid(&rt_info); rt_info_done = true; } return compel_test_cpu_cap(&rt_info, feature); } crac-criu-1.5.0/compel/arch/loongarch64/src/lib/handle-elf-host.c000066400000000000000000000010531471504326700243720ustar00rootroot00000000000000#include #include #include "handle-elf.h" #include "piegen.h" #include "log.h" static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; extern int __handle_elf(void *mem, size_t size); int handle_binary(void *mem, size_t size) { if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) return __handle_elf(mem, size); pr_err("Unsupported Elf format detected\n"); return -EINVAL; } crac-criu-1.5.0/compel/arch/loongarch64/src/lib/handle-elf.c000066400000000000000000000010531471504326700234170ustar00rootroot00000000000000#include #include 
#include "handle-elf.h" #include "piegen.h" #include "log.h" static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; extern int __handle_elf(void *mem, size_t size); int handle_binary(void *mem, size_t size) { if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) return __handle_elf(mem, size); pr_err("Unsupported Elf format detected\n"); return -EINVAL; } crac-criu-1.5.0/compel/arch/loongarch64/src/lib/include/000077500000000000000000000000001471504326700227005ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/loongarch64/src/lib/include/handle-elf.h000066400000000000000000000003041471504326700250450ustar00rootroot00000000000000#ifndef COMPEL_HANDLE_ELF_H__ #define COMPEL_HANDLE_ELF_H__ #include "elf64-types.h" #define arch_is_machine_supported(e_machine) (e_machine == EM_LOONGARCH) #endif /* COMPEL_HANDLE_ELF_H__ */ crac-criu-1.5.0/compel/arch/loongarch64/src/lib/include/syscall.h000066400000000000000000000001611471504326700245210ustar00rootroot00000000000000#ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ #ifndef SIGSTKFLT #define SIGSTKFLT 16 #endif #endif crac-criu-1.5.0/compel/arch/loongarch64/src/lib/include/uapi/000077500000000000000000000000001471504326700236365ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/loongarch64/src/lib/include/uapi/asm/000077500000000000000000000000001471504326700244165ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h000066400000000000000000000003221471504326700271050ustar00rootroot00000000000000#ifndef __COMPEL_BREAKPOINTS_H__ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT extern int ptrace_set_breakpoint(pid_t pid, void *addr); extern int ptrace_flush_breakpoints(pid_t pid); #endif 
crac-criu-1.5.0/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h000066400000000000000000000001661471504326700253610ustar00rootroot00000000000000#ifndef __CR_ASM_CPU_H__ #define __CR_ASM_CPU_H__ typedef struct { } compel_cpuinfo_t; #endif /* __CR_ASM_CPU_H__ */ crac-criu-1.5.0/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h000066400000000000000000000001211471504326700253530ustar00rootroot00000000000000#ifndef __CR_ASM_FPU_H__ #define __CR_ASM_FPU_H__ #endif /* __CR_ASM_FPU_H__ */ crac-criu-1.5.0/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h000066400000000000000000000027211471504326700272030ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_TYPES_H__ #define UAPI_COMPEL_ASM_TYPES_H__ #include #define SIGMAX 64 #define SIGMAX_OLD 31 /* * From the Linux kernel header arch/loongarch/include/uapi/asm/ptrace.h * * A thread LoongArch CPU context * * struct user_fp_state { * uint64_t fpr[32]; * uint64_t fcc; * uint32_t fcsr; * }; * * struct user_pt_regs { * unsigned long regs[32]; * unsigned long csr_era; * unsigned long csr_badv; * unsigned long reserved[11]; * }; */ struct user_gp_regs { uint64_t regs[32]; uint64_t orig_a0; uint64_t pc; uint64_t csr_badv; uint64_t reserved[10]; } __attribute__((aligned(8))); struct user_fp_regs { uint64_t regs[32]; uint64_t fcc; uint32_t fcsr; }; typedef struct user_gp_regs user_regs_struct_t; typedef struct user_fp_regs user_fpregs_struct_t; #define user_regs_native(regs) true #define __compel_arch_fetch_thread_area(tid, th) 0 #define compel_arch_fetch_thread_area(tctl) 0 #define compel_arch_get_tls_task(ctl, tls) #define compel_arch_get_tls_thread(tctl, tls) #define REG_RES(r) ((uint64_t)(r).regs[4]) #define REG_IP(r) ((uint64_t)(r).pc) #define REG_SP(r) ((uint64_t)(r).regs[3]) #define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[11]) #define SET_REG_IP(r, val) ((r).pc = (val)) #define GPR_NUM 32 #define FPR_NUM 32 #define __NR(syscall, compat) \ ({ \ (void)compat; \ __NR_##syscall; \ }) #endif /* 
UAPI_COMPEL_ASM_TYPES_H__ */ crac-criu-1.5.0/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h000066400000000000000000000055341471504326700263730ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ #include #include #include #include #include #include #define rt_sigcontext sigcontext /* sigcontext defined in usr/include/uapi/asm/sigcontext.h*/ #include typedef __u32 u32; typedef struct sigcontext_t { __u64 pc; __u64 regs[32]; __u32 flags; __u64 extcontext[0] __attribute__((__aligned__(16))); } sigcontext_t; typedef struct context_info_t { __u32 magic; __u32 size; __u64 padding; } context_info_t; #define FPU_CTX_MAGIC 0x46505501 #define FPU_CTX_ALIGN 8 typedef struct fpu_context_t { __u64 regs[32]; __u64 fcc; __u64 fcsr; } fpu_context_t; typedef struct ucontext { unsigned long uc_flags; struct ucontext *uc_link; stack_t uc_stack; sigset_t uc_sigmask; __u8 __unused[1024 / 8 - sizeof(sigset_t)]; sigcontext_t uc_mcontext; } ucontext; /* Copy from the kernel source arch/loongarch/kernel/signal.c */ struct rt_sigframe { rt_siginfo_t rs_info; ucontext rs_uc; }; #define RT_SIGFRAME_UC(rt_sigframe) (&(rt_sigframe->rs_uc)) #define RT_SIGFRAME_SIGMASK(rt_sigframe) ((k_rtsigset_t *)&RT_SIGFRAME_UC(rt_sigframe)->uc_sigmask) #define RT_SIGFRAME_SIGCTX(rt_sigframe) (&(RT_SIGFRAME_UC(rt_sigframe)->uc_mcontext)) #define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(RT_SIGFRAME_SIGCTX(rt_sigframe)->pc)) #define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) #define RT_SIGFRAME_FPU(rt_sigframe) \ ({ \ context_info_t *ctx = (context_info_t *)RT_SIGFRAME_SIGCTX(rt_sigframe)->extcontext; \ ctx->magic = FPU_CTX_MAGIC; \ ctx->size = sizeof(context_info_t) + sizeof(fpu_context_t); \ (fpu_context_t *)((char *)ctx + sizeof(context_info_t)); \ }) #define RT_SIGFRAME_OFFSET(rt_sigframe) 0 /* clang-format off */ #define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ asm volatile( \ "addi.d $sp, %0, 0 \n" \ "addi.d $a7, $zero, 
"__stringify(__NR_rt_sigreturn)" \n" \ "syscall 0" \ : \ :"r"(new_sp) \ : "$a7", "memory") /* clang-format on */ int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe); #define rt_sigframe_erase_sigset(sigframe) memset(RT_SIGFRAME_SIGMASK(sigframe), 0, sizeof(k_rtsigset_t)) #define rt_sigframe_copy_sigset(sigframe, from) memcpy(RT_SIGFRAME_SIGMASK(sigframe), from, sizeof(k_rtsigset_t)) #endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ crac-criu-1.5.0/compel/arch/loongarch64/src/lib/infect.c000066400000000000000000000113361471504326700226750ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "errno.h" #include #include #include "common/err.h" #include "common/page.h" #include "asm/infect-types.h" #include "ptrace.h" #include "infect.h" #include "infect-priv.h" #include "log.h" #include "common/bug.h" /* * Injected syscall instruction * loongarch64 is Little Endian */ const char code_syscall[] = { 0x00, 0x00, 0x2b, 0x00, /* syscall */ 0x00, 0x00, 0x2a, 0x00 /* break */ }; int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { sigcontext_t *sc; fpu_context_t *fpu; sc = RT_SIGFRAME_SIGCTX(sigframe); memcpy(sc->regs, regs->regs, sizeof(regs->regs)); sc->pc = regs->pc; fpu = RT_SIGFRAME_FPU(sigframe); memcpy(fpu->regs, fpregs->regs, sizeof(fpregs->regs)); fpu->fcc = fpregs->fcc; fpu->fcsr = fpregs->fcsr; return 0; } int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { user_fpregs_struct_t tmp, *fpregs = ext_regs ? 
ext_regs : &tmp; struct iovec iov; int ret; pr_info("Dumping GP/FPU registers for %d\n", pid); iov.iov_base = regs; iov.iov_len = sizeof(user_regs_struct_t); if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))) { pr_perror("Failed to obtain CPU registers for %d", pid); goto err; } /* * Refer to Linux kernel arch/loongarch/kernel/signal.c */ if (regs->regs[0]) { switch (regs->regs[4]) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: regs->regs[4] = regs->orig_a0; regs->pc -= 4; break; case -ERESTART_RESTARTBLOCK: regs->regs[4] = regs->orig_a0; regs->regs[11] = __NR_restart_syscall; regs->pc -= 4; break; } regs->regs[0] = 0; /* Don't deal with this again. */ } iov.iov_base = fpregs; iov.iov_len = sizeof(user_fpregs_struct_t); if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { pr_perror("Failed to obtain FPU registers for %d", pid); goto err; } ret = save(arg, regs, fpregs); err: return 0; } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) { struct iovec iov; pr_info("Restoring GP/FPU registers for %d\n", pid); iov.iov_base = ext_regs; iov.iov_len = sizeof(*ext_regs); if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { pr_perror("Failed to set FPU registers for %d", pid); return -1; } return 0; } /* * Registers $4 ~ $11 represents arguments a0 ~ a7, especially a7 is * used as syscall number. 
*/ int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { int err; user_regs_struct_t regs = ctl->orig.regs; regs.regs[11] = (unsigned long)nr; regs.regs[4] = arg1; regs.regs[5] = arg2; regs.regs[6] = arg3; regs.regs[7] = arg4; regs.regs[8] = arg5; regs.regs[9] = arg6; err = compel_execute_syscall(ctl, ®s, code_syscall); *ret = regs.regs[4]; return err; } void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { long map; int err; err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset >> PAGE_SHIFT); if (err < 0 || IS_ERR_VALUE(map)) { pr_err("remote mmap() failed: %s\n", strerror(-map)); return NULL; } return (void *)map; } /* * regs must be inited when calling this function from original context */ void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { regs->pc = new_ip; if (stack) regs->regs[4] = (unsigned long)stack; } bool arch_can_dump_task(struct parasite_ctl *ctl) { return true; } int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) { long ret; int err; err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->rs_uc.uc_stack, 0, 0, 0, 0); return err ? 
err : ret; } /* * TODO: add feature */ int ptrace_set_breakpoint(pid_t pid, void *addr) { return 0; } int ptrace_flush_breakpoints(pid_t pid) { return 0; } /* * Refer to Linux kernel arch/loongarch/include/asm/processor.h */ #define TASK_SIZE32 (1UL) << 31 #define TASK_SIZE64_MIN (1UL) << 40 #define TASK_SIZE64_MAX (1UL) << 48 unsigned long compel_task_size(void) { unsigned long task_size; for (task_size = TASK_SIZE64_MIN; task_size < TASK_SIZE64_MAX; task_size <<= 1) if (munmap((void *)task_size, page_size())) break; return task_size; } crac-criu-1.5.0/compel/arch/mips/000077500000000000000000000000001471504326700165425ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/mips/plugins/000077500000000000000000000000001471504326700202235ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/mips/plugins/include/000077500000000000000000000000001471504326700216465ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/mips/plugins/include/asm/000077500000000000000000000000001471504326700224265ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/mips/plugins/include/asm/prologue.h000066400000000000000000000012561471504326700244370ustar00rootroot00000000000000#ifndef __ASM_PROLOGUE_H__ #define __ASM_PROLOGUE_H__ #ifndef __ASSEMBLY__ #include #include #include #include #define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) typedef struct prologue_init_args { struct sockaddr_un ctl_sock_addr; unsigned int ctl_sock_addr_len; unsigned int arg_s; void *arg_p; void *sigframe; } prologue_init_args_t; #endif /* __ASSEMBLY__ */ /* * Reserve enough space for sigframe. * * FIXME It is rather should be taken from sigframe header. 
*/ #define PROLOGUE_SGFRAME_SIZE 4096 #define PROLOGUE_INIT_ARGS_SIZE 1024 #endif /* __ASM_PROLOGUE_H__ */ crac-criu-1.5.0/compel/arch/mips/plugins/include/asm/syscall-types.h000066400000000000000000000016561471504326700254230ustar00rootroot00000000000000#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ #define COMPEL_ARCH_SYSCALL_TYPES_H__ /* Types for sigaction, sigprocmask syscalls */ typedef void rt_signalfn_t(int, siginfo_t *, void *); typedef rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; #define SA_RESTORER 0x04000000 /** refer to linux-3.10/arch/mips/include/uapi/asm/signal.h*/ #define _KNSIG 128 #define _NSIG_BPW 64 #define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) /* * Note: as k_rtsigset_t is the same size for 32-bit and 64-bit, * sig defined as uint64_t rather than (unsigned long) - for the * purpose if we ever going to support native 32-bit compilation. */ typedef struct { uint64_t sig[_KNSIG_WORDS]; } k_rtsigset_t; typedef struct { rt_sighandler_t rt_sa_handler; unsigned long rt_sa_flags; rt_sigrestore_t rt_sa_restorer; k_rtsigset_t rt_sa_mask; } rt_sigaction_t; #endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ crac-criu-1.5.0/compel/arch/mips/plugins/include/features.h000066400000000000000000000002021471504326700236270ustar00rootroot00000000000000#ifndef __COMPEL_ARCH_FEATURES_H #define __COMPEL_ARCH_FEATURES_H #define ARCH_HAS_MEMCPY #endif /* __COMPEL_ARCH_FEATURES_H */ crac-criu-1.5.0/compel/arch/mips/plugins/std/000077500000000000000000000000001471504326700210155ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/mips/plugins/std/memcpy.S000066400000000000000000000004161471504326700224340ustar00rootroot00000000000000 #include "common/asm/linkage.h" .section .head.text, "ax" ENTRY(memcpy) .set noreorder dadd v0,zero,a0 daddiu t1,zero,0 loop: beq t1,a2,exit nop lb t2,0(a1) sb t2,0(a0) daddiu t1,t1,1 daddiu a0,a0,1 daddiu a1,a1,1 j loop nop exit: jr ra nop END(memcpy) 
crac-criu-1.5.0/compel/arch/mips/plugins/std/parasite-head.S000066400000000000000000000004171471504326700236520ustar00rootroot00000000000000 #include "common/asm/linkage.h" .section .head.text, "ax" ENTRY(__export_parasite_head_start) .set push .set noreorder jal parasite_service nop .byte 0x0d, 0x00, 0x00, 0x00 //break .set pop // .byte 0x40,0x01,0x00,0x00 //pause END(__export_parasite_head_start) crac-criu-1.5.0/compel/arch/mips/plugins/std/syscalls/000077500000000000000000000000001471504326700226525ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/mips/plugins/std/syscalls/Makefile.syscalls000066400000000000000000000113401471504326700261450ustar00rootroot00000000000000std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/syscalls-64.o sys-proto-types := $(obj)/include/uapi/std/syscall-types.h sys-proto-generic := $(obj)/include/uapi/std/syscall.h sys-codes-generic := $(obj)/include/uapi/std/syscall-codes.h sys-codes = $(obj)/include/uapi/std/syscall-codes-$(1).h sys-proto = $(obj)/include/uapi/std/syscall-$(1).h sys-def = $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_$(1).tbl sys-asm = $(PLUGIN_ARCH_DIR)/std/syscalls-$(1).S sys-asm-common-name = std/syscalls/syscall-common-mips-$(1).S sys-asm-common = $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl-$(1).c sys-bits := 64 AV := $$$$ define gen-rule-sys-codes $(sys-codes): $(sys-def) $(sys-proto-types) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) echo "#ifndef ASM_SYSCALL_CODES_H_$(1)__" >> $$@ $(Q) echo "#define ASM_SYSCALL_CODES_H_$(1)__" >> $$@ $(Q) cat $$< | awk '/^__NR/{SYSN=$(AV)1; \ sub("^__NR", "SYS", SYSN); \ print "\n#ifndef ", $(AV)1; \ print "#define", $(AV)1, $(AV)2; \ print "#endif"; \ print "\n#ifndef ", SYSN; \ print "#define ", SYSN, $(AV)1; \ print "#endif";}' >> $$@ $(Q) echo "#endif /* ASM_SYSCALL_CODES_H_$(1)__ */" >> $$@ endef define gen-rule-sys-proto $(sys-proto): 
$(sys-def) $(sys-proto-types) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) echo "#ifndef ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ $(Q) echo "#define ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ $(Q) echo '#include ' >> $$@ $(Q) echo '#include ' >> $$@ ifeq ($(1),32) $(Q) echo '#include "asm/syscall32.h"' >> $$@ endif $(Q) cat $$< | awk '/^__NR/{print "extern long", $(AV)3, \ substr($(AV)0, index($(AV)0,$(AV)4)), ";"}' >> $$@ $(Q) echo "#endif /* ASM_SYSCALL_PROTO_H_$(1)__ */" >> $$@ endef define gen-rule-sys-asm $(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(sys-proto-types) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) echo '#include ' >> $$@ $(Q) echo '#include "$(sys-asm-common-name)"' >> $$@ $(Q) cat $$< | awk '/^__NR/{print "SYSCALL(", $(AV)3, ",", $(AV)2, ")"}' >> $$@ endef define gen-rule-sys-exec-tbl $(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(sys-proto-types) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) cat $$< | awk '/^__NR/{print \ "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ endef $(sys-codes-generic): $(sys-proto-types) $(call msg-gen, $@) $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) echo '#include ' >> $@ $(Q) cat $< | awk '/^__NR/{NR32=$$1; \ sub("^__NR", "__NR32", NR32); \ print "\n#ifndef ", NR32; \ print "#define ", NR32, $$2; \ print "#endif";}' >> $@ $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ mrproper-y += $(sys-codes-generic) $(sys-proto-generic): $(strip $(call map,sys-proto,$(sys-bits))) $(sys-proto-types) $(call msg-gen, $@) $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ $(Q) echo "" >> $@ $(Q) echo '#include ' >> $@ $(Q) echo "" >> $@ $(Q) echo "#endif /* 
__ASM_CR_SYSCALL_PROTO_H__ */" >> $@ mrproper-y += $(sys-proto-generic) define gen-rule-sys-exec-tbl $(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) cat $$< | awk '/^__NR/{print \ "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ endef $(eval $(call map,gen-rule-sys-codes,$(sys-bits))) $(eval $(call map,gen-rule-sys-proto,$(sys-bits))) $(eval $(call map,gen-rule-sys-asm,$(sys-bits))) $(eval $(call map,gen-rule-sys-exec-tbl,$(sys-bits))) $(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(call msg-gen, $@) $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) std-headers-deps += $(call sys-codes,$(sys-bits)) std-headers-deps += $(call sys-proto,$(sys-bits)) std-headers-deps += $(call sys-asm,$(sys-bits)) std-headers-deps += $(call sys-exec-tbl,$(sys-bits)) std-headers-deps += $(sys-codes-generic) std-headers-deps += $(sys-proto-generic) std-headers-deps += $(sys-asm-types) mrproper-y += $(std-headers-deps) crac-criu-1.5.0/compel/arch/mips/plugins/std/syscalls/syscall-common-mips-64.S000066400000000000000000000003051471504326700271310ustar00rootroot00000000000000#include "common/asm/linkage.h" #define SYSCALL(name, opcode) \ ENTRY(name); \ li v0, opcode; \ syscall; \ jr ra; \ nop; \ END(name) ENTRY(__cr_restore_rt) END(__cr_restore_rt) crac-criu-1.5.0/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl000066400000000000000000000233741471504326700253510ustar00rootroot00000000000000# # System calls table, please make sure the table consist only the syscalls # really used somewhere in project. # from kernel/linux-3.10.84/arch/mips/include/uapi/asm/unistd.h Linux 64-bit syscalls are in the range from 5000 to 5999. 
# # __NR_name code name arguments # ------------------------------------------------------------------------------------------------------------------------------------------------------------- __NR_read 5000 sys_read (int fd, void *buf, unsigned long count) __NR_write 5001 sys_write (int fd, const void *buf, unsigned long count) __NR_open 5002 sys_open (const char *filename, unsigned long flags, unsigned long mode) __NR_close 5003 sys_close (int fd) __NR_lseek 5008 sys_lseek (int fd, unsigned long offset, unsigned long origin) __NR_mmap 5009 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) __NR_mprotect 5010 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) __NR_munmap 5011 sys_munmap (void *addr, unsigned long len) __NR_brk 5012 sys_brk (void *addr) __NR_rt_sigaction 5013 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) __NR_rt_sigprocmask 5014 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) __NR_rt_sigreturn 5211 sys_rt_sigreturn (void) __NR_ioctl 5015 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) __NR_pread64 5016 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) __NR_mremap 5024 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) __NR_mincore 5026 sys_mincore (void *addr, unsigned long size, unsigned char *vec) __NR_madvise 5027 sys_madvise (unsigned long start, size_t len, int behavior) __NR_shmat 5029 sys_shmat (int shmid, void *shmaddr, int shmflag) __NR_dup2 5032 sys_dup2 (int oldfd, int newfd) __NR_nanosleep 5034 sys_nanosleep (struct timespec *req, struct timespec *rem) __NR_getitimer 5035 sys_getitimer (int which, const struct itimerval *val) __NR_setitimer 5036 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) __NR_getpid 5038 sys_getpid (void) 
__NR_socket 5040 sys_socket (int domain, int type, int protocol) __NR_connect 5041 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) __NR_sendto 5043 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) __NR_recvfrom 5044 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) __NR_sendmsg 5045 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) __NR_recvmsg 5046 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) __NR_shutdown 5047 sys_shutdown (int sockfd, int how) __NR_bind 5048 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) __NR_setsockopt 5053 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) __NR_getsockopt 5054 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) __NR_clone 5055 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) __NR_exit 5058 sys_exit (unsigned long error_code) __NR_wait4 5059 sys_wait4 (int pid, int *status, int options, struct rusage *ru) __NR_kill 5060 sys_kill (long pid, int sig) __NR_fcntl 5070 sys_fcntl (int fd, int type, long arg) __NR_flock 5071 sys_flock (int fd, unsigned long cmd) __NR_mkdir 5081 sys_mkdir (const char *name, int mode) __NR_rmdir 5082 sys_rmdir (const char *name) __NR_unlink 5085 sys_unlink (char *pathname) __NR_umask 5093 sys_umask (int mask) __NR_gettimeofday 5094 sys_gettimeofday (struct timeval *tv, struct timezone *tz) __NR_ptrace 5099 sys_ptrace (long request, pid_t pid, void *addr, void *data) __NR_getgroups 5113 sys_getgroups (int gsize, unsigned int *groups) __NR_setgroups 5114 sys_setgroups (int gsize, unsigned int *groups) __NR_setresuid 5115 sys_setresuid (int uid, int euid, int suid) __NR_getresuid 5116 sys_getresuid (int *uid, int *euid, int *suid) __NR_setresgid 5117 sys_setresgid (int gid, int egid, int sgid) 
__NR_getresgid 5118 sys_getresgid (int *gid, int *egid, int *sgid) __NR_getpgid 5119 sys_getpgid (pid_t pid) __NR_setfsuid 5120 sys_setfsuid (int fsuid) __NR_setfsgid 5121 sys_setfsgid (int fsgid) __NR_getsid 5122 sys_getsid (void) __NR_capget 5123 sys_capget (struct cap_header *h, struct cap_data *d) __NR_capset 5124 sys_capset (struct cap_header *h, struct cap_data *d) __NR_rt_sigqueueinfo 5127 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) __NR_sigaltstack 5129 sys_sigaltstack (const void *uss, void *uoss) __NR_personality 5132 sys_personality (unsigned int personality) __NR_setpriority 5138 sys_setpriority (int which, int who, int nice) __NR_sched_setscheduler 5141 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) __NR_prctl 5153 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) __NR_setrlimit 5155 sys_setrlimit (int resource, struct krlimit *rlim) __NR_mount 5160 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) __NR_umount2 5161 sys_umount2 (char *name, int flags) __NR_gettid 5178 sys_gettid (void) __NR_futex 5194 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) __NR_cacheflush 5197 sys_cacheflush (char *addr, int nbytes, int cache) __NR_io_setup 5200 sys_io_setup (unsigned nr_events, aio_context_t *ctx) __NR_io_getevents 5202 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) __NR_io_submit 5203 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) __NR_set_tid_address 5212 sys_set_tid_address (int *tid_addr) __NR_restart_syscall 5213 sys_restart_syscall (void) __NR_sys_timer_create 5216 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) __NR_sys_timer_settime 5217 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct 
itimerspec *old_setting) __NR_sys_timer_gettime 5218 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 5219 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 5220 sys_timer_delete (kernel_timer_t timer_id) __NR_clock_gettime 5222 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) __NR_exit_group 5205 sys_exit_group (int error_code) __NR_set_thread_area 5242 sys_set_thread_area (unsigned long *addr) __NR_openat 5247 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_waitid 5237 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_readlinkat 5257 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) __NR_ppoll 5261 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) __NR_set_robust_list 5268 sys_set_robust_list (struct robust_list_head *head, size_t len) __NR_get_robust_list 5269 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) __NR_fallocate 5279 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) __NR_seccomp 5312 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) __NR_vmsplice 5266 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) __NR_timerfd_settime 5282 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) __NR_signalfd4 5283 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) __NR_preadv 5289 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) __NR_rt_tgsigqueueinfo 5291 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) __NR_fanotify_init 5295 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) __NR_fanotify_mark 5296 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const 
char *pathname) __NR_open_by_handle_at 5299 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) __NR_setns 5303 sys_setns (int fd, int nstype) __NR_kcmp 5306 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) __NR_memfd_create 5314 sys_memfd_create (const char *name, unsigned int flags) __NR_userfaultfd 5317 sys_userfaultfd (int flags) ##TODO for kernel __NR_open_tree 5428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) __NR_move_mount 5429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) __NR_fsopen 5430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 5431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 5432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 5435 sys_clone3 (struct clone_args *uargs, size_t size) __NR_pidfd_open 5434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 5437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 5438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 5327 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) __NR_membarrier 5318 sys_membarrier (int cmd, unsigned int flags, int cpu_id) crac-criu-1.5.0/compel/arch/mips/scripts/000077500000000000000000000000001471504326700202315ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/mips/scripts/compel-pack-compat.lds.S000066400000000000000000000001531471504326700246110ustar00rootroot00000000000000OUTPUT_ARCH(mips) EXTERN(__export_parasite_head_start) ASSERT(0,"Compatible PIEs are unsupported on mips") crac-criu-1.5.0/compel/arch/mips/scripts/compel-pack.lds.S000066400000000000000000000010251471504326700233270ustar00rootroot00000000000000OUTPUT_ARCH(mips) EXTERN(__export_parasite_head_start) SECTIONS { .text : { *(.head.text) ASSERT(DEFINED(__export_parasite_head_start), 
"Symbol __export_parasite_head_start is missing"); *(.text*) *(.compel.exit) *(.compel.init) /* .rodata section*/ *(.rodata*) *(.got*) /* .data section */ *(.data*) *(.bss*) *(.sbss*) *(.toc*) } /DISCARD/ : { /*segments need to discard */ *(.debug*) *(.pdr) *(.comment*) *(.note*) *(.group*) *(.eh_frame*) *(.MIPS.options) *(.gnu.attributes) } } crac-criu-1.5.0/compel/arch/mips/src/000077500000000000000000000000001471504326700173315ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/mips/src/lib/000077500000000000000000000000001471504326700200775ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/mips/src/lib/cpu.c000066400000000000000000000013121471504326700210270ustar00rootroot00000000000000#include #include #include "compel-cpu.h" #include "common/bitops.h" #include "common/compiler.h" #include "log.h" #undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_info; static bool rt_info_done = false; void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { } void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { } int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { return 0; } int compel_cpuid(compel_cpuinfo_t *c) { return 0; } bool compel_cpu_has_feature(unsigned int feature) { if (!rt_info_done) { compel_cpuid(&rt_info); rt_info_done = true; } return compel_test_cpu_cap(&rt_info, feature); } crac-criu-1.5.0/compel/arch/mips/src/lib/handle-elf-host.c000077700000000000000000000000001471504326700253532handle-elf.custar00rootroot00000000000000crac-criu-1.5.0/compel/arch/mips/src/lib/handle-elf.c000066400000000000000000000010531471504326700222410ustar00rootroot00000000000000#include #include #include "handle-elf.h" #include "piegen.h" #include "log.h" static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; extern int __handle_elf(void *mem, size_t size); int 
handle_binary(void *mem, size_t size) { if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) return __handle_elf(mem, size); pr_err("Unsupported Elf format detected\n"); return -EINVAL; } crac-criu-1.5.0/compel/arch/mips/src/lib/include/000077500000000000000000000000001471504326700215225ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/mips/src/lib/include/handle-elf.h000066400000000000000000000002771471504326700237000ustar00rootroot00000000000000#ifndef COMPEL_HANDLE_ELF_H__ #define COMPEL_HANDLE_ELF_H__ #include "elf64-types.h" #define arch_is_machine_supported(e_machine) (e_machine == EM_MIPS) #endif /* COMPEL_HANDLE_ELF_H__ */ crac-criu-1.5.0/compel/arch/mips/src/lib/include/ldsodefs.h000066400000000000000000000132671471504326700235070ustar00rootroot00000000000000/* * Run-time dynamic linker data structures for loaded ELF shared objects. * Copyright (C) 2000-2014 Free Software Foundation, Inc. * This file is part of the GNU C Library. * * The GNU C Library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * The GNU C Library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with the GNU C Library. If not, see * . 
*/ #ifndef _MIPS_LDSODEFS_H #define _MIPS_LDSODEFS_H 1 #include struct La_mips_32_regs; struct La_mips_32_retval; struct La_mips_64_regs; struct La_mips_64_retval; #define ARCH_PLTENTER_MEMBERS \ Elf32_Addr (*mips_o32_gnu_pltenter)(Elf32_Sym *, unsigned int, uintptr_t *, uintptr_t *, \ struct La_mips_32_regs *, unsigned int *, const char *name, \ long int *framesizep); \ Elf32_Addr (*mips_n32_gnu_pltenter)(Elf32_Sym *, unsigned int, uintptr_t *, uintptr_t *, \ struct La_mips_64_regs *, unsigned int *, const char *name, \ long int *framesizep); \ Elf64_Addr (*mips_n64_gnu_pltenter)(Elf64_Sym *, unsigned int, uintptr_t *, uintptr_t *, \ struct La_mips_64_regs *, unsigned int *, const char *name, \ long int *framesizep); #define ARCH_PLTEXIT_MEMBERS \ unsigned int (*mips_o32_gnu_pltexit)(Elf32_Sym *, unsigned int, uintptr_t *, uintptr_t *, \ const struct La_mips_32_regs *, struct La_mips_32_retval *, \ const char *); \ unsigned int (*mips_n32_gnu_pltexit)(Elf32_Sym *, unsigned int, uintptr_t *, uintptr_t *, \ const struct La_mips_64_regs *, struct La_mips_64_retval *, \ const char *); \ unsigned int (*mips_n64_gnu_pltexit)(Elf64_Sym *, unsigned int, uintptr_t *, uintptr_t *, \ const struct La_mips_64_regs *, struct La_mips_64_retval *, \ const char *); /* The MIPS ABI specifies that the dynamic section has to be read-only. */ /* * The 64-bit MIPS ELF ABI uses an unusual reloc format. Each * relocation entry specifies up to three actual relocations, all at * the same address. The first relocation which required a symbol * uses the symbol in the r_sym field. The second relocation which * requires a symbol uses the symbol in the r_ssym field. If all * three relocations require a symbol, the third one uses a zero * value. * * We define these structures in internal headers because we're not * sure we want to make them part of the ABI yet. Eventually, some of * this may move into elf/elf.h. */ /* An entry in a 64 bit SHT_REL section. 
*/ typedef struct { Elf32_Word r_sym; /* Symbol index */ unsigned char r_ssym; /* Special symbol for 2nd relocation */ unsigned char r_type3; /* 3rd relocation type */ unsigned char r_type2; /* 2nd relocation type */ unsigned char r_type1; /* 1st relocation type */ } _Elf64_Mips_R_Info; typedef union { Elf64_Xword r_info_number; _Elf64_Mips_R_Info r_info_fields; } _Elf64_Mips_R_Info_union; typedef struct { Elf64_Addr r_offset; /* Address */ _Elf64_Mips_R_Info_union r_info; /* Relocation type and symbol index */ } Elf64_Mips_Rel; typedef struct { Elf64_Addr r_offset; /* Address */ _Elf64_Mips_R_Info_union r_info; /* Relocation type and symbol index */ Elf64_Sxword r_addend; /* Addend */ } Elf64_Mips_Rela; #define ELF64_MIPS_R_SYM(i) ((__extension__(_Elf64_Mips_R_Info_union)(i)).r_info_fields.r_sym) #define ELF64_MIPS_R_TYPE(i) \ (((_Elf64_Mips_R_Info_union)(i)).r_info_fields.r_type1 | \ ((Elf32_Word)(__extension__(_Elf64_Mips_R_Info_union)(i)).r_info_fields.r_type2 << 8) | \ ((Elf32_Word)(__extension__(_Elf64_Mips_R_Info_union)(i)).r_info_fields.r_type3 << 16) | \ ((Elf32_Word)(__extension__(_Elf64_Mips_R_Info_union)(i)).r_info_fields.r_ssym << 24)) #define ELF64_MIPS_R_INFO(sym, type) \ (__extension__(_Elf64_Mips_R_Info_union)( \ __extension__(_Elf64_Mips_R_Info){ (sym), ELF64_MIPS_R_SSYM(type), ELF64_MIPS_R_TYPE3(type), \ ELF64_MIPS_R_TYPE2(type), ELF64_MIPS_R_TYPE1(type) }) \ .r_info_number) /* * These macros decompose the value returned by ELF64_MIPS_R_TYPE, and * compose it back into a value that it can be used as an argument to * ELF64_MIPS_R_INFO. 
*/ #define ELF64_MIPS_R_SSYM(i) (((i) >> 24) & 0xff) #define ELF64_MIPS_R_TYPE3(i) (((i) >> 16) & 0xff) #define ELF64_MIPS_R_TYPE2(i) (((i) >> 8) & 0xff) #define ELF64_MIPS_R_TYPE1(i) ((i)&0xff) #define ELF64_MIPS_R_TYPEENC(type1, type2, type3, ssym) \ ((type1) | ((Elf32_Word)(type2) << 8) | ((Elf32_Word)(type3) << 16) | ((Elf32_Word)(ssym) << 24)) #undef ELF64_R_SYM #define ELF64_R_SYM(i) ELF64_MIPS_R_SYM(i) #undef ELF64_R_TYPE /*fixme*/ #define ELF64_R_TYPE(i) (ELF64_MIPS_R_TYPE(i) & 0x00ff) #undef ELF64_R_INFO #define ELF64_R_INFO(sym, type) ELF64_MIPS_R_INFO((sym), (type)) #endif crac-criu-1.5.0/compel/arch/mips/src/lib/include/syscall.h000066400000000000000000000001601471504326700233420ustar00rootroot00000000000000#ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ #ifndef SIGSTKFLT #define SIGSTKFLT 16 #endif #endif crac-criu-1.5.0/compel/arch/mips/src/lib/include/uapi/000077500000000000000000000000001471504326700224605ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/mips/src/lib/include/uapi/asm/000077500000000000000000000000001471504326700232405ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/mips/src/lib/include/uapi/asm/breakpoints.h000066400000000000000000000003221471504326700257270ustar00rootroot00000000000000#ifndef __COMPEL_BREAKPOINTS_H__ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT extern int ptrace_set_breakpoint(pid_t pid, void *addr); extern int ptrace_flush_breakpoints(pid_t pid); #endif crac-criu-1.5.0/compel/arch/mips/src/lib/include/uapi/asm/cpu.h000066400000000000000000000001661471504326700242030ustar00rootroot00000000000000#ifndef __CR_ASM_CPU_H__ #define __CR_ASM_CPU_H__ typedef struct { } compel_cpuinfo_t; #endif /* __CR_ASM_CPU_H__ */ crac-criu-1.5.0/compel/arch/mips/src/lib/include/uapi/asm/fpu.h000066400000000000000000000001211471504326700241750ustar00rootroot00000000000000#ifndef __CR_ASM_FPU_H__ #define __CR_ASM_FPU_H__ #endif /* __CR_ASM_FPU_H__ */ 
crac-criu-1.5.0/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h000066400000000000000000000031741471504326700260300ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_TYPES_H__ #define UAPI_COMPEL_ASM_TYPES_H__ #include #include #include #include #include #define SIGMAX 64 #define SIGMAX_OLD 31 /* * Copied from the Linux kernel header arch/mips/include/asm/ptrace.h * * A thread MIPS CPU context */ typedef struct { /* Saved main processor registers. */ __u64 regs[32]; /* Saved special registers. */ __u64 lo; __u64 hi; __u64 cp0_epc; __u64 cp0_badvaddr; __u64 cp0_status; __u64 cp0_cause; } user_regs_struct_t; /* from linux-3.10/arch/mips/kernel/ptrace.c */ typedef struct { /* Saved fpu registers. */ __u64 regs[32]; __u32 fpu_fcr31; __u32 fpu_id; } user_fpregs_struct_t; #define MIPS_a0 regs[4] //arguments a0-a3 #define MIPS_t0 regs[8] //temporaries t0-t7 #define MIPS_v0 regs[2] #define MIPS_v1 regs[3] #define MIPS_sp regs[29] #define MIPS_ra regs[31] #define NATIVE_MAGIC 0x0A #define COMPAT_MAGIC 0x0C static inline bool user_regs_native(user_regs_struct_t *pregs) { return true; } #define __compel_arch_fetch_thread_area(tid, th) 0 #define compel_arch_fetch_thread_area(tctl) 0 #define compel_arch_get_tls_task(ctl, tls) #define compel_arch_get_tls_thread(tctl, tls) #define REG_RES(regs) ((regs).MIPS_v0) #define REG_IP(regs) ((regs).cp0_epc) #define SET_REG_IP(regs, val) ((regs).cp0_epc = (val)) #define REG_SP(regs) ((regs).MIPS_sp) #define REG_SYSCALL_NR(regs) ((regs).MIPS_v0) //#define __NR(syscall, compat) ((compat) ? 
__NR32_##syscall : __NR_##syscall) #define __NR(syscall, compat) __NR_##syscall #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ crac-criu-1.5.0/compel/arch/mips/src/lib/include/uapi/asm/sigframe.h000066400000000000000000000033571471504326700252160ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ #include #include #include #include #include #define u32 __u32 /* sigcontext defined in /usr/include/asm/sigcontext.h*/ #define rt_sigcontext sigcontext #include /* refer to linux-3.10/include/uapi/asm-generic/ucontext.h */ struct k_ucontext { unsigned long uc_flags; struct k_ucontext *uc_link; stack_t uc_stack; struct sigcontext uc_mcontext; k_rtsigset_t uc_sigmask; }; /* Copy from the kernel source arch/mips/kernel/signal.c */ struct rt_sigframe { u32 rs_ass[4]; /* argument save space for o32 */ u32 rs_pad[2]; /* Was: signal trampoline */ siginfo_t rs_info; struct k_ucontext rs_uc; }; #define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->rs_uc) #define RT_SIGFRAME_UC_SIGMASK(rt_sigframe) ((k_rtsigset_t *)(void *)&rt_sigframe->rs_uc.uc_sigmask) #define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)0x00) #define RT_SIGFRAME_FPU(rt_sigframe) #define RT_SIGFRAME_HAS_FPU(rt_sigframe) 1 #define RT_SIGFRAME_OFFSET(rt_sigframe) 0 /* clang-format off */ #define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ asm volatile( \ "move $29, %0 \n" \ "li $2, "__stringify(__NR_rt_sigreturn)" \n" \ "syscall \n" \ : \ : "r"(new_sp) \ : "$2","memory") /* clang-format on */ int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe); #define rt_sigframe_erase_sigset(sigframe) memset(&sigframe->rs_uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) #define rt_sigframe_copy_sigset(sigframe, from) memcpy(&sigframe->rs_uc.uc_sigmask, from, sizeof(k_rtsigset_t)) #endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ 
crac-criu-1.5.0/compel/arch/mips/src/lib/include/uapi/asm/siginfo.h000066400000000000000000000054751471504326700250620ustar00rootroot00000000000000/* * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive * for more details. * * Copyright (C) 1998, 1999, 2001, 2003 Ralf Baechle * Copyright (C) 2000, 2001 Silicon Graphics, Inc. */ #ifndef _UAPI_ASM_SIGINFO_H #define _UAPI_ASM_SIGINFO_H #define __ARCH_SIGEV_PREAMBLE_SIZE (sizeof(long) + 2 * sizeof(int)) #undef __ARCH_SI_TRAPNO /* exception code needs to fill this ... */ #define HAVE_ARCH_SIGINFO_T /* * Careful to keep union _sifields from shifting ... */ #define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int)) #define __ARCH_SIGSYS #define SI_MAX_SIZE 128 #define SI_PAD_SIZE ((SI_MAX_SIZE - __ARCH_SI_PREAMBLE_SIZE) / sizeof(int)) #define __ARCH_SI_UID_T __kernel_uid32_t #ifndef __ARCH_SI_UID_T #define __ARCH_SI_UID_T __kernel_uid32_t #endif #ifndef __ARCH_SI_BAND_T #define __ARCH_SI_BAND_T long #endif #ifndef __ARCH_SI_CLOCK_T #define __ARCH_SI_CLOCK_T __kernel_clock_t #endif #ifndef __ARCH_SI_ATTRIBUTES #define __ARCH_SI_ATTRIBUTES #endif typedef struct siginfo { int si_signo; int si_errno; int si_code; union { int _pad[SI_PAD_SIZE]; /* kill() */ struct { __kernel_pid_t _pid; /* sender's pid */ __ARCH_SI_UID_T _uid; /* sender's uid */ } _kill; /* POSIX.1b timers */ struct { __kernel_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ char _pad[sizeof(__ARCH_SI_UID_T) - sizeof(int)]; sigval_t _sigval; /* same as below */ int _sys_private; /* not to be passed to user */ } _timer; /* POSIX.1b signals */ struct { __kernel_pid_t _pid; /* sender's pid */ __ARCH_SI_UID_T _uid; /* sender's uid */ sigval_t _sigval; } _rt; /* SIGCHLD */ struct { __kernel_pid_t _pid; /* which child */ __ARCH_SI_UID_T _uid; /* sender's uid */ int _status; /* exit code */ __ARCH_SI_CLOCK_T _utime; __ARCH_SI_CLOCK_T _stime; } _sigchld; /* SIGILL, 
SIGFPE, SIGSEGV, SIGBUS */ struct { void *_addr; /* faulting insn/memory ref. */ #ifdef __ARCH_SI_TRAPNO int _trapno; /* TRAP # which caused the signal */ #endif short _addr_lsb; /* LSB of the reported address */ #ifndef __GENKSYMS__ struct { void *_lower; void *_upper; } _addr_bnd; #endif } _sigfault; /* SIGPOLL */ struct { __ARCH_SI_BAND_T _band; /* POLL_IN, POLL_OUT, POLL_MSG */ int _fd; } _sigpoll; /* SIGSYS */ struct { void *_call_addr; /* calling user insn */ int _syscall; /* triggering system call number */ unsigned int _arch; /* AUDIT_ARCH_* of syscall */ } _sigsys; } _sifields; } __ARCH_SI_ATTRIBUTES siginfo_t; /* * si_code values * Again these have been chosen to be IRIX compatible. */ #undef SI_ASYNCIO #undef SI_TIMER #undef SI_MESGQ #define SI_ASYNCIO -2 /* sent by AIO completion */ #endif /* _UAPI_ASM_SIGINFO_H */ crac-criu-1.5.0/compel/arch/mips/src/lib/infect.c000066400000000000000000000244601471504326700215210ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "errno.h" #include #include #include "common/err.h" #include "common/page.h" #include "asm/infect-types.h" #include "ptrace.h" #include "infect.h" #include "infect-priv.h" #include "log.h" #include "common/bug.h" /* * Injected syscall instruction * mips64el is Little Endian */ const char code_syscall[] = { 0x0c, 0x00, 0x00, 0x00, /* syscall */ 0x0d, 0x00, 0x00, 0x00 /* break */ }; /* 10-byte legacy floating point register */ struct fpreg { uint16_t significand[4]; uint16_t exponent; }; /* 16-byte floating point register */ struct fpxreg { uint16_t significand[4]; uint16_t exponent; uint16_t padding[3]; }; int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { sigframe->rs_uc.uc_mcontext.sc_regs[0] = regs->regs[0]; sigframe->rs_uc.uc_mcontext.sc_regs[1] = regs->regs[1]; sigframe->rs_uc.uc_mcontext.sc_regs[2] = regs->regs[2]; sigframe->rs_uc.uc_mcontext.sc_regs[3] = regs->regs[3]; 
sigframe->rs_uc.uc_mcontext.sc_regs[4] = regs->regs[4]; sigframe->rs_uc.uc_mcontext.sc_regs[5] = regs->regs[5]; sigframe->rs_uc.uc_mcontext.sc_regs[6] = regs->regs[6]; sigframe->rs_uc.uc_mcontext.sc_regs[7] = regs->regs[7]; sigframe->rs_uc.uc_mcontext.sc_regs[8] = regs->regs[8]; sigframe->rs_uc.uc_mcontext.sc_regs[9] = regs->regs[9]; sigframe->rs_uc.uc_mcontext.sc_regs[10] = regs->regs[10]; sigframe->rs_uc.uc_mcontext.sc_regs[11] = regs->regs[11]; sigframe->rs_uc.uc_mcontext.sc_regs[12] = regs->regs[12]; sigframe->rs_uc.uc_mcontext.sc_regs[13] = regs->regs[13]; sigframe->rs_uc.uc_mcontext.sc_regs[14] = regs->regs[14]; sigframe->rs_uc.uc_mcontext.sc_regs[15] = regs->regs[15]; sigframe->rs_uc.uc_mcontext.sc_regs[16] = regs->regs[16]; sigframe->rs_uc.uc_mcontext.sc_regs[17] = regs->regs[17]; sigframe->rs_uc.uc_mcontext.sc_regs[18] = regs->regs[18]; sigframe->rs_uc.uc_mcontext.sc_regs[19] = regs->regs[19]; sigframe->rs_uc.uc_mcontext.sc_regs[20] = regs->regs[20]; sigframe->rs_uc.uc_mcontext.sc_regs[21] = regs->regs[21]; sigframe->rs_uc.uc_mcontext.sc_regs[22] = regs->regs[22]; sigframe->rs_uc.uc_mcontext.sc_regs[23] = regs->regs[23]; sigframe->rs_uc.uc_mcontext.sc_regs[24] = regs->regs[24]; sigframe->rs_uc.uc_mcontext.sc_regs[25] = regs->regs[25]; sigframe->rs_uc.uc_mcontext.sc_regs[26] = regs->regs[26]; sigframe->rs_uc.uc_mcontext.sc_regs[27] = regs->regs[27]; sigframe->rs_uc.uc_mcontext.sc_regs[28] = regs->regs[28]; sigframe->rs_uc.uc_mcontext.sc_regs[29] = regs->regs[29]; sigframe->rs_uc.uc_mcontext.sc_regs[30] = regs->regs[30]; sigframe->rs_uc.uc_mcontext.sc_regs[31] = regs->regs[31]; sigframe->rs_uc.uc_mcontext.sc_mdlo = regs->lo; sigframe->rs_uc.uc_mcontext.sc_mdhi = regs->hi; sigframe->rs_uc.uc_mcontext.sc_pc = regs->cp0_epc; sigframe->rs_uc.uc_mcontext.sc_fpregs[0] = fpregs->regs[0]; sigframe->rs_uc.uc_mcontext.sc_fpregs[1] = fpregs->regs[1]; sigframe->rs_uc.uc_mcontext.sc_fpregs[2] = fpregs->regs[2]; sigframe->rs_uc.uc_mcontext.sc_fpregs[3] = fpregs->regs[3]; 
sigframe->rs_uc.uc_mcontext.sc_fpregs[4] = fpregs->regs[4]; sigframe->rs_uc.uc_mcontext.sc_fpregs[5] = fpregs->regs[5]; sigframe->rs_uc.uc_mcontext.sc_fpregs[6] = fpregs->regs[6]; sigframe->rs_uc.uc_mcontext.sc_fpregs[7] = fpregs->regs[7]; sigframe->rs_uc.uc_mcontext.sc_fpregs[8] = fpregs->regs[8]; sigframe->rs_uc.uc_mcontext.sc_fpregs[9] = fpregs->regs[9]; sigframe->rs_uc.uc_mcontext.sc_fpregs[10] = fpregs->regs[10]; sigframe->rs_uc.uc_mcontext.sc_fpregs[11] = fpregs->regs[11]; sigframe->rs_uc.uc_mcontext.sc_fpregs[12] = fpregs->regs[12]; sigframe->rs_uc.uc_mcontext.sc_fpregs[13] = fpregs->regs[13]; sigframe->rs_uc.uc_mcontext.sc_fpregs[14] = fpregs->regs[14]; sigframe->rs_uc.uc_mcontext.sc_fpregs[15] = fpregs->regs[15]; sigframe->rs_uc.uc_mcontext.sc_fpregs[16] = fpregs->regs[16]; sigframe->rs_uc.uc_mcontext.sc_fpregs[17] = fpregs->regs[17]; sigframe->rs_uc.uc_mcontext.sc_fpregs[18] = fpregs->regs[18]; sigframe->rs_uc.uc_mcontext.sc_fpregs[19] = fpregs->regs[19]; sigframe->rs_uc.uc_mcontext.sc_fpregs[20] = fpregs->regs[20]; sigframe->rs_uc.uc_mcontext.sc_fpregs[21] = fpregs->regs[21]; sigframe->rs_uc.uc_mcontext.sc_fpregs[22] = fpregs->regs[22]; sigframe->rs_uc.uc_mcontext.sc_fpregs[23] = fpregs->regs[23]; sigframe->rs_uc.uc_mcontext.sc_fpregs[24] = fpregs->regs[24]; sigframe->rs_uc.uc_mcontext.sc_fpregs[25] = fpregs->regs[25]; sigframe->rs_uc.uc_mcontext.sc_fpregs[26] = fpregs->regs[26]; sigframe->rs_uc.uc_mcontext.sc_fpregs[27] = fpregs->regs[27]; sigframe->rs_uc.uc_mcontext.sc_fpregs[28] = fpregs->regs[28]; sigframe->rs_uc.uc_mcontext.sc_fpregs[29] = fpregs->regs[29]; sigframe->rs_uc.uc_mcontext.sc_fpregs[30] = fpregs->regs[30]; sigframe->rs_uc.uc_mcontext.sc_fpregs[31] = fpregs->regs[31]; return 0; } int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, __maybe_unused unsigned long 
flags) { user_fpregs_struct_t xsave = {}, *xs = ext_regs ? ext_regs : &xsave; int ret = -1; pr_info("Dumping GP/FPU registers for %d\n", pid); if (ptrace(PTRACE_GETFPREGS, pid, NULL, xs)) { pr_perror("Can't obtain FPU registers for %d", pid); return ret; } /*Restart the system call*/ if (regs->regs[0]) { switch ((long)(int)regs->regs[2]) { case ERESTARTNOHAND: case ERESTARTSYS: case ERESTARTNOINTR: regs->regs[2] = regs->regs[0]; regs->regs[7] = regs->regs[26]; regs->cp0_epc -= 4; break; case ERESTART_RESTARTBLOCK: pr_warn("Will restore %d with interrupted system call\n", pid); regs->regs[2] = -EINTR; break; } regs->regs[0] = 0; } ret = save(arg, regs, xs); return ret; } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) { pr_info("Restoring GP/FPU registers for %d\n", pid); if (ptrace(PTRACE_SETFPREGS, pid, NULL, ext_regs)) { pr_perror("Can't set FPU registers for %d", pid); return -1; } return 0; } int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { /*refer to glibc-2.20/sysdeps/unix/sysv/linux/mips/mips64/syscall.S*/ user_regs_struct_t regs = ctl->orig.regs; int err; regs.regs[2] = (unsigned long)nr; //syscall_number will be in v0 regs.regs[4] = arg1; regs.regs[5] = arg2; regs.regs[6] = arg3; regs.regs[7] = arg4; regs.regs[8] = arg5; regs.regs[9] = arg6; err = compel_execute_syscall(ctl, ®s, code_syscall); *ret = regs.regs[2]; return err; } void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { long map; int err; err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset >> PAGE_SHIFT); if (err < 0 || IS_ERR_VALUE(map)) { pr_err("remote mmap() failed: %s\n", strerror(-map)); return NULL; } return (void *)map; } /* * regs must be inited when calling this function from original context */ void 
parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { regs->cp0_epc = new_ip; if (stack) { /* regs[29] is sp */ regs->regs[29] = (unsigned long)stack; } } bool arch_can_dump_task(struct parasite_ctl *ctl) { return true; } int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) { long ret; int err; err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->rs_uc.uc_stack, 0, 0, 0, 0); return err ? err : ret; } int ptrace_set_breakpoint(pid_t pid, void *addr) { return 0; } int ptrace_flush_breakpoints(pid_t pid) { return 0; } /*refer to kernel linux-3.10/arch/mips/include/asm/processor.h*/ #define TASK_SIZE32 0x7fff8000UL #define TASK_SIZE64 0x10000000000UL #define TASK_SIZE TASK_SIZE64 unsigned long compel_task_size(void) { return TASK_SIZE; } /* * Get task registers (overwrites weak function) * */ int ptrace_get_regs(int pid, user_regs_struct_t *regs) { return ptrace(PTRACE_GETREGS, pid, NULL, regs); } /* * Set task registers (overwrites weak function) */ int ptrace_set_regs(int pid, user_regs_struct_t *regs) { return ptrace(PTRACE_SETREGS, pid, NULL, regs); } void compel_relocs_apply_mips(void *mem, void *vbase, struct parasite_blob_desc *pbd) { compel_reloc_t *elf_relocs = pbd->hdr.relocs; size_t nr_relocs = pbd->hdr.nr_relocs; size_t i, j; /* * mips rebasing :load time relocation * parasite.built-in.o and restorer.built-in.o is ELF 64-bit LSB relocatable for mips. * so we have to relocate some type for R_MIPS_26 R_MIPS_HIGHEST R_MIPS_HIGHER R_MIPS_HI16 and R_MIPS_LO16 in there. 
* for mips64el .if toload/store data or jump instruct ,need to relocation R_TYPE */ for (i = 0, j = 0; i < nr_relocs; i++) { if (elf_relocs[i].type & COMPEL_TYPE_MIPS_26) { int *where = (mem + elf_relocs[i].offset); *where = *where | ((elf_relocs[i].addend + ((unsigned long)vbase & 0x00fffffff) /*low 28 bit*/) >> 2); } else if (elf_relocs[i].type & COMPEL_TYPE_MIPS_64) { unsigned long *where = (mem + elf_relocs[i].offset); *where = elf_relocs[i].addend + (unsigned long)vbase; } else if (elf_relocs[i].type & COMPEL_TYPE_MIPS_HI16) { /* refer to binutils mips.cc */ int *where = (mem + elf_relocs[i].offset); int v_lo16 = (unsigned long)vbase & 0x00ffff; if ((v_lo16 + elf_relocs[i].value + elf_relocs[i].addend) >= 0x8000) { *where = *where | ((((unsigned long)vbase >> 16) & 0xffff) + 0x1); } else { *where = *where | ((((unsigned long)vbase >> 16) & 0xffff)); } } else if (elf_relocs[i].type & COMPEL_TYPE_MIPS_LO16) { int *where = (mem + elf_relocs[i].offset); int v_lo16 = (unsigned long)vbase & 0x00ffff; *where = *where | ((v_lo16 + elf_relocs[i].addend) & 0xffff); } else if (elf_relocs[i].type & COMPEL_TYPE_MIPS_HIGHER) { int *where = (mem + elf_relocs[i].offset); *where = *where | ((((unsigned long)vbase + (uint64_t)0x80008000) >> 32) & 0xffff); } else if (elf_relocs[i].type & COMPEL_TYPE_MIPS_HIGHEST) { int *where = (mem + elf_relocs[i].offset); *where = *where | ((((unsigned long)vbase + (uint64_t)0x800080008000llu) >> 48) & 0xffff); } else { BUG(); } } } 
crac-criu-1.5.0/compel/arch/ppc64/000077500000000000000000000000001471504326700165265ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/ppc64/plugins/000077500000000000000000000000001471504326700202075ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/ppc64/plugins/include/000077500000000000000000000000001471504326700216325ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/ppc64/plugins/include/asm/000077500000000000000000000000001471504326700224125ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/ppc64/plugins/include/asm/prologue.h000077700000000000000000000000001471504326700351342../../../../../arch/x86/plugins/include/asm/prologue.hustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/ppc64/plugins/include/asm/syscall-types.h000066400000000000000000000011711471504326700253770ustar00rootroot00000000000000#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ #define COMPEL_ARCH_SYSCALL_TYPES_H__ #define SA_RESTORER 0x04000000U typedef void rt_signalfn_t(int, siginfo_t *, void *); typedef rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; #define _KNSIG 64 #define _NSIG_BPW 64 #define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) typedef struct { unsigned long sig[_KNSIG_WORDS]; } k_rtsigset_t; typedef struct { rt_sighandler_t rt_sa_handler; unsigned long rt_sa_flags; rt_sigrestore_t rt_sa_restorer; k_rtsigset_t rt_sa_mask; } rt_sigaction_t; #endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ crac-criu-1.5.0/compel/arch/ppc64/plugins/include/features.h000066400000000000000000000002321471504326700236160ustar00rootroot00000000000000#ifndef __COMPEL_ARCH_FEATURES_H #define __COMPEL_ARCH_FEATURES_H #define ARCH_HAS_MEMCPY #define ARCH_HAS_MEMCMP #endif /* __COMPEL_ARCH_FEATURES_H */ 
crac-criu-1.5.0/compel/arch/ppc64/plugins/std/000077500000000000000000000000001471504326700210015ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/ppc64/plugins/std/memcmp.S000066400000000000000000000056561471504326700224170ustar00rootroot00000000000000/* * Author: Anton Blanchard * Copyright 2015 IBM Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * -- * Copied form the linux file arch/powerpc/lib/memcmp_64.S */ #include "common/asm/linkage.h" #define off8 r6 #define off16 r7 #define off24 r8 #define rA r9 #define rB r10 #define rC r11 #define rD r27 #define rE r28 #define rF r29 #define rG r30 #define rH r31 #ifdef __LITTLE_ENDIAN__ #define LD ldbrx #else #define LD ldx #endif ENTRY(memcmp) cmpdi cr1,r5,0 /* Use the short loop if both strings are not 8B aligned */ or r6,r3,r4 andi. r6,r6,7 /* Use the short loop if length is less than 32B */ cmpdi cr6,r5,31 beq cr1,.Lzero bne .Lshort bgt cr6,.Llong .Lshort: mtctr r5 1: lbz rA,0(r3) lbz rB,0(r4) subf. rC,rB,rA bne .Lnon_zero bdz .Lzero lbz rA,1(r3) lbz rB,1(r4) subf. rC,rB,rA bne .Lnon_zero bdz .Lzero lbz rA,2(r3) lbz rB,2(r4) subf. rC,rB,rA bne .Lnon_zero bdz .Lzero lbz rA,3(r3) lbz rB,3(r4) subf. rC,rB,rA bne .Lnon_zero addi r3,r3,4 addi r4,r4,4 bdnz 1b .Lzero: li r3,0 blr .Lnon_zero: mr r3,rC blr .Llong: li off8,8 li off16,16 li off24,24 std r31,-8(r1) std r30,-16(r1) std r29,-24(r1) std r28,-32(r1) std r27,-40(r1) srdi r0,r5,5 mtctr r0 andi. 
r5,r5,31 LD rA,0,r3 LD rB,0,r4 LD rC,off8,r3 LD rD,off8,r4 LD rE,off16,r3 LD rF,off16,r4 LD rG,off24,r3 LD rH,off24,r4 cmpld cr0,rA,rB addi r3,r3,32 addi r4,r4,32 bdz .Lfirst32 LD rA,0,r3 LD rB,0,r4 cmpld cr1,rC,rD LD rC,off8,r3 LD rD,off8,r4 cmpld cr6,rE,rF LD rE,off16,r3 LD rF,off16,r4 cmpld cr7,rG,rH bne cr0,.LcmpAB LD rG,off24,r3 LD rH,off24,r4 cmpld cr0,rA,rB bne cr1,.LcmpCD addi r3,r3,32 addi r4,r4,32 bdz .Lsecond32 .balign 16 1: LD rA,0,r3 LD rB,0,r4 cmpld cr1,rC,rD bne cr6,.LcmpEF LD rC,off8,r3 LD rD,off8,r4 cmpld cr6,rE,rF bne cr7,.LcmpGH LD rE,off16,r3 LD rF,off16,r4 cmpld cr7,rG,rH bne cr0,.LcmpAB LD rG,off24,r3 LD rH,off24,r4 cmpld cr0,rA,rB bne cr1,.LcmpCD addi r3,r3,32 addi r4,r4,32 bdnz 1b .Lsecond32: cmpld cr1,rC,rD bne cr6,.LcmpEF cmpld cr6,rE,rF bne cr7,.LcmpGH cmpld cr7,rG,rH bne cr0,.LcmpAB bne cr1,.LcmpCD bne cr6,.LcmpEF bne cr7,.LcmpGH .Ltail: ld r31,-8(r1) ld r30,-16(r1) ld r29,-24(r1) ld r28,-32(r1) ld r27,-40(r1) cmpdi r5,0 beq .Lzero b .Lshort .Lfirst32: cmpld cr1,rC,rD cmpld cr6,rE,rF cmpld cr7,rG,rH bne cr0,.LcmpAB bne cr1,.LcmpCD bne cr6,.LcmpEF bne cr7,.LcmpGH b .Ltail .LcmpAB: li r3,1 bgt cr0,.Lout li r3,-1 b .Lout .LcmpCD: li r3,1 bgt cr1,.Lout li r3,-1 b .Lout .LcmpEF: li r3,1 bgt cr6,.Lout li r3,-1 b .Lout .LcmpGH: li r3,1 bgt cr7,.Lout li r3,-1 .Lout: ld r31,-8(r1) ld r30,-16(r1) ld r29,-24(r1) ld r28,-32(r1) ld r27,-40(r1) blr crac-criu-1.5.0/compel/arch/ppc64/plugins/std/memcpy.S000066400000000000000000000076061471504326700224300ustar00rootroot00000000000000/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * Copyright (C) IBM Corporation, 2012 * * Author: Anton Blanchard * * -- * Copied from the kernel file arch/powerpc/lib/memcpy_power7.S * Altivec support has been removed so we don't taint restored process. */ #include "common/asm/linkage.h" /* * When building the parasite code, the compiler may rely on the C library * service memcpy to initialise big local variable in the stack. */ ENTRY(memcpy) cmpldi r5,16 std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) blt .Lshort_copy .Lnonvmx_copy: /* Get the source 8B aligned */ neg r6,r4 mtocrf 0x01,r6 clrldi r6,r6,(64-3) bf cr7*4+3,1f lbz r0,0(r4) addi r4,r4,1 stb r0,0(r3) addi r3,r3,1 1: bf cr7*4+2,2f lhz r0,0(r4) addi r4,r4,2 sth r0,0(r3) addi r3,r3,2 2: bf cr7*4+1,3f lwz r0,0(r4) addi r4,r4,4 stw r0,0(r3) addi r3,r3,4 3: sub r5,r5,r6 cmpldi r5,128 blt 5f mflr r0 stdu r1,-STACKFRAMESIZE(r1) std r14,STK_REG(R14)(r1) std r15,STK_REG(R15)(r1) std r16,STK_REG(R16)(r1) std r17,STK_REG(R17)(r1) std r18,STK_REG(R18)(r1) std r19,STK_REG(R19)(r1) std r20,STK_REG(R20)(r1) std r21,STK_REG(R21)(r1) std r22,STK_REG(R22)(r1) std r0,STACKFRAMESIZE+16(r1) srdi r6,r5,7 mtctr r6 /* Now do cacheline (128B) sized loads and stores. 
*/ .align 5 4: ld r0,0(r4) ld r6,8(r4) ld r7,16(r4) ld r8,24(r4) ld r9,32(r4) ld r10,40(r4) ld r11,48(r4) ld r12,56(r4) ld r14,64(r4) ld r15,72(r4) ld r16,80(r4) ld r17,88(r4) ld r18,96(r4) ld r19,104(r4) ld r20,112(r4) ld r21,120(r4) addi r4,r4,128 std r0,0(r3) std r6,8(r3) std r7,16(r3) std r8,24(r3) std r9,32(r3) std r10,40(r3) std r11,48(r3) std r12,56(r3) std r14,64(r3) std r15,72(r3) std r16,80(r3) std r17,88(r3) std r18,96(r3) std r19,104(r3) std r20,112(r3) std r21,120(r3) addi r3,r3,128 bdnz 4b clrldi r5,r5,(64-7) ld r14,STK_REG(R14)(r1) ld r15,STK_REG(R15)(r1) ld r16,STK_REG(R16)(r1) ld r17,STK_REG(R17)(r1) ld r18,STK_REG(R18)(r1) ld r19,STK_REG(R19)(r1) ld r20,STK_REG(R20)(r1) ld r21,STK_REG(R21)(r1) ld r22,STK_REG(R22)(r1) addi r1,r1,STACKFRAMESIZE /* Up to 127B to go */ 5: srdi r6,r5,4 mtocrf 0x01,r6 6: bf cr7*4+1,7f ld r0,0(r4) ld r6,8(r4) ld r7,16(r4) ld r8,24(r4) ld r9,32(r4) ld r10,40(r4) ld r11,48(r4) ld r12,56(r4) addi r4,r4,64 std r0,0(r3) std r6,8(r3) std r7,16(r3) std r8,24(r3) std r9,32(r3) std r10,40(r3) std r11,48(r3) std r12,56(r3) addi r3,r3,64 /* Up to 63B to go */ 7: bf cr7*4+2,8f ld r0,0(r4) ld r6,8(r4) ld r7,16(r4) ld r8,24(r4) addi r4,r4,32 std r0,0(r3) std r6,8(r3) std r7,16(r3) std r8,24(r3) addi r3,r3,32 /* Up to 31B to go */ 8: bf cr7*4+3,9f ld r0,0(r4) ld r6,8(r4) addi r4,r4,16 std r0,0(r3) std r6,8(r3) addi r3,r3,16 9: clrldi r5,r5,(64-4) /* Up to 15B to go */ .Lshort_copy: mtocrf 0x01,r5 bf cr7*4+0,12f lwz r0,0(r4) /* Less chance of a reject with word ops */ lwz r6,4(r4) addi r4,r4,8 stw r0,0(r3) stw r6,4(r3) addi r3,r3,8 12: bf cr7*4+1,13f lwz r0,0(r4) addi r4,r4,4 stw r0,0(r3) addi r3,r3,4 13: bf cr7*4+2,14f lhz r0,0(r4) addi r4,r4,2 sth r0,0(r3) addi r3,r3,2 14: bf cr7*4+3,15f lbz r0,0(r4) stb r0,0(r3) 15: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) blr .Lunwind_stack_nonvmx_copy: addi r1,r1,STACKFRAMESIZE b .Lnonvmx_copy 
crac-criu-1.5.0/compel/arch/ppc64/plugins/std/parasite-head.S000066400000000000000000000013311471504326700236320ustar00rootroot00000000000000#include "common/asm/linkage.h" .section .head.text .align 8 ENTRY(__export_parasite_head_start) bl 0f 0: mflr r2 #define LOAD_REG_ADDR(reg, name) \ addis reg,r2,(name - 0b)@ha; \ addi reg,r2,(name - 0b)@l; LOAD_REG_ADDR(r12,parasite_service_ptr) ld r12,0(r12) mtctr r12 bctrl // call parasite_service twi 31,0,0 // Should generate SIGTRAP parasite_service_ptr: // We want to run the function prototype to set r2. // Since the relocation will prefer the local entry // point, we force it to the global one which is 2 // instructions above the local one. // FIXME: There should be a way to specify the global entry here. .quad parasite_service - 8 END(__export_parasite_head_start) crac-criu-1.5.0/compel/arch/ppc64/plugins/std/syscalls/000077500000000000000000000000001471504326700226365ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/ppc64/plugins/std/syscalls/Makefile.syscalls000066400000000000000000000052101471504326700261300ustar00rootroot00000000000000ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ asflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ sys-types := $(obj)/include/uapi/std/syscall-types.h sys-codes := $(obj)/include/uapi/std/syscall-codes.h sys-proto := $(obj)/include/uapi/std/syscall.h sys-def := $(PLUGIN_ARCH_DIR)/std/syscalls/syscall-ppc64.tbl sys-asm-common-name := std/syscalls/syscall-common-ppc64.S sys-asm-common := $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl.c sys-asm := ./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls.S std-lib-y += $(sys-asm:.S=).o $(sys-codes): $(sys-def) $(E) " GEN " $@ $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) cat $< | awk '/^__NR/{SYSN=$$1; sub("^__NR", 
"SYS", SYSN);'\ 'print "\n#ifndef ", $$1, "\n#define", $$1, $$2, "\n#endif";'\ 'print "#ifndef ", SYSN, "\n#define ", SYSN, $$1, "\n#endif"}' >> $@ $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ $(sys-proto): $(sys-def) $(E) " GEN " $@ $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ $(Q) echo "#include " >> $@ $(Q) echo "#include " >> $@ $(Q) cat $< | awk '/^__NR/{print "extern long", $$3, substr($$0, index($$0,$$4)), ";"}' >> $@ $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ $(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(E) " GEN " $@ $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#include " >> $@ $(Q) echo "#include \"$(sys-asm-common-name)\"" >> $@ $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", $$3, ",", $$2, ")"}' >> $@ $(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(E) " GEN " $@ $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "static struct syscall_exec_desc sc_exec_table[] = {" >> $@ $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", substr($$3, 5), ",", $$2, ")"}' >> $@ $(Q) echo " { }, /* terminator */" >> $@ $(Q) echo "};" >> $@ $(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(call msg-gen, $@) $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) std-headers-deps += $(sys-asm) $(sys-codes) $(sys-proto) $(sys-asm-types) mrproper-y += $(std-headers-deps) crac-criu-1.5.0/compel/arch/ppc64/plugins/std/syscalls/syscall-common-ppc64.S000066400000000000000000000006641471504326700266620ustar00rootroot00000000000000#include "common/asm/linkage.h" #include /* for __NR_ipc */ #define SYSCALL(name, opcode) \ ENTRY(name); \ li r0, opcode; \ b __syscall_common; \ END(name) .text .align 4 ENTRY(__syscall_common) sc bnslr+ /* if no error return to LR */ neg r3,r3 /* r3 = -r3 to return -errno value */ blr END(__syscall_common) 
ENTRY(__cr_restore_rt) li r0, __NR_rt_sigreturn b __syscall_common END(__cr_restore_rt) crac-criu-1.5.0/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl000066400000000000000000000225161471504326700257530ustar00rootroot00000000000000# # System calls table, please make sure the table consists of only the syscalls # really used somewhere in the project. # # The template is (name and arguments are optional if you need only __NR_x # defined, but no real entry point in syscalls lib). # # name code name arguments # ----------------------------------------------------------------------- # __NR_read 3 sys_read (int fd, void *buf, unsigned long count) __NR_write 4 sys_write (int fd, const void *buf, unsigned long count) __NR_open 5 sys_open (const char *filename, unsigned long flags, unsigned long mode) __NR_close 6 sys_close (int fd) __NR_lseek 19 sys_lseek (int fd, unsigned long offset, unsigned long origin) __NR_mmap 90 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) __NR_mprotect 125 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) __NR_munmap 91 sys_munmap (void *addr, unsigned long len) __NR_brk 45 sys_brk (void *addr) __NR_rt_sigaction 173 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) __NR_rt_sigprocmask 174 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) __NR_rt_sigreturn 172 sys_rt_sigreturn (void) __NR_ioctl 54 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) __NR_pread64 179 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) __NR_ptrace 26 sys_ptrace (long request, pid_t pid, void *addr, void *data) __NR_mremap 163 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) __NR_mincore 206 sys_mincore (void *addr, unsigned long size, unsigned char *vec) __NR_madvise 205 sys_madvise 
(unsigned long start, size_t len, int behavior) __NR_pause 29 sys_pause (void) __NR_nanosleep 162 sys_nanosleep (struct timespec *req, struct timespec *rem) __NR_getitimer 105 sys_getitimer (int which, const struct itimerval *val) __NR_setitimer 104 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) __NR_getpid 20 sys_getpid (void) __NR_socket 326 sys_socket (int domain, int type, int protocol) __NR_connect 328 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) __NR_sendto 335 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) __NR_recvfrom 337 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) __NR_sendmsg 341 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) __NR_recvmsg 342 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) __NR_shutdown 338 sys_shutdown (int sockfd, int how) __NR_bind 327 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) __NR_setsockopt 339 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) __NR_getsockopt 340 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) __NR_clone 120 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) __NR_exit 1 sys_exit (unsigned long error_code) __NR_wait4 114 sys_wait4 (int pid, int *status, int options, struct rusage *ru) __NR_kill 37 sys_kill (long pid, int sig) __NR_fcntl 55 sys_fcntl (int fd, int type, long arg) __NR_flock 143 sys_flock (int fd, unsigned long cmd) __NR_mkdir 39 sys_mkdir (const char *name, int mode) __NR_rmdir 40 sys_rmdir (const char *name) __NR_unlink 10 sys_unlink (char *pathname) __NR_readlinkat 296 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) __NR_umask 60 sys_umask (int mask) __NR_getgroups 80 sys_getgroups (int gsize, unsigned int *groups) __NR_setgroups 81 
sys_setgroups (int gsize, unsigned int *groups) __NR_setresuid 164 sys_setresuid (int uid, int euid, int suid) __NR_getresuid 165 sys_getresuid (int *uid, int *euid, int *suid) __NR_setresgid 169 sys_setresgid (int gid, int egid, int sgid) __NR_getresgid 170 sys_getresgid (int *gid, int *egid, int *sgid) __NR_getpgid 132 sys_getpgid (pid_t pid) __NR_setfsuid 138 sys_setfsuid (int fsuid) __NR_setfsgid 139 sys_setfsgid (int fsgid) __NR_getsid 147 sys_getsid (void) __NR_capget 183 sys_capget (struct cap_header *h, struct cap_data *d) __NR_capset 184 sys_capset (struct cap_header *h, struct cap_data *d) __NR_rt_sigqueueinfo 177 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) __NR_sigaltstack 185 sys_sigaltstack (const void *uss, void *uoss) __NR_personality 136 sys_personality (unsigned int personality) __NR_setpriority 97 sys_setpriority (int which, int who, int nice) __NR_sched_setscheduler 156 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) __NR_prctl 171 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) __NR_setrlimit 75 sys_setrlimit (int resource, struct krlimit *rlim) __NR_mount 21 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) __NR_umount2 52 sys_umount2 (char *name, int flags) __NR_gettid 207 sys_gettid (void) __NR_futex 221 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) __NR_set_tid_address 232 sys_set_tid_address (int *tid_addr) __NR_restart_syscall 0 sys_restart_syscall (void) __NR_sys_timer_create 240 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) __NR_sys_timer_settime 241 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) __NR_sys_timer_gettime 242 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 
243 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 244 sys_timer_delete (kernel_timer_t timer_id) __NR_clock_gettime 246 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) __NR_exit_group 234 sys_exit_group (int error_code) __NR_waitid 272 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_set_robust_list 300 sys_set_robust_list (struct robust_list_head *head, size_t len) __NR_get_robust_list 299 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) __NR_vmsplice 285 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) __NR_openat 286 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_fallocate 309 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) __NR_timerfd_settime 311 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) __NR_signalfd4 313 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) __NR_rt_tgsigqueueinfo 322 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) __NR_fanotify_init 323 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) __NR_fanotify_mark 324 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) __NR_open_by_handle_at 346 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) __NR_setns 350 sys_setns (int fd, int nstype) __NR_kcmp 354 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) __NR_seccomp 358 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) __NR_memfd_create 360 sys_memfd_create (const char *name, unsigned int flags) __NR_io_setup 227 sys_io_setup (unsigned nr_events, aio_context_t *ctx_idp) __NR_io_getevents 229 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout) __NR_io_submit 230 
sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp) __NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth) __NR_gettimeofday 78 sys_gettimeofday (struct timeval *tv, struct timezone *tz) __NR_preadv 320 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) __NR_userfaultfd 364 sys_userfaultfd (int flags) __NR_ppoll 281 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) __NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) __NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 387 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) __NR_membarrier 365 sys_membarrier (int cmd, unsigned int flags, int cpu_id) crac-criu-1.5.0/compel/arch/ppc64/scripts/000077500000000000000000000000001471504326700202155ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/ppc64/scripts/compel-pack.lds.S000066400000000000000000000007361471504326700233230ustar00rootroot00000000000000OUTPUT_ARCH(powerpc:common64) EXTERN(__export_parasite_head_start) SECTIONS { .text : { *(.head.text) ASSERT(DEFINED(__export_parasite_head_start), "Symbol __export_parasite_head_start is missing"); *(.text*) *(.compel.exit) 
*(.compel.init) } .data : ALIGN(0x10000) { *(.data*) *(.bss*) } .rodata : { *(.rodata*) *(.got*) } .toc : ALIGN(8) { *(.toc*) } /DISCARD/ : { *(.debug*) *(.comment*) *(.note*) *(.group*) *(.eh_frame*) } } crac-criu-1.5.0/compel/arch/ppc64/src/000077500000000000000000000000001471504326700173155ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/ppc64/src/lib/000077500000000000000000000000001471504326700200635ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/ppc64/src/lib/cpu.c000066400000000000000000000031671471504326700210250ustar00rootroot00000000000000#include #include #include #include #include #include "compel-cpu.h" #include "common/bitops.h" #include "log.h" #undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_info; static void fetch_rt_cpuinfo(void) { static bool rt_info_done = false; if (!rt_info_done) { compel_cpuid(&rt_info); rt_info_done = true; } } void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { } void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { } int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) { return 0; } int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { return 0; } int compel_cpuid(compel_cpuinfo_t *info) { info->hwcap[0] = getauxval(AT_HWCAP); info->hwcap[1] = getauxval(AT_HWCAP2); if (!info->hwcap[0] || !info->hwcap[1]) { pr_err("Can't read the hardware capabilities\n"); return -1; } return 0; } bool compel_cpu_has_feature(unsigned int feature) { fetch_rt_cpuinfo(); return compel_test_cpu_cap(&rt_info, feature); } bool compel_fpu_has_feature(unsigned int feature) { fetch_rt_cpuinfo(); return compel_test_fpu_cap(&rt_info, feature); } uint32_t compel_fpu_feature_size(unsigned int feature) { fetch_rt_cpuinfo(); return 0; } uint32_t compel_fpu_feature_offset(unsigned int feature) { fetch_rt_cpuinfo(); return 0; } void compel_cpu_clear_feature(unsigned int feature) { fetch_rt_cpuinfo(); return compel_clear_cpu_cap(&rt_info, 
feature); } void compel_cpu_copy_cpuinfo(compel_cpuinfo_t *c) { fetch_rt_cpuinfo(); memcpy(c, &rt_info, sizeof(rt_info)); } crac-criu-1.5.0/compel/arch/ppc64/src/lib/handle-elf-host.c000077700000000000000000000000001471504326700253372handle-elf.custar00rootroot00000000000000crac-criu-1.5.0/compel/arch/ppc64/src/lib/handle-elf.c000066400000000000000000000014751471504326700222350ustar00rootroot00000000000000#include #include #include "handle-elf.h" #include "piegen.h" #include "log.h" static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; static const unsigned char __maybe_unused elf_ident_64_be[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; int handle_binary(void *mem, size_t size) { const unsigned char *elf_ident = #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ elf_ident_64_le; #else elf_ident_64_be; #endif if (memcmp(mem, elf_ident, sizeof(elf_ident_64_le)) == 0) return handle_elf_ppc64(mem, size); pr_err("Unsupported Elf format detected\n"); return -EINVAL; } crac-criu-1.5.0/compel/arch/ppc64/src/lib/include/000077500000000000000000000000001471504326700215065ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/ppc64/src/lib/include/cpu.h000066400000000000000000000000001471504326700224340ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/ppc64/src/lib/include/handle-elf.h000066400000000000000000000004661471504326700236640ustar00rootroot00000000000000#ifndef COMPEL_HANDLE_ELF_H__ #define COMPEL_HANDLE_ELF_H__ #include "elf64-types.h" #define ELF_PPC64 #define __handle_elf handle_elf_ppc64 #define arch_is_machine_supported(e_machine) (e_machine == EM_PPC64) extern int handle_elf_ppc64(void *mem, size_t size); #endif /* COMPEL_HANDLE_ELF_H__ */ 
crac-criu-1.5.0/compel/arch/ppc64/src/lib/include/syscall.h000066400000000000000000000002521471504326700233300ustar00rootroot00000000000000#ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ #define __NR(syscall, compat) \ ({ \ (void)compat; \ __NR_##syscall; \ }) #endif crac-criu-1.5.0/compel/arch/ppc64/src/lib/include/uapi/000077500000000000000000000000001471504326700224445ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/ppc64/src/lib/include/uapi/asm/000077500000000000000000000000001471504326700232245ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/ppc64/src/lib/include/uapi/asm/.gitignore000066400000000000000000000000001471504326700252020ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/ppc64/src/lib/include/uapi/asm/breakpoints.h000066400000000000000000000003771471504326700257250ustar00rootroot00000000000000#ifndef __COMPEL_BREAKPOINTS_H__ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT static inline int ptrace_set_breakpoint(pid_t pid, void *addr) { return 0; } static inline int ptrace_flush_breakpoints(pid_t pid) { return 0; } #endif crac-criu-1.5.0/compel/arch/ppc64/src/lib/include/uapi/asm/cpu.h000066400000000000000000000002651471504326700241670ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_CPU_H__ #define UAPI_COMPEL_ASM_CPU_H__ #include typedef struct { uint64_t hwcap[2]; } compel_cpuinfo_t; #endif /* UAPI_COMPEL_ASM_CPU_H__ */ crac-criu-1.5.0/compel/arch/ppc64/src/lib/include/uapi/asm/fpu.h000066400000000000000000000001211471504326700241610ustar00rootroot00000000000000#ifndef __CR_ASM_FPU_H__ #define __CR_ASM_FPU_H__ #endif /* __CR_ASM_FPU_H__ */ crac-criu-1.5.0/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h000066400000000000000000000053061471504326700260130ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_TYPES_H__ #define UAPI_COMPEL_ASM_TYPES_H__ #include #include #include #define SIGMAX_OLD 31 #define SIGMAX 64 /* * Copied from kernel header arch/powerpc/include/uapi/asm/ptrace.h */ 
typedef struct { unsigned long gpr[32]; unsigned long nip; unsigned long msr; unsigned long orig_gpr3; /* Used for restarting system calls */ unsigned long ctr; unsigned long link; unsigned long xer; unsigned long ccr; unsigned long softe; /* Soft enabled/disabled */ unsigned long trap; /* Reason for being here */ /* * N.B. for critical exceptions on 4xx, the dar and dsisr * fields are overloaded to hold srr0 and srr1. */ unsigned long dar; /* Fault registers */ unsigned long dsisr; /* on 4xx/Book-E used for ESR */ unsigned long result; /* Result of a system call */ } user_regs_struct_t; #define NVSXREG 32 #define USER_FPREGS_FL_FP 0x00001 #define USER_FPREGS_FL_ALTIVEC 0x00002 #define USER_FPREGS_FL_VSX 0x00004 #define USER_FPREGS_FL_TM 0x00010 #ifndef NT_PPC_TM_SPR #define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ #define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ #define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ #define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ #define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ #endif #define MSR_TMA (1UL << 34) /* bit 29 Trans Mem state: Transactional */ #define MSR_TMS (1UL << 33) /* bit 30 Trans Mem state: Suspended */ #define MSR_TM (1UL << 32) /* bit 31 Trans Mem Available */ #define MSR_VEC (1UL << 25) #define MSR_VSX (1UL << 23) #define MSR_TM_ACTIVE(x) ((((x)&MSR_TM) && ((x) & (MSR_TMA | MSR_TMS))) != 0) typedef struct { uint64_t fpregs[NFPREG]; __vector128 vrregs[NVRREG]; uint64_t vsxregs[NVSXREG]; int flags; struct tm_regs { int flags; struct { uint64_t tfhar, texasr, tfiar; } tm_spr_regs; user_regs_struct_t regs; uint64_t fpregs[NFPREG]; __vector128 vrregs[NVRREG]; uint64_t vsxregs[NVSXREG]; } tm; } user_fpregs_struct_t; #define REG_RES(regs) ((uint64_t)(regs).gpr[3]) #define REG_IP(regs) ((uint64_t)(regs).nip) #define SET_REG_IP(regs, val) ((regs).nip = (val)) #define REG_SP(regs) ((uint64_t)(regs).gpr[1]) #define REG_SYSCALL_NR(regs) 
((uint64_t)(regs).gpr[0]) #define user_regs_native(pregs) true #define ARCH_SI_TRAP TRAP_BRKPT #define __NR(syscall, compat) \ ({ \ (void)compat; \ __NR_##syscall; \ }) #define __compel_arch_fetch_thread_area(tid, th) 0 #define compel_arch_fetch_thread_area(tctl) 0 #define compel_arch_get_tls_task(ctl, tls) #define compel_arch_get_tls_thread(tctl, tls) #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ crac-criu-1.5.0/compel/arch/ppc64/src/lib/include/uapi/asm/processor-flags.h000066400000000000000000000002121471504326700265010ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ #define UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ #endif /* UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ */ crac-criu-1.5.0/compel/arch/ppc64/src/lib/include/uapi/asm/processor.h000066400000000000000000000001701471504326700254120ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_PROCESSOR_H__ #define UAPI_COMPEL_ASM_PROCESSOR_H__ #endif /* UAPI_COMPEL_ASM_PROCESSOR_H__ */ crac-criu-1.5.0/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h000066400000000000000000000050621471504326700251750ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ #include #include #include /* * sigcontext structure defined in file * /usr/include/powerpc64le-linux-gnu/bits/sigcontext.h, * included from /usr/include/signal.h * * Kernel definition can be found in arch/powerpc/include/uapi/asm/sigcontext.h */ #include // XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include #define RT_SIGFRAME_OFFSET(rt_sigframe) 0 /* Copied from the Linux kernel header arch/powerpc/include/asm/ptrace.h */ #define USER_REDZONE_SIZE 512 #if _CALL_ELF != 2 #error Only supporting ABIv2. 
#else #define STACK_FRAME_MIN_SIZE 32 #endif /* Copied from the Linux kernel source file arch/powerpc/kernel/signal_64.c */ #define TRAMP_SIZE 6 /* * ucontext_t defined in /usr/include/powerpc64le-linux-gnu/sys/ucontext.h */ struct rt_sigframe { /* sys_rt_sigreturn requires the ucontext be the first field */ ucontext_t uc; ucontext_t uc_transact; /* Transactional state */ unsigned long _unused[2]; unsigned int tramp[TRAMP_SIZE]; struct rt_siginfo *pinfo; void *puc; struct rt_siginfo info; /* New 64 bit little-endian ABI allows redzone of 512 bytes below sp */ char abigap[USER_REDZONE_SIZE]; } __attribute__((aligned(16))); /* clang-format off */ #define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ asm volatile( \ "mr 1, %0 \n" \ "li 0, "__stringify(__NR_rt_sigreturn)" \n" \ "sc \n" \ : \ : "r"(new_sp) \ : "memory") /* clang-format on */ #if _CALL_ELF != 2 #error Only supporting ABIv2. #else #define FRAME_MIN_SIZE_PARM 96 #endif #define RT_SIGFRAME_UC(rt_sigframe) (&(rt_sigframe)->uc) #define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.gp_regs[PT_NIP]) #define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) #define RT_SIGFRAME_FPU(rt_sigframe) (&(rt_sigframe)->uc.uc_mcontext) #define rt_sigframe_erase_sigset(sigframe) memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) #define rt_sigframe_copy_sigset(sigframe, from) memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) #define MSR_TMA (1UL << 34) /* bit 29 Trans Mem state: Transactional */ #define MSR_TMS (1UL << 33) /* bit 30 Trans Mem state: Suspended */ #define MSR_TM (1UL << 32) /* bit 31 Trans Mem Available */ #define MSR_VEC (1UL << 25) #define MSR_VSX (1UL << 23) #define MSR_TM_ACTIVE(x) ((((x)&MSR_TM) && ((x) & (MSR_TMA | MSR_TMS))) != 0) #endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ crac-criu-1.5.0/compel/arch/ppc64/src/lib/infect.c000066400000000000000000000363631471504326700215120ustar00rootroot00000000000000#include #include #include #include #include #include #include 
#include #include "uapi/compel/asm/infect-types.h" #include "errno.h" #include "log.h" #include "common/bug.h" #include "common/page.h" #include "common/err.h" #include "infect.h" #include "infect-priv.h" #ifndef NT_PPC_TM_SPR #define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ #define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ #define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ #define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ #define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ #endif unsigned __page_size = 0; unsigned __page_shift = 0; /* * Injected syscall instruction */ const uint32_t code_syscall[] = { 0x44000002, /* sc */ 0x0fe00000 /* twi 31,0,0 */ }; static inline __always_unused void __check_code_syscall(void) { BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE); BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); } static void prep_gp_regs(mcontext_t *dst, user_regs_struct_t *regs) { memcpy(dst->gp_regs, regs->gpr, sizeof(regs->gpr)); dst->gp_regs[PT_NIP] = regs->nip; dst->gp_regs[PT_MSR] = regs->msr; dst->gp_regs[PT_ORIG_R3] = regs->orig_gpr3; dst->gp_regs[PT_CTR] = regs->ctr; dst->gp_regs[PT_LNK] = regs->link; dst->gp_regs[PT_XER] = regs->xer; dst->gp_regs[PT_CCR] = regs->ccr; dst->gp_regs[PT_TRAP] = regs->trap; } static void put_fpu_regs(mcontext_t *mc, uint64_t *fpregs) { uint64_t *mcfp = (uint64_t *)mc->fp_regs; memcpy(mcfp, fpregs, sizeof(*fpregs) * NFPREG); } static void put_altivec_regs(mcontext_t *mc, __vector128 *vrregs) { vrregset_t *v_regs = (vrregset_t *)(((unsigned long)mc->vmx_reserve + 15) & ~0xful); memcpy(&v_regs->vrregs[0][0], vrregs, sizeof(uint64_t) * 2 * (NVRREG - 1)); v_regs->vrsave = *((uint32_t *)&vrregs[NVRREG - 1]); mc->v_regs = v_regs; } static void put_vsx_regs(mcontext_t *mc, uint64_t *vsxregs) { memcpy((uint64_t *)(mc->v_regs + 1), vsxregs, sizeof(*vsxregs) * NVSXREG); } int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, 
user_fpregs_struct_t *fpregs) { mcontext_t *dst_tc = &sigframe->uc_transact.uc_mcontext; mcontext_t *dst = &sigframe->uc.uc_mcontext; if (fpregs->flags & USER_FPREGS_FL_TM) { prep_gp_regs(&sigframe->uc_transact.uc_mcontext, &fpregs->tm.regs); prep_gp_regs(&sigframe->uc.uc_mcontext, &fpregs->tm.regs); } else { prep_gp_regs(&sigframe->uc.uc_mcontext, regs); } if (fpregs->flags & USER_FPREGS_FL_TM) sigframe->uc.uc_link = &sigframe->uc_transact; if (fpregs->flags & USER_FPREGS_FL_FP) { if (fpregs->flags & USER_FPREGS_FL_TM) { put_fpu_regs(&sigframe->uc_transact.uc_mcontext, fpregs->tm.fpregs); put_fpu_regs(&sigframe->uc.uc_mcontext, fpregs->tm.fpregs); } else { put_fpu_regs(&sigframe->uc.uc_mcontext, fpregs->fpregs); } } if (fpregs->flags & USER_FPREGS_FL_ALTIVEC) { if (fpregs->flags & USER_FPREGS_FL_TM) { put_altivec_regs(&sigframe->uc_transact.uc_mcontext, fpregs->tm.vrregs); put_altivec_regs(&sigframe->uc.uc_mcontext, fpregs->tm.vrregs); dst_tc->gp_regs[PT_MSR] |= MSR_VEC; } else { put_altivec_regs(&sigframe->uc.uc_mcontext, fpregs->vrregs); } dst->gp_regs[PT_MSR] |= MSR_VEC; if (fpregs->flags & USER_FPREGS_FL_VSX) { if (fpregs->flags & USER_FPREGS_FL_TM) { put_vsx_regs(&sigframe->uc_transact.uc_mcontext, fpregs->tm.vsxregs); put_vsx_regs(&sigframe->uc.uc_mcontext, fpregs->tm.vsxregs); dst_tc->gp_regs[PT_MSR] |= MSR_VSX; } else { put_vsx_regs(&sigframe->uc.uc_mcontext, fpregs->vsxregs); } dst->gp_regs[PT_MSR] |= MSR_VSX; } } return 0; } static void update_vregs(mcontext_t *lcontext, mcontext_t *rcontext) { if (lcontext->v_regs) { uint64_t offset = (uint64_t)(lcontext->v_regs) - (uint64_t)lcontext; lcontext->v_regs = (vrregset_t *)((uint64_t)rcontext + offset); pr_debug("Updated v_regs:%llx (rcontext:%llx)\n", (unsigned long long)lcontext->v_regs, (unsigned long long)rcontext); } } int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *frame, struct rt_sigframe *rframe) { uint64_t msr = frame->uc.uc_mcontext.gp_regs[PT_MSR]; update_vregs(&frame->uc.uc_mcontext, 
&rframe->uc.uc_mcontext); /* Sanity check: If TM so uc_link should be set, otherwise not */ if (MSR_TM_ACTIVE(msr) ^ (!!(frame->uc.uc_link))) { BUG(); return -1; } /* Updating the transactional state address if any */ if (frame->uc.uc_link) { update_vregs(&frame->uc_transact.uc_mcontext, &rframe->uc_transact.uc_mcontext); frame->uc.uc_link = &rframe->uc_transact; } return 0; } /* This is the layout of the POWER7 VSX registers and the way they * overlap with the existing FPR and VMX registers. * * VSR doubleword 0 VSR doubleword 1 * ---------------------------------------------------------------- * VSR[0] | FPR[0] | | * ---------------------------------------------------------------- * VSR[1] | FPR[1] | | * ---------------------------------------------------------------- * | ... | | * ---------------------------------------------------------------- * VSR[30] | FPR[30] | | * ---------------------------------------------------------------- * VSR[31] | FPR[31] | | * ---------------------------------------------------------------- * VSR[32] | VR[0] | * ---------------------------------------------------------------- * VSR[33] | VR[1] | * ---------------------------------------------------------------- * | ... | * ---------------------------------------------------------------- * VSR[62] | VR[30] | * ---------------------------------------------------------------- * VSR[63] | VR[31] | * ---------------------------------------------------------------- * * PTRACE_GETFPREGS returns FPR[0..31] + FPSCR * PTRACE_GETVRREGS returns VR[0..31] + VSCR + VRSAVE * PTRACE_GETVSRREGS returns VSR[0..31] * * PTRACE_GETVSRREGS and PTRACE_GETFPREGS are required since we need * to save FPSCR too. * * There 32 VSX double word registers to save since the 32 first VSX double * word registers are saved through FPR[0..32] and the remaining registers * are saved when saving the Altivec registers VR[0..32]. 
*/ static int get_fpu_regs(pid_t pid, user_fpregs_struct_t *fp) { if (ptrace(PTRACE_GETFPREGS, pid, 0, (void *)&fp->fpregs) < 0) { pr_perror("Couldn't get floating-point registers"); return -1; } fp->flags |= USER_FPREGS_FL_FP; return 0; } static int get_altivec_regs(pid_t pid, user_fpregs_struct_t *fp) { if (ptrace(PTRACE_GETVRREGS, pid, 0, (void *)&fp->vrregs) < 0) { /* PTRACE_GETVRREGS returns EIO if Altivec is not supported. * This should not happen if msr_vec is set. */ if (errno != EIO) { pr_perror("Couldn't get Altivec registers"); return -1; } pr_debug("Altivec not supported\n"); } else { pr_debug("Dumping Altivec registers\n"); fp->flags |= USER_FPREGS_FL_ALTIVEC; } return 0; } /* * Since the FPR[0-31] is stored in the first double word of VSR[0-31] and * FPR are saved through the FP state, there is no need to save the upper part * of the first 32 VSX registers. * Furthermore, the 32 last VSX registers are also the 32 Altivec registers * already saved, so no need to save them. * As a consequence, only the doubleword 1 of the 32 first VSX registers have * to be saved (the ones are returned by PTRACE_GETVSRREGS). */ static int get_vsx_regs(pid_t pid, user_fpregs_struct_t *fp) { if (ptrace(PTRACE_GETVSRREGS, pid, 0, (void *)fp->vsxregs) < 0) { /* * EIO is returned in the case PTRACE_GETVRREGS is not * supported. 
*/ if (errno != EIO) { pr_perror("Couldn't get VSX registers"); return -1; } pr_debug("VSX register's dump not supported.\n"); } else { pr_debug("Dumping VSX registers\n"); fp->flags |= USER_FPREGS_FL_VSX; } return 0; } static int get_tm_regs(pid_t pid, user_fpregs_struct_t *fpregs) { struct iovec iov; pr_debug("Dumping TM registers\n"); #define TM_REQUIRED 0 #define TM_OPTIONAL 1 #define PTRACE_GET_TM(s, n, c, u) \ do { \ iov.iov_base = &s; \ iov.iov_len = sizeof(s); \ if (ptrace(PTRACE_GETREGSET, pid, c, &iov)) { \ if (!u || errno != EIO) { \ pr_perror("Couldn't get TM " n); \ pr_err("Your kernel seems to not support the " \ "new TM ptrace API (>= 4.8)\n"); \ goto out_free; \ } \ pr_debug("TM " n " not supported.\n"); \ iov.iov_base = NULL; \ } \ } while (0) /* Get special registers */ PTRACE_GET_TM(fpregs->tm.tm_spr_regs, "SPR", NT_PPC_TM_SPR, TM_REQUIRED); /* Get checkpointed regular registers */ PTRACE_GET_TM(fpregs->tm.regs, "GPR", NT_PPC_TM_CGPR, TM_REQUIRED); /* Get checkpointed FP registers */ PTRACE_GET_TM(fpregs->tm.fpregs, "FPR", NT_PPC_TM_CFPR, TM_OPTIONAL); if (iov.iov_base) fpregs->tm.flags |= USER_FPREGS_FL_FP; /* Get checkpointed VMX (Altivec) registers */ PTRACE_GET_TM(fpregs->tm.vrregs, "VMX", NT_PPC_TM_CVMX, TM_OPTIONAL); if (iov.iov_base) fpregs->tm.flags |= USER_FPREGS_FL_ALTIVEC; /* Get checkpointed VSX registers */ PTRACE_GET_TM(fpregs->tm.vsxregs, "VSX", NT_PPC_TM_CVSX, TM_OPTIONAL); if (iov.iov_base) fpregs->tm.flags |= USER_FPREGS_FL_VSX; return 0; out_free: return -1; /* still failing the checkpoint */ } /* * This is inspired by kernel function check_syscall_restart in * arch/powerpc/kernel/signal.c */ #ifndef TRAP #define TRAP(r) ((r).trap & ~0xF) #endif static bool trap_is_scv(user_regs_struct_t *regs) { return TRAP(*regs) == 0x3000; } static bool trap_is_syscall(user_regs_struct_t *regs) { return trap_is_scv(regs) || TRAP(*regs) == 0x0C00; } static void handle_syscall(pid_t pid, user_regs_struct_t *regs) { unsigned long ret = 
regs->gpr[3]; if (trap_is_scv(regs)) { if (!IS_ERR_VALUE(ret)) return; ret = -ret; } else if (!(regs->ccr & 0x10000000)) { return; } /* Restart or interrupt the system call */ switch (ret) { case ERESTARTNOHAND: case ERESTARTSYS: case ERESTARTNOINTR: regs->gpr[3] = regs->orig_gpr3; regs->nip -= 4; break; case ERESTART_RESTARTBLOCK: pr_warn("Will restore %d with interrupted system call\n", pid); regs->gpr[3] = trap_is_scv(regs) ? -EINTR : EINTR; break; } } static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { pr_info("Dumping GP/FPU registers for %d\n", pid); if (trap_is_syscall(regs)) handle_syscall(pid, regs); /* Resetting trap since we are now coming from user space. */ regs->trap = 0; fpregs->flags = 0; /* * Check for Transactional Memory operation in progress. * Until we have support of TM register's state through the ptrace API, * we can't checkpoint process with TM operation in progress (almost * impossible) or suspended (easy to get). */ if (MSR_TM_ACTIVE(regs->msr)) { pr_debug("Task %d has %s TM operation at 0x%lx\n", pid, (regs->msr & MSR_TMS) ? "a suspended" : "an active", regs->nip); if (get_tm_regs(pid, fpregs)) return -1; fpregs->flags = USER_FPREGS_FL_TM; } if (get_fpu_regs(pid, fpregs)) return -1; if (get_altivec_regs(pid, fpregs)) return -1; if (fpregs->flags & USER_FPREGS_FL_ALTIVEC) { /* * Save the VSX registers if Altivec registers are supported */ if (get_vsx_regs(pid, fpregs)) return -1; } return 0; } int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { user_fpregs_struct_t tmp, *fpregs = ext_regs ? 
ext_regs : &tmp; int ret; ret = __get_task_regs(pid, regs, fpregs); if (ret) return ret; return save(arg, regs, fpregs); } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) { int ret = 0; pr_info("Restoring GP/FPU registers for %d\n", pid); /* XXX: should restore TM registers somehow? */ if (ext_regs->flags & USER_FPREGS_FL_FP) { if (ptrace(PTRACE_SETFPREGS, pid, 0, (void *)&ext_regs->fpregs) < 0) { pr_perror("Couldn't set floating-point registers"); ret = -1; } } if (ext_regs->flags & USER_FPREGS_FL_ALTIVEC) { if (ptrace(PTRACE_SETVRREGS, pid, 0, (void *)&ext_regs->vrregs) < 0) { pr_perror("Couldn't set Altivec registers"); ret = -1; } if (ptrace(PTRACE_SETVSRREGS, pid, 0, (void *)ext_regs->vsxregs) < 0) { pr_perror("Couldn't set VSX registers"); ret = -1; } } return ret; } int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { user_regs_struct_t regs = ctl->orig.regs; int err; regs.gpr[0] = (unsigned long)nr; regs.gpr[3] = arg1; regs.gpr[4] = arg2; regs.gpr[5] = arg3; regs.gpr[6] = arg4; regs.gpr[7] = arg5; regs.gpr[8] = arg6; err = compel_execute_syscall(ctl, ®s, (char *)code_syscall); *ret = regs.gpr[3]; return err; } void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { long map = 0; int err; err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset); if (err < 0 || (long)map < 0) map = 0; return (void *)map; } void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { /* * OpenPOWER ABI requires that r12 is set to the calling function address * to compute the TOC pointer. 
*/ regs->gpr[12] = new_ip; regs->nip = new_ip; if (stack) regs->gpr[1] = (unsigned long)stack - STACK_FRAME_MIN_SIZE; regs->trap = 0; } bool arch_can_dump_task(struct parasite_ctl *ctl) { /* * TODO: We should detect 32bit task when BE support is done. */ return true; } int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) { long ret; int err; err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->uc.uc_stack, 0, 0, 0, 0); return err ? err : ret; } /* * Copied for the Linux kernel arch/powerpc/include/asm/processor.h * * NOTE: 32bit tasks are not supported. */ #define TASK_SIZE_64TB (0x0000400000000000UL) #define TASK_SIZE_512TB (0x0002000000000000UL) #define TASK_SIZE_MIN TASK_SIZE_64TB #define TASK_SIZE_MAX TASK_SIZE_512TB unsigned long compel_task_size(void) { unsigned long task_size; for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size <<= 1) if (munmap((void *)task_size, page_size())) break; return task_size; } crac-criu-1.5.0/compel/arch/s390/000077500000000000000000000000001471504326700162705ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/s390/plugins/000077500000000000000000000000001471504326700177515ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/s390/plugins/include/000077500000000000000000000000001471504326700213745ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/s390/plugins/include/asm/000077500000000000000000000000001471504326700221545ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/s390/plugins/include/asm/prologue.h000077700000000000000000000000001471504326700346762../../../../../arch/x86/plugins/include/asm/prologue.hustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/s390/plugins/include/asm/syscall-types.h000066400000000000000000000013761471504326700251500ustar00rootroot00000000000000#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ #define COMPEL_ARCH_SYSCALL_TYPES_H__ #define SA_RESTORER 0x04000000U typedef void rt_signalfn_t(int, siginfo_t *, void *); typedef 
rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; #define _KNSIG 64 #define _NSIG_BPW 64 #define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) typedef struct { unsigned long sig[_KNSIG_WORDS]; } k_rtsigset_t; /* * Used for rt_sigaction() system call - see kernel "struct sigaction" in * include/linux/signal.h. */ typedef struct { rt_sighandler_t rt_sa_handler; unsigned long rt_sa_flags; rt_sigrestore_t rt_sa_restorer; k_rtsigset_t rt_sa_mask; } rt_sigaction_t; struct mmap_arg_struct; #endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ crac-criu-1.5.0/compel/arch/s390/plugins/std/000077500000000000000000000000001471504326700205435ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/s390/plugins/std/parasite-head.S000066400000000000000000000003371471504326700234010ustar00rootroot00000000000000#include "common/asm/linkage.h" .section .head.text, "ax" ENTRY(__export_parasite_head_start) brasl %r14,parasite_service .long 0x00010001 /* S390_BREAKPOINT_U16: Generates SIGTRAP */ END(__export_parasite_head_start) crac-criu-1.5.0/compel/arch/s390/plugins/std/syscalls/000077500000000000000000000000001471504326700224005ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/s390/plugins/std/syscalls/Makefile.syscalls000066400000000000000000000053061471504326700257000ustar00rootroot00000000000000ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ asflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ sys-types := $(obj)/include/uapi/std/syscall-types.h sys-codes := $(obj)/include/uapi/std/syscall-codes.h sys-proto := $(obj)/include/uapi/std/syscall.h sys-def := $(PLUGIN_ARCH_DIR)/std/syscalls/syscall-s390.tbl sys-asm-common-name := std/syscalls/syscall-common-s390.S sys-asm-common := $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl.c sys-asm := ./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls.S std-lib-y += $(sys-asm:.S=).o std-lib-y += 
./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls-s390.o $(sys-codes): $(sys-def) $(E) " GEN " $@ $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) cat $< | awk '/^__NR/{SYSN=$$1; sub("^__NR", "SYS", SYSN);'\ 'print "\n#ifndef ", $$1, "\n#define", $$1, $$2, "\n#endif";'\ 'print "#ifndef ", SYSN, "\n#define ", SYSN, $$1, "\n#endif"}' >> $@ $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ $(sys-proto): $(sys-def) $(E) " GEN " $@ $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ $(Q) echo "#include " >> $@ $(Q) echo "#include " >> $@ $(Q) cat $< | awk '/^__NR/{print "extern long", $$3, substr($$0, index($$0,$$4)), ";"}' >> $@ $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ $(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(E) " GEN " $@ $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#include " >> $@ $(Q) echo "#include \"$(sys-asm-common-name)\"" >> $@ $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", $$3, ",", $$2, ")"}' >> $@ $(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(E) " GEN " $@ $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "static struct syscall_exec_desc sc_exec_table[] = {" >> $@ $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", substr($$3, 5), ",", $$2, ")"}' >> $@ $(Q) echo " { }, /* terminator */" >> $@ $(Q) echo "};" >> $@ $(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(call msg-gen, $@) $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) std-headers-deps += $(sys-asm) $(sys-codes) $(sys-proto) $(sys-asm-types) mrproper-y += $(std-headers-deps) crac-criu-1.5.0/compel/arch/s390/plugins/std/syscalls/syscall-common-s390.S000066400000000000000000000017211471504326700261610ustar00rootroot00000000000000#include "common/asm/linkage.h" 
/* * Define a system call * * C-ABI on s390: * - Parameters 1-5 are passed in %r2-%r6 * - Parameter 6 is passed on the stack 160(%r15) * - Return value is in %r2 * - Return address is in %r14 * - Registers %r0-%r6,%r14 are call-clobbered * - Registers %r7-%r13,%r15 are call-saved * * SVC ABI on s390: * - For SVC 0 the system call number is passed in %r1 * - Parameters 1-6 are passed in %r2-%r7 * - Return value is passed in %r2 * - Besides of %r2 all registers are call-saved */ #define SYSCALL(name, opcode) \ ENTRY(name); \ lgr %r0,%r7; /* Save %r7 */ \ lg %r7,160(%r15); /* Load 6th parameter */ \ lghi %r1,opcode; /* Load SVC number */ \ svc 0; /* Issue SVC 0 */ \ lgr %r7,%r0; /* Restore %r7 */ \ br %r14; /* Return to caller */ \ END(name) \ /* * Issue rt_sigreturn system call for sa_restorer */ ENTRY(__cr_restore_rt) lghi %r1,__NR_rt_sigreturn svc 0 END(__cr_restore_rt) crac-criu-1.5.0/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl000066400000000000000000000223631471504326700252570ustar00rootroot00000000000000# # System calls table, please make sure the table consists of only the syscalls # really used somewhere in the project. # # The template is (name and arguments are optional if you need only __NR_x # defined, but no real entry point in syscalls lib). 
# # name code name arguments # ----------------------------------------------------------------------- # __NR_read 3 sys_read (int fd, void *buf, unsigned long count) __NR_write 4 sys_write (int fd, const void *buf, unsigned long count) __NR_open 5 sys_open (const char *filename, unsigned long flags, unsigned long mode) __NR_close 6 sys_close (int fd) __NR_lseek 19 sys_lseek (int fd, unsigned long offset, unsigned long origin) __NR_mmap 90 sys_old_mmap (struct mmap_arg_struct *) __NR_mprotect 125 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) __NR_munmap 91 sys_munmap (void *addr, unsigned long len) __NR_brk 45 sys_brk (void *addr) __NR_rt_sigaction 174 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) __NR_rt_sigprocmask 175 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) __NR_rt_sigreturn 173 sys_rt_sigreturn (void) __NR_ioctl 54 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) __NR_pread64 180 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) __NR_ptrace 26 sys_ptrace (long request, pid_t pid, void *addr, void *data) __NR_mremap 163 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) __NR_mincore 218 sys_mincore (void *addr, unsigned long size, unsigned char *vec) __NR_madvise 219 sys_madvise (unsigned long start, size_t len, int behavior) __NR_pause 29 sys_pause (void) __NR_nanosleep 162 sys_nanosleep (struct timespec *req, struct timespec *rem) __NR_getitimer 105 sys_getitimer (int which, const struct itimerval *val) __NR_setitimer 104 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) __NR_getpid 20 sys_getpid (void) __NR_socket 359 sys_socket (int domain, int type, int protocol) __NR_connect 362 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) __NR_sendto 369 sys_sendto (int sockfd, void *buff, size_t len, unsigned 
int flags, struct sockaddr *addr, int addr_len) __NR_recvfrom 371 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) __NR_sendmsg 370 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) __NR_recvmsg 372 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) __NR_shutdown 373 sys_shutdown (int sockfd, int how) __NR_bind 361 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) __NR_setsockopt 366 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) __NR_getsockopt 365 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) __NR_clone 120 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, void *child_tid, void *tls) __NR_exit 1 sys_exit (unsigned long error_code) __NR_wait4 114 sys_wait4 (int pid, int *status, int options, struct rusage *ru) __NR_kill 37 sys_kill (long pid, int sig) __NR_fcntl 55 sys_fcntl (int fd, int type, long arg) __NR_flock 143 sys_flock (int fd, unsigned long cmd) __NR_mkdir 39 sys_mkdir (const char *name, int mode) __NR_rmdir 40 sys_rmdir (const char *name) __NR_unlink 10 sys_unlink (char *pathname) __NR_readlinkat 298 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) __NR_umask 60 sys_umask (int mask) __NR_getgroups 205 sys_getgroups (int gsize, unsigned int *groups) __NR_setgroups 206 sys_setgroups (int gsize, unsigned int *groups) __NR_setresuid 208 sys_setresuid (int uid, int euid, int suid) __NR_getresuid 209 sys_getresuid (int *uid, int *euid, int *suid) __NR_setresgid 210 sys_setresgid (int gid, int egid, int sgid) __NR_getresgid 211 sys_getresgid (int *gid, int *egid, int *sgid) __NR_getpgid 132 sys_getpgid (pid_t pid) __NR_setfsuid 215 sys_setfsuid (int fsuid) __NR_setfsgid 216 sys_setfsgid (int fsgid) __NR_getsid 147 sys_getsid (void) __NR_capget 184 sys_capget (struct cap_header *h, struct cap_data *d) __NR_capset 185 sys_capset (struct cap_header *h, 
struct cap_data *d) __NR_rt_sigqueueinfo 178 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) __NR_sigaltstack 186 sys_sigaltstack (const void *uss, void *uoss) __NR_personality 136 sys_personality (unsigned int personality) __NR_setpriority 97 sys_setpriority (int which, int who, int nice) __NR_sched_setscheduler 156 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) __NR_prctl 172 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) __NR_setrlimit 75 sys_setrlimit (int resource, struct krlimit *rlim) __NR_mount 21 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) __NR_umount2 52 sys_umount2 (char *name, int flags) __NR_gettid 236 sys_gettid (void) __NR_futex 238 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) __NR_set_tid_address 252 sys_set_tid_address (int *tid_addr) __NR_restart_syscall 7 sys_restart_syscall (void) __NR_sys_timer_create 254 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) __NR_sys_timer_settime 255 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) __NR_sys_timer_gettime 256 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 257 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 258 sys_timer_delete (kernel_timer_t timer_id) __NR_clock_gettime 260 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) __NR_exit_group 248 sys_exit_group (int error_code) __NR_waitid 281 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_set_robust_list 304 sys_set_robust_list (struct robust_list_head *head, size_t len) __NR_get_robust_list 305 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) __NR_vmsplice 309 
sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) __NR_openat 288 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_fallocate 314 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) __NR_timerfd_settime 320 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) __NR_signalfd4 322 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) __NR_rt_tgsigqueueinfo 330 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) __NR_fanotify_init 332 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) __NR_fanotify_mark 333 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) __NR_open_by_handle_at 336 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) __NR_setns 339 sys_setns (int fd, int nstype) __NR_kcmp 343 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) __NR_seccomp 348 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) __NR_memfd_create 350 sys_memfd_create (const char *name, unsigned int flags) __NR_io_setup 243 sys_io_setup (unsigned nr_events, aio_context_t *ctx_idp) __NR_io_getevents 245 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout) __NR_io_submit 246 sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp) __NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth) __NR_userfaultfd 355 sys_userfaultfd (int flags) __NR_preadv 328 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) __NR_gettimeofday 78 sys_gettimeofday (struct timeval *tv, struct timezone *tz) __NR_ppoll 302 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) 
__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) __NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 383 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) __NR_membarrier 356 sys_membarrier (int cmd, unsigned int flags, int cpu_id) crac-criu-1.5.0/compel/arch/s390/plugins/std/syscalls/syscalls-s390.c000066400000000000000000000012521471504326700250750ustar00rootroot00000000000000#include "asm/infect-types.h" /* * Define prototype because of compile error if we include uapi/std/syscall.h */ long sys_old_mmap(struct mmap_arg_struct *); /* * On s390 we have defined __ARCH_WANT_SYS_OLD_MMAP - Therefore implement * system call with one parameter "mmap_arg_struct". 
*/ unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) { struct mmap_arg_struct arg_struct; arg_struct.addr = (unsigned long)addr; arg_struct.len = len; arg_struct.prot = prot; arg_struct.flags = flags; arg_struct.fd = fd; arg_struct.offset = offset; return sys_old_mmap(&arg_struct); } crac-criu-1.5.0/compel/arch/s390/scripts/000077500000000000000000000000001471504326700177575ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/s390/scripts/compel-pack.lds.S000066400000000000000000000007301471504326700230570ustar00rootroot00000000000000OUTPUT_ARCH(s390:64-bit) EXTERN(__export_parasite_head_start) SECTIONS { .text : { *(.head.text) ASSERT(DEFINED(__export_parasite_head_start), "Symbol __export_parasite_head_start is missing"); *(.text*) *(.compel.exit) *(.compel.init) } .data : ALIGN(0x1000) { *(.data*) *(.bss*) } .rodata : { *(.rodata*) *(.got*) } .toc : ALIGN(8) { *(.toc*) } /DISCARD/ : { *(.debug*) *(.comment*) *(.note*) *(.group*) *(.eh_frame*) } } crac-criu-1.5.0/compel/arch/s390/src/000077500000000000000000000000001471504326700170575ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/s390/src/lib/000077500000000000000000000000001471504326700176255ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/s390/src/lib/cpu.c000066400000000000000000000031111471504326700205540ustar00rootroot00000000000000#include #include #include #include "compel-cpu.h" #include "common/bitops.h" #include "common/compiler.h" #include "log.h" #undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_info; static bool rt_info_done = false; static void fetch_rt_cpuinfo(void) { if (!rt_info_done) { compel_cpuid(&rt_info); rt_info_done = true; } } void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { } void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { } int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) { return 0; } int 
compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { return 0; } int compel_cpuid(compel_cpuinfo_t *info) { info->hwcap[0] = getauxval(AT_HWCAP); info->hwcap[1] = getauxval(AT_HWCAP2); if (!info->hwcap[0]) { pr_err("Can't read the hardware capabilities\n"); return -1; } return 0; } bool compel_cpu_has_feature(unsigned int feature) { fetch_rt_cpuinfo(); return compel_test_cpu_cap(&rt_info, feature); } bool compel_fpu_has_feature(unsigned int feature) { fetch_rt_cpuinfo(); return compel_test_fpu_cap(&rt_info, feature); } uint32_t compel_fpu_feature_offset(unsigned int feature) { fetch_rt_cpuinfo(); return 0; } uint32_t compel_fpu_feature_size(unsigned int feature) { fetch_rt_cpuinfo(); return 0; } void compel_cpu_clear_feature(unsigned int feature) { fetch_rt_cpuinfo(); return compel_clear_cpu_cap(&rt_info, feature); } void compel_cpu_copy_cpuinfo(compel_cpuinfo_t *c) { fetch_rt_cpuinfo(); memcpy(c, &rt_info, sizeof(rt_info)); } crac-criu-1.5.0/compel/arch/s390/src/lib/handle-elf-host.c000077700000000000000000000000001471504326700251012handle-elf.custar00rootroot00000000000000crac-criu-1.5.0/compel/arch/s390/src/lib/handle-elf.c000066400000000000000000000007631471504326700217760ustar00rootroot00000000000000#include #include #include "handle-elf.h" #include "piegen.h" #include "log.h" static const unsigned char __maybe_unused elf_ident_64[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; int handle_binary(void *mem, size_t size) { if (memcmp(mem, elf_ident_64, sizeof(elf_ident_64)) == 0) return handle_elf_s390(mem, size); pr_err("Unsupported Elf format detected\n"); return -EINVAL; } crac-criu-1.5.0/compel/arch/s390/src/lib/include/000077500000000000000000000000001471504326700212505ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/s390/src/lib/include/handle-elf.h000066400000000000000000000004531471504326700234220ustar00rootroot00000000000000#ifndef 
COMPEL_HANDLE_ELF_H__ #define COMPEL_HANDLE_ELF_H__ #include "elf64-types.h" #define ELF_S390 #define __handle_elf handle_elf_s390 #define arch_is_machine_supported(e_machine) (e_machine == EM_S390) int handle_elf_s390(void *mem, size_t size); #endif /* COMPEL_HANDLE_ELF_H__ */ crac-criu-1.5.0/compel/arch/s390/src/lib/include/syscall.h000066400000000000000000000003241471504326700230720ustar00rootroot00000000000000#ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset); #endif crac-criu-1.5.0/compel/arch/s390/src/lib/include/uapi/000077500000000000000000000000001471504326700222065ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/s390/src/lib/include/uapi/asm/000077500000000000000000000000001471504326700227665ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/s390/src/lib/include/uapi/asm/breakpoints.h000066400000000000000000000003771471504326700254670ustar00rootroot00000000000000#ifndef __COMPEL_BREAKPOINTS_H__ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT static inline int ptrace_set_breakpoint(pid_t pid, void *addr) { return 0; } static inline int ptrace_flush_breakpoints(pid_t pid) { return 0; } #endif crac-criu-1.5.0/compel/arch/s390/src/lib/include/uapi/asm/cpu.h000066400000000000000000000002561471504326700237310ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_CPU_H__ #define UAPI_COMPEL_ASM_CPU_H__ #include typedef struct { uint64_t hwcap[2]; } compel_cpuinfo_t; #endif /* __CR_ASM_CPU_H__ */ crac-criu-1.5.0/compel/arch/s390/src/lib/include/uapi/asm/fpu.h000066400000000000000000000003251471504326700237310ustar00rootroot00000000000000#ifndef __CR_ASM_FPU_H__ #define __CR_ASM_FPU_H__ #include #include /* * This one is used in restorer */ typedef struct { bool has_fpu; } fpu_state_t; #endif /* __CR_ASM_FPU_H__ */ 
crac-criu-1.5.0/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h000066400000000000000000000042411471504326700255520ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_TYPES_H__ #define UAPI_COMPEL_ASM_TYPES_H__ #include #include #include #include #include "common/page.h" #define SIGMAX 64 #define SIGMAX_OLD 31 /* * Definitions from /usr/include/asm/ptrace.h: * * typedef struct * { * __u32 fpc; * freg_t fprs[NUM_FPRS]; * } s390_fp_regs; * * typedef struct * { * psw_t psw; * unsigned long gprs[NUM_GPRS]; * unsigned int acrs[NUM_ACRS]; * unsigned long orig_gpr2; * } s390_regs; */ typedef struct { uint64_t part1; uint64_t part2; } vector128_t; struct prfpreg { uint32_t fpc; uint64_t fprs[16]; }; #define USER_FPREGS_VXRS 0x000000001 /* Guarded-storage control block */ #define USER_GS_CB 0x000000002 /* Guarded-storage broadcast control block */ #define USER_GS_BC 0x000000004 /* Runtime-instrumentation control block */ #define USER_RI_CB 0x000000008 /* Runtime-instrumentation bit set */ #define USER_RI_ON 0x000000010 typedef struct { uint32_t flags; struct prfpreg prfpreg; uint64_t vxrs_low[16]; vector128_t vxrs_high[16]; uint64_t gs_cb[4]; uint64_t gs_bc[4]; uint64_t ri_cb[8]; } user_fpregs_struct_t; typedef struct { s390_regs prstatus; uint32_t system_call; } user_regs_struct_t; #define REG_RES(r) ((uint64_t)(r).prstatus.gprs[2]) #define REG_IP(r) ((uint64_t)(r).prstatus.psw.addr) #define SET_REG_IP(r, val) ((r).prstatus.psw.addr = (val)) #define REG_SP(r) ((uint64_t)(r).prstatus.gprs[15]) /* * We assume that REG_SYSCALL_NR() is only used for pie code where we * always use svc 0 with opcode in %r1. 
*/ #define REG_SYSCALL_NR(r) ((uint64_t)(r).prstatus.gprs[1]) #define user_regs_native(pregs) true #define __NR(syscall, compat) \ ({ \ (void)compat; \ __NR_##syscall; \ }) struct mmap_arg_struct { unsigned long addr; unsigned long len; unsigned long prot; unsigned long flags; unsigned long fd; unsigned long offset; }; #define __compel_arch_fetch_thread_area(tid, th) 0 #define compel_arch_fetch_thread_area(tctl) 0 #define compel_arch_get_tls_task(ctl, tls) #define compel_arch_get_tls_thread(tctl, tls) #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ crac-criu-1.5.0/compel/arch/s390/src/lib/include/uapi/asm/sigframe.h000066400000000000000000000035411471504326700247370ustar00rootroot00000000000000 #ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ #include #include #include #include // XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include #define RT_SIGFRAME_OFFSET(rt_sigframe) 0 /* * From /usr/include/asm/sigcontext.h * * Redefine _sigregs_ext to be able to compile on older systems */ #ifndef __NUM_VXRS_LOW typedef struct { __u32 u[4]; } __vector128; typedef struct { unsigned long long vxrs_low[16]; __vector128 vxrs_high[16]; unsigned char __reserved[128]; } _sigregs_ext; #endif /* * From /usr/include/uapi/asm/ucontext.h */ struct ucontext_extended { unsigned long uc_flags; ucontext_t *uc_link; stack_t uc_stack; _sigregs uc_mcontext; sigset_t uc_sigmask; /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. 
*/ unsigned char __unused[128 - sizeof(sigset_t)]; _sigregs_ext uc_mcontext_ext; }; /* * Signal stack frame for RT sigreturn */ struct rt_sigframe { uint8_t callee_used_stack[160]; uint8_t retcode[2]; siginfo_t info; struct ucontext_extended uc; }; /* * Do rt_sigreturn SVC */ /* clang-format off */ #define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ asm volatile( \ "lgr %%r15,%0\n" \ "lghi %%r1,173\n" \ "svc 0\n" \ : \ : "d" (new_sp) \ : "memory") /* clang-format on */ #define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) #define RT_SIGFRAME_REGIP(rt_sigframe) (rt_sigframe)->uc.uc_mcontext.regs.psw.addr #define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) #define rt_sigframe_erase_sigset(sigframe) memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) #define rt_sigframe_copy_sigset(sigframe, from) memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) #endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ crac-criu-1.5.0/compel/arch/s390/src/lib/infect.c000066400000000000000000000464721471504326700212560ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "uapi/compel/asm/infect-types.h" #include "errno.h" #include "log.h" #include "common/bug.h" #include "infect.h" #include "ptrace.h" #include "infect-priv.h" #define NT_PRFPREG 2 #define NT_S390_VXRS_LOW 0x309 #define NT_S390_VXRS_HIGH 0x30a #define NT_S390_GS_CB 0x30b #define NT_S390_GS_BC 0x30c #define NT_S390_RI_CB 0x30d /* * Print general purpose and access registers */ static void print_user_regs_struct(const char *msg, int pid, user_regs_struct_t *regs) { int i; pr_debug("%s: Registers for pid=%d\n", msg, pid); pr_debug("system_call %08lx\n", (unsigned long)regs->system_call); pr_debug(" psw %016lx %016lx\n", regs->prstatus.psw.mask, regs->prstatus.psw.addr); pr_debug(" orig_gpr2 %016lx\n", regs->prstatus.orig_gpr2); for (i = 0; i < 16; i++) pr_debug(" g%02d %016lx\n", i, regs->prstatus.gprs[i]); for (i = 0; i < 16; i++) pr_debug(" a%02d 
%08x\n", i, regs->prstatus.acrs[i]); } /* * Print vector registers */ static void print_vxrs(user_fpregs_struct_t *fpregs) { int i; if (!(fpregs->flags & USER_FPREGS_VXRS)) { pr_debug(" No VXRS\n"); return; } for (i = 0; i < 16; i++) pr_debug(" vx_low%02d %016lx\n", i, fpregs->vxrs_low[i]); for (i = 0; i < 16; i++) pr_debug(" vx_high%02d %016lx %016lx\n", i, fpregs->vxrs_high[i].part1, fpregs->vxrs_high[i].part2); } /* * Print guarded-storage control block */ static void print_gs_cb(user_fpregs_struct_t *fpregs) { int i; if (!(fpregs->flags & USER_GS_CB)) { pr_debug(" No GS_CB\n"); return; } for (i = 0; i < 4; i++) pr_debug(" gs_cb%02d %016lx\n", i, fpregs->gs_cb[i]); } /* * Print guarded-storage broadcast control block */ static void print_gs_bc(user_fpregs_struct_t *fpregs) { int i; if (!(fpregs->flags & USER_GS_BC)) { pr_debug(" No GS_BC\n"); return; } for (i = 0; i < 4; i++) pr_debug(" gs_bc%02d %016lx\n", i, fpregs->gs_bc[i]); } /* * Print runtime-instrumentation control block */ static void print_ri_cb(user_fpregs_struct_t *fpregs) { int i; if (!(fpregs->flags & USER_RI_CB)) { pr_debug(" No RI_CB\n"); return; } for (i = 0; i < 8; i++) pr_debug(" ri_cb%02d %016lx\n", i, fpregs->ri_cb[i]); } /* * Print FP registers, VX registers, guarded-storage, and * runtime-instrumentation */ static void print_user_fpregs_struct(const char *msg, int pid, user_fpregs_struct_t *fpregs) { int i; pr_debug("%s: FP registers for pid=%d\n", msg, pid); pr_debug(" fpc %08x\n", fpregs->prfpreg.fpc); for (i = 0; i < 16; i++) pr_debug(" f%02d %016lx\n", i, fpregs->prfpreg.fprs[i]); print_vxrs(fpregs); print_gs_cb(fpregs); print_gs_bc(fpregs); print_ri_cb(fpregs); } int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { _sigregs_ext *dst_ext = &sigframe->uc.uc_mcontext_ext; _sigregs *dst = &sigframe->uc.uc_mcontext; memcpy(dst->regs.gprs, regs->prstatus.gprs, sizeof(regs->prstatus.gprs)); memcpy(dst->regs.acrs, 
regs->prstatus.acrs, sizeof(regs->prstatus.acrs)); memcpy(&dst->regs.psw, ®s->prstatus.psw, sizeof(regs->prstatus.psw)); memcpy(&dst->fpregs.fpc, &fpregs->prfpreg.fpc, sizeof(fpregs->prfpreg.fpc)); memcpy(&dst->fpregs.fprs, &fpregs->prfpreg.fprs, sizeof(fpregs->prfpreg.fprs)); if (fpregs->flags & USER_FPREGS_VXRS) { memcpy(&dst_ext->vxrs_low, &fpregs->vxrs_low, sizeof(fpregs->vxrs_low)); memcpy(&dst_ext->vxrs_high, &fpregs->vxrs_high, sizeof(fpregs->vxrs_high)); } else { memset(&dst_ext->vxrs_low, 0, sizeof(dst_ext->vxrs_low)); memset(&dst_ext->vxrs_high, 0, sizeof(dst_ext->vxrs_high)); } return 0; } int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } /* * Rewind the psw for 'bytes' bytes */ static inline void rewind_psw(psw_t *psw, unsigned long bytes) { unsigned long mask; pr_debug("Rewind psw: %016lx bytes=%lu\n", psw->addr, bytes); mask = (psw->mask & PSW_MASK_EA) ? -1UL : (psw->mask & PSW_MASK_BA) ? (1UL << 31) - 1 : (1UL << 24) - 1; psw->addr = (psw->addr - bytes) & mask; } /* * Get vector registers */ int get_vx_regs(pid_t pid, user_fpregs_struct_t *fpregs) { struct iovec iov; fpregs->flags &= ~USER_FPREGS_VXRS; iov.iov_base = &fpregs->vxrs_low; iov.iov_len = sizeof(fpregs->vxrs_low); if (ptrace(PTRACE_GETREGSET, pid, NT_S390_VXRS_LOW, &iov) < 0) { /* * If the kernel does not support vector registers, we get * EINVAL. With kernel support and old hardware, we get ENODEV. 
*/ if (errno == EINVAL || errno == ENODEV) { memset(fpregs->vxrs_low, 0, sizeof(fpregs->vxrs_low)); memset(fpregs->vxrs_high, 0, sizeof(fpregs->vxrs_high)); pr_debug("VXRS registers not supported\n"); return 0; } pr_perror("Couldn't get VXRS_LOW"); return -1; } iov.iov_base = &fpregs->vxrs_high; iov.iov_len = sizeof(fpregs->vxrs_high); if (ptrace(PTRACE_GETREGSET, pid, NT_S390_VXRS_HIGH, &iov) < 0) { pr_perror("Couldn't get VXRS_HIGH"); return -1; } fpregs->flags |= USER_FPREGS_VXRS; return 0; } /* * Get guarded-storage control block */ int get_gs_cb(pid_t pid, user_fpregs_struct_t *fpregs) { struct iovec iov; fpregs->flags &= ~(USER_GS_CB | USER_GS_BC); iov.iov_base = &fpregs->gs_cb; iov.iov_len = sizeof(fpregs->gs_cb); if (ptrace(PTRACE_GETREGSET, pid, NT_S390_GS_CB, &iov) < 0) { switch (errno) { case EINVAL: case ENODEV: memset(&fpregs->gs_cb, 0, sizeof(fpregs->gs_cb)); memset(&fpregs->gs_bc, 0, sizeof(fpregs->gs_bc)); pr_debug("GS_CB not supported\n"); return 0; case ENODATA: pr_debug("GS_CB not set\n"); break; default: return -1; } } else { fpregs->flags |= USER_GS_CB; } iov.iov_base = &fpregs->gs_bc; iov.iov_len = sizeof(fpregs->gs_bc); if (ptrace(PTRACE_GETREGSET, pid, NT_S390_GS_BC, &iov) < 0) { if (errno == ENODATA) { pr_debug("GS_BC not set\n"); return 0; } pr_perror("Couldn't get GS_BC"); return -1; } fpregs->flags |= USER_GS_BC; return 0; } /* * Get runtime-instrumentation control block */ int get_ri_cb(pid_t pid, user_fpregs_struct_t *fpregs) { user_regs_struct_t regs; struct iovec iov; psw_t *psw; fpregs->flags &= ~(USER_RI_CB | USER_RI_ON); iov.iov_base = &fpregs->ri_cb; iov.iov_len = sizeof(fpregs->ri_cb); if (ptrace(PTRACE_GETREGSET, pid, NT_S390_RI_CB, &iov) < 0) { switch (errno) { case EINVAL: case ENODEV: memset(&fpregs->ri_cb, 0, sizeof(fpregs->ri_cb)); pr_debug("RI_CB not supported\n"); return 0; case ENODATA: pr_debug("RI_CB not set\n"); return 0; default: pr_perror("Couldn't get RI_CB"); return -1; } } fpregs->flags |= USER_RI_CB; /* Get PSW 
and check if runtime-instrumentation bit is enabled */ iov.iov_base = ®s.prstatus; iov.iov_len = sizeof(regs.prstatus); if (ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov) < 0) return -1; psw = ®s.prstatus.psw; if (psw->mask & PSW_MASK_RI) fpregs->flags |= USER_RI_ON; return 0; } /* * Disable runtime-instrumentation bit */ static int s390_disable_ri_bit(pid_t pid, user_regs_struct_t *regs) { struct iovec iov; psw_t *psw; iov.iov_base = ®s->prstatus; iov.iov_len = sizeof(regs->prstatus); psw = ®s->prstatus.psw; psw->mask &= ~PSW_MASK_RI; return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); } /* * Prepare task registers for restart */ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { user_fpregs_struct_t tmp, *fpregs = ext_regs ? ext_regs : &tmp; struct iovec iov; int rewind; print_user_regs_struct("compel_get_task_regs", pid, regs); memset(fpregs, 0, sizeof(*fpregs)); iov.iov_base = &fpregs->prfpreg; iov.iov_len = sizeof(fpregs->prfpreg); if (ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov) < 0) { pr_perror("Couldn't get floating-point registers"); return -1; } if (get_vx_regs(pid, fpregs)) { pr_perror("Couldn't get vector registers"); return -1; } if (get_gs_cb(pid, fpregs)) { pr_perror("Couldn't get guarded-storage"); return -1; } if (get_ri_cb(pid, fpregs)) { pr_perror("Couldn't get runtime-instrumentation"); return -1; } /* * If the runtime-instrumentation bit is set, we have to disable it * before we execute parasite code. Otherwise parasite operations * would be recorded. */ if (fpregs->flags & USER_RI_ON) s390_disable_ri_bit(pid, regs); print_user_fpregs_struct("compel_get_task_regs", pid, fpregs); /* Check for system call restarting. 
*/ if (regs->system_call) { rewind = regs->system_call >> 16; /* see arch/s390/kernel/signal.c: do_signal() */ switch ((long)regs->prstatus.gprs[2]) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: regs->prstatus.gprs[2] = regs->prstatus.orig_gpr2; rewind_psw(®s->prstatus.psw, rewind); pr_debug("New gpr2: %016lx\n", regs->prstatus.gprs[2]); break; case -ERESTART_RESTARTBLOCK: pr_warn("Will restore %d with interrupted system call\n", pid); regs->prstatus.gprs[2] = -EINTR; break; } } /* Call save_task_regs() */ return save(arg, regs, fpregs); } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) { struct iovec iov; int ret = 0; iov.iov_base = &ext_regs->prfpreg; iov.iov_len = sizeof(ext_regs->prfpreg); if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov) < 0) { pr_perror("Couldn't set floating-point registers"); ret = -1; } if (ext_regs->flags & USER_FPREGS_VXRS) { iov.iov_base = &ext_regs->vxrs_low; iov.iov_len = sizeof(ext_regs->vxrs_low); if (ptrace(PTRACE_SETREGSET, pid, NT_S390_VXRS_LOW, &iov) < 0) { pr_perror("Couldn't set VXRS_LOW"); ret = -1; } iov.iov_base = &ext_regs->vxrs_high; iov.iov_len = sizeof(ext_regs->vxrs_high); if (ptrace(PTRACE_SETREGSET, pid, NT_S390_VXRS_HIGH, &iov) < 0) { pr_perror("Couldn't set VXRS_HIGH"); ret = -1; } } if (ext_regs->flags & USER_GS_CB) { iov.iov_base = &ext_regs->gs_cb; iov.iov_len = sizeof(ext_regs->gs_cb); if (ptrace(PTRACE_SETREGSET, pid, NT_S390_GS_CB, &iov) < 0) { pr_perror("Couldn't set GS_CB"); ret = -1; } iov.iov_base = &ext_regs->gs_bc; iov.iov_len = sizeof(ext_regs->gs_bc); if (ptrace(PTRACE_SETREGSET, pid, NT_S390_GS_BC, &iov) < 0) { pr_perror("Couldn't set GS_BC"); ret = -1; } } if (ext_regs->flags & USER_RI_CB) { iov.iov_base = &ext_regs->ri_cb; iov.iov_len = sizeof(ext_regs->ri_cb); if (ptrace(PTRACE_SETREGSET, pid, NT_S390_RI_CB, &iov) < 0) { pr_perror("Couldn't set RI_CB"); ret = -1; } } return ret; } /* * Injected syscall instruction */ const char code_syscall[] = { 0x0a, 
0x00, /* sc 0 */ 0x00, 0x01, /* S390_BREAKPOINT_U16 */ 0x00, 0x01, /* S390_BREAKPOINT_U16 */ 0x00, 0x01, /* S390_BREAKPOINT_U16 */ }; static inline void __check_code_syscall(void) { BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE); BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); } /* * Issue s390 system call */ int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { user_regs_struct_t regs = ctl->orig.regs; int err; /* Load syscall number into %r1 */ regs.prstatus.gprs[1] = (unsigned long)nr; /* Load parameter registers %r2-%r7 */ regs.prstatus.gprs[2] = arg1; regs.prstatus.gprs[3] = arg2; regs.prstatus.gprs[4] = arg3; regs.prstatus.gprs[5] = arg4; regs.prstatus.gprs[6] = arg5; regs.prstatus.gprs[7] = arg6; err = compel_execute_syscall(ctl, ®s, (char *)code_syscall); /* Return code from system is in %r2 */ if (ret) *ret = regs.prstatus.gprs[2]; return err; } /* * Issue s390 mmap call */ void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { void *where = (void *)ctl->ictx.syscall_ip + BUILTIN_SYSCALL_SIZE; struct mmap_arg_struct arg_struct; pid_t pid = ctl->rpid; long map = 0; int err; /* Setup s390 mmap data */ arg_struct.addr = (unsigned long)addr; arg_struct.len = length; arg_struct.prot = prot; arg_struct.flags = flags; arg_struct.fd = fd; arg_struct.offset = offset; /* Move args to process */ if (ptrace_swap_area(pid, where, &arg_struct, sizeof(arg_struct))) { pr_err("Can't inject memfd args (pid: %d)\n", pid); return NULL; } /* Do syscall */ err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)where, 0, 0, 0, 0, 0); if (err < 0 || (long)map < 0) map = 0; /* Restore data */ if (ptrace_poke_area(pid, &arg_struct, where, sizeof(arg_struct))) { pr_err("Can't restore mmap args (pid: %d)\n", pid); if (map != 0) { err = compel_syscall(ctl, __NR_munmap, NULL, map, 
length, 0, 0, 0, 0); if (err) pr_err("Can't munmap %d\n", err); map = 0; } } return (void *)map; } /* * Setup registers for parasite call */ void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { regs->prstatus.psw.addr = new_ip; if (!stack) return; regs->prstatus.gprs[15] = ((unsigned long)stack) - STACK_FRAME_OVERHEAD; } /* * Check if we have all kernel and CRIU features to dump the task */ bool arch_can_dump_task(struct parasite_ctl *ctl) { user_fpregs_struct_t fpregs; user_regs_struct_t regs; pid_t pid = ctl->rpid; char str[8]; psw_t *psw; if (ptrace_get_regs(pid, ®s)) return false; psw = ®s.prstatus.psw; /* Check if the kernel supports RI ptrace interface */ if (psw->mask & PSW_MASK_RI) { if (get_ri_cb(pid, &fpregs) < 0) { pr_perror("Can't dump process with RI bit active"); return false; } } /* We don't support 24 and 31 bit mode - only 64 bit */ if (psw->mask & PSW_MASK_EA) { if (psw->mask & PSW_MASK_BA) return true; else sprintf(str, "??"); } else { if (psw->mask & PSW_MASK_BA) sprintf(str, "31"); else sprintf(str, "24"); } pr_err("Pid %d is %s bit: Only 64 bit tasks are supported\n", pid, str); return false; } /* * Return current alternate signal stack */ int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) { long ret; int err; err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->uc.uc_stack, 0, 0, 0, 0); return err ? 
err : ret; } /* * Find last mapped address of current process */ static unsigned long max_mapped_addr(void) { unsigned long addr_end, addr_max = 0; char line[128]; FILE *fp; fp = fopen("/proc/self/maps", "r"); if (!fp) goto out; /* Parse lines like: 3fff415f000-3fff4180000 rw-p 00000000 00:00 0 */ while (fgets(line, sizeof(line), fp)) { char *ptr; /* First skip start address */ strtoul(&line[0], &ptr, 16); addr_end = strtoul(ptr + 1, NULL, 16); addr_max = max(addr_max, addr_end); } fclose(fp); out: return addr_max - 1; } /* * Kernel task size level * * We have (dynamic) 4 level page tables for 64 bit since linux 2.6.25: * * 5a216a2083 ("[S390] Add four level page tables for CONFIG_64BIT=y.") * 6252d702c5 ("[S390] dynamic page tables.") * * The code below is already prepared for future (dynamic) 5 level page tables. * * Besides that there is one problematic kernel bug that has been fixed for * linux 4.11 by the following commit: * * ee71d16d22 ("s390/mm: make TASK_SIZE independent from the number * of page table levels") * * A 64 bit process on s390x always starts with 3 levels and upgrades to 4 * levels for mmap(> 4 TB) and to 5 levels for mmap(> 16 EB). * * Unfortunately before fix ee71d16d22 for a 3 level process munmap() * and mremap() fail for addresses > 4 TB. CRIU uses the task size, * to unmap() all memory from a starting point to task size to get rid of * unwanted mappings. CRIU uses mremap() to establish the final mappings * which also fails if we want to restore mappings > 4 TB and the initial * restore process still runs with 3 levels. * * To support the current CRIU design on s390 we return task size = 4 TB when * a kernel without fix ee71d16d22 is detected. In this case we can dump at * least processes with < 4 TB which is the most likely case anyway. * * For kernels with fix ee71d16d22 we are fully functional. 
*/ enum kernel_ts_level { /* Kernel with 4 level page tables without fix ee71d16d22 */ KERNEL_TS_LEVEL_4_FIX_NO, /* Kernel with 4 level page tables with fix ee71d16d22 */ KERNEL_TS_LEVEL_4_FIX_YES, /* Kernel with 4 level page tables with or without fix ee71d16d22 */ KERNEL_TS_LEVEL_4_FIX_UNKN, /* Kernel with 5 level page tables */ KERNEL_TS_LEVEL_5, }; /* See arch/s390/include/asm/processor.h */ #define TASK_SIZE_LEVEL_3 0x40000000000UL /* 4 TB */ #define TASK_SIZE_LEVEL_4 0x20000000000000UL /* 8 PB */ #define TASK_SIZE_LEVEL_5 0xffffffffffffefffUL /* 16 EB - 0x1000 */ /* * Return detected kernel version regarding task size level * * We use unmap() to probe the maximum possible page table level of kernel */ static enum kernel_ts_level get_kernel_ts_level(void) { unsigned long criu_end_addr = max_mapped_addr(); /* Check for 5 levels */ if (criu_end_addr >= TASK_SIZE_LEVEL_4) return KERNEL_TS_LEVEL_5; else if (munmap((void *)TASK_SIZE_LEVEL_4, 0x1000) == 0) return KERNEL_TS_LEVEL_5; if (criu_end_addr < TASK_SIZE_LEVEL_3) { /* Check for 4 level kernel with fix */ if (munmap((void *)TASK_SIZE_LEVEL_3, 0x1000) == 0) return KERNEL_TS_LEVEL_4_FIX_YES; else return KERNEL_TS_LEVEL_4_FIX_NO; } /* We can't find out if kernel has the fix */ return KERNEL_TS_LEVEL_4_FIX_UNKN; } /* * Log detected level */ static void pr_levels(const char *str) { pr_debug("Max user page table levels (task size): %s\n", str); } /* * Return last address (+1) of biggest possible user address space for * current kernel */ unsigned long compel_task_size(void) { switch (get_kernel_ts_level()) { case KERNEL_TS_LEVEL_4_FIX_NO: pr_levels("KERNEL_TS_LEVEL_4_FIX_NO"); return TASK_SIZE_LEVEL_3; case KERNEL_TS_LEVEL_4_FIX_YES: pr_levels("KERNEL_TS_LEVEL_4_FIX_YES"); return TASK_SIZE_LEVEL_4; case KERNEL_TS_LEVEL_4_FIX_UNKN: pr_levels("KERNEL_TS_LEVEL_4_FIX_UNKN"); return TASK_SIZE_LEVEL_3; default: /* KERNEL_TS_LEVEL_5 */ pr_levels("KERNEL_TS_LEVEL_5"); return TASK_SIZE_LEVEL_5; } } /* * Get task registers 
(overwrites weak function) */ int ptrace_get_regs(int pid, user_regs_struct_t *regs) { struct iovec iov; int rc; pr_debug("ptrace_get_regs: pid=%d\n", pid); iov.iov_base = ®s->prstatus; iov.iov_len = sizeof(regs->prstatus); rc = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); if (rc != 0) return rc; iov.iov_base = ®s->system_call; iov.iov_len = sizeof(regs->system_call); return ptrace(PTRACE_GETREGSET, pid, NT_S390_SYSTEM_CALL, &iov); } /* * Set task registers (overwrites weak function) */ int ptrace_set_regs(int pid, user_regs_struct_t *regs) { uint32_t system_call = 0; struct iovec iov; int rc; pr_debug("ptrace_set_regs: pid=%d\n", pid); iov.iov_base = ®s->prstatus; iov.iov_len = sizeof(regs->prstatus); rc = ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); if (rc) return rc; /* * If we attached to an inferior that is sleeping in a restarting * system call like futex_wait(), we have to reset the system_call * to 0. Otherwise the kernel would try to finish the interrupted * system call after PTRACE_CONT and we could not run the * parasite code. 
*/ iov.iov_base = &system_call; iov.iov_len = sizeof(system_call); return ptrace(PTRACE_SETREGSET, pid, NT_S390_SYSTEM_CALL, &iov); } crac-criu-1.5.0/compel/arch/x86/000077500000000000000000000000001471504326700162175ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/x86/plugins/000077500000000000000000000000001471504326700177005ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/x86/plugins/include/000077500000000000000000000000001471504326700213235ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/x86/plugins/include/asm/000077500000000000000000000000001471504326700221035ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/x86/plugins/include/asm/prologue.h000066400000000000000000000012561471504326700241140ustar00rootroot00000000000000#ifndef __ASM_PROLOGUE_H__ #define __ASM_PROLOGUE_H__ #ifndef __ASSEMBLY__ #include #include #include #include #define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) typedef struct prologue_init_args { struct sockaddr_un ctl_sock_addr; unsigned int ctl_sock_addr_len; unsigned int arg_s; void *arg_p; void *sigframe; } prologue_init_args_t; #endif /* __ASSEMBLY__ */ /* * Reserve enough space for sigframe. * * FIXME It is rather should be taken from sigframe header. 
*/ #define PROLOGUE_SGFRAME_SIZE 4096 #define PROLOGUE_INIT_ARGS_SIZE 1024 #endif /* __ASM_PROLOGUE_H__ */ crac-criu-1.5.0/compel/arch/x86/plugins/include/asm/syscall-types.h000066400000000000000000000031001471504326700250620ustar00rootroot00000000000000#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ #define COMPEL_ARCH_SYSCALL_TYPES_H__ /* Types for sigaction, sigprocmask syscalls */ typedef void rt_signalfn_t(int, siginfo_t *, void *); typedef rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; #define SA_RESTORER 0x04000000 #define _KNSIG 64 #define _NSIG_BPW 64 #define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) /* * Note: as k_rtsigset_t is the same size for 32-bit and 64-bit, * sig defined as uint64_t rather than (unsigned long) - for the * purpose if we ever going to support native 32-bit compilation. */ typedef struct { uint64_t sig[_KNSIG_WORDS]; } k_rtsigset_t; typedef struct { rt_sighandler_t rt_sa_handler; unsigned long rt_sa_flags; rt_sigrestore_t rt_sa_restorer; k_rtsigset_t rt_sa_mask; } rt_sigaction_t; /* * Note: there is unaligned access on x86_64 and it's fine. * However, when porting this code -- keep in mind about possible issues * with unaligned rt_sa_mask. 
*/ typedef struct __attribute__((packed)) { unsigned int rt_sa_handler; unsigned int rt_sa_flags; unsigned int rt_sa_restorer; k_rtsigset_t rt_sa_mask; } rt_sigaction_t_compat; /* Types for set_thread_area, get_thread_area syscalls */ typedef struct { unsigned int entry_number; unsigned int base_addr; unsigned int limit; unsigned int seg_32bit : 1; unsigned int contents : 2; unsigned int read_exec_only : 1; unsigned int limit_in_pages : 1; unsigned int seg_not_present : 1; unsigned int usable : 1; unsigned int lm : 1; } user_desc_t; #endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ crac-criu-1.5.0/compel/arch/x86/plugins/include/features.h000066400000000000000000000002021471504326700233040ustar00rootroot00000000000000#ifndef __COMPEL_ARCH_FEATURES_H #define __COMPEL_ARCH_FEATURES_H #define ARCH_HAS_MEMCPY #endif /* __COMPEL_ARCH_FEATURES_H */ crac-criu-1.5.0/compel/arch/x86/plugins/std/000077500000000000000000000000001471504326700204725ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/x86/plugins/std/memcpy.S000066400000000000000000000010061471504326700221050ustar00rootroot00000000000000#include "common/asm/linkage.h" /* The following code is taken from Linux kernel (arch/x86/lib/memcpy_64.S). * There are 3 implementations in there, we use the one that relies on * X86_FEATURE_REP_GOOD ("rep microcode works well"). */ /* * memcpy - Copy a memory block. * * Input: * rdi destination * rsi source * rdx count * * Output: * rax original destination */ ENTRY(memcpy) movq %rdi, %rax movq %rdx, %rcx shrq $3, %rcx andl $7, %edx rep movsq movl %edx, %ecx rep movsb ret END(memcpy) crac-criu-1.5.0/compel/arch/x86/plugins/std/parasite-head.S000066400000000000000000000016101471504326700233230ustar00rootroot00000000000000#include "common/asm/linkage.h" .section .head.text, "ax" #ifndef CONFIG_X86_64 # error 64-bit parasite should compile with CONFIG_X86_64 #endif #ifdef CONFIG_COMPAT .code32 ENTRY(__export_parasite_head_start_compat) /* A long jump to 64-bit parasite. 
*/ jmp $__USER_CS,$1f 1: .code64 call parasite_service pushq $__USER32_CS xor %r11, %r11 movl $2f, %r11d pushq %r11 lretq 2: .code32 /* * parasite_service() can run commands in non-daemon mode * with parasite_trap_cmd(): it waits that after return there * is a software break. * compel_run_in_thread() uses this and after hitting the break, * it restores register set - that's the reason, why we should * stop in 32-bit mode for compat tasks here. */ int $0x03 END(__export_parasite_head_start_compat) .code64 #endif ENTRY(__export_parasite_head_start) call parasite_service int $0x03 END(__export_parasite_head_start) crac-criu-1.5.0/compel/arch/x86/plugins/std/syscalls/000077500000000000000000000000001471504326700223275ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/x86/plugins/std/syscalls/Makefile.syscalls000066400000000000000000000124441471504326700256300ustar00rootroot00000000000000std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/syscalls-64.o sys-proto-types := $(obj)/include/uapi/std/syscall-types.h sys-proto-generic := $(obj)/include/uapi/std/syscall.h sys-codes-generic := $(obj)/include/uapi/std/syscall-codes.h sys-codes = $(obj)/include/uapi/std/syscall-codes-$(1).h sys-proto = $(obj)/include/uapi/std/syscall-$(1).h sys-def = $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_$(1).tbl sys-asm = $(PLUGIN_ARCH_DIR)/std/syscalls-$(1).S sys-asm-common-name = std/syscalls/syscall-common-x86-$(1).S sys-asm-common = $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl-$(1).c sys-bits := 64 AV := $$$$ define gen-rule-sys-codes $(sys-codes): $(sys-def) $(sys-proto-types) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) echo "#ifndef ASM_SYSCALL_CODES_H_$(1)__" >> $$@ $(Q) echo "#define ASM_SYSCALL_CODES_H_$(1)__" >> $$@ $(Q) cat $$< | awk '/^__NR/{SYSN=$(AV)1; \ sub("^__NR", "SYS", SYSN); \ print "\n#ifndef ", $(AV)1; \ print "#define", $(AV)1, $(AV)2; \ 
print "#endif"; \ print "\n#ifndef ", SYSN; \ print "#define ", SYSN, $(AV)1; \ print "#endif";}' >> $$@ $(Q) echo "#endif /* ASM_SYSCALL_CODES_H_$(1)__ */" >> $$@ endef define gen-rule-sys-proto $(sys-proto): $(sys-def) $(sys-proto-types) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) echo "#ifndef ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ $(Q) echo "#define ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ $(Q) echo "/* musl defines loff_t as off_t */" >> $$@ $(Q) echo '#ifndef loff_t' >> $$@ $(Q) echo '#define loff_t off_t' >> $$@ $(Q) echo '#endif' >> $$@ $(Q) echo '#include ' >> $$@ $(Q) echo '#include ' >> $$@ ifeq ($(1),32) $(Q) echo '#include "asm/syscall32.h"' >> $$@ endif $(Q) cat $$< | awk '/^__NR/{print "extern long", $(AV)3, \ substr($(AV)0, index($(AV)0,$(AV)4)), ";"}' >> $$@ $(Q) echo "#endif /* ASM_SYSCALL_PROTO_H_$(1)__ */" >> $$@ endef define gen-rule-sys-asm $(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(sys-proto-types) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) echo '#include ' >> $$@ $(Q) echo '#include "$(sys-asm-common-name)"' >> $$@ $(Q) cat $$< | awk '/^__NR/{print "SYSCALL(", $(AV)3, ",", $(AV)2, ")"}' >> $$@ endef define gen-rule-sys-exec-tbl $(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(sys-proto-types) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) cat $$< | awk '/^__NR/{print \ "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ endef $(sys-codes-generic): $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_32.tbl $(sys-proto-types) $(call msg-gen, $@) $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) echo "/* musl defines loff_t as off_t */" >> $@ $(Q) echo '#ifndef loff_t' >> $@ $(Q) echo '#define loff_t off_t' >> $@ $(Q) echo '#endif' >> $@ $(Q) echo '#include ' >> $@ $(Q) cat $< | awk '/^__NR/{NR32=$$1; \ 
sub("^__NR", "__NR32", NR32); \ print "\n#ifndef ", NR32; \ print "#define ", NR32, $$2; \ print "#endif";}' >> $@ $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ mrproper-y += $(sys-codes-generic) $(sys-proto-generic): $(strip $(call map,sys-proto,$(sys-bits))) $(sys-proto-types) $(call msg-gen, $@) $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ $(Q) echo "" >> $@ $(Q) echo "#ifdef CONFIG_X86_32" >> $@ $(Q) echo '#include ' >> $@ $(Q) echo "#else" >> $@ $(Q) echo '#include ' >> $@ $(Q) echo "#endif /* CONFIG_X86_32 */" >> $@ $(Q) echo "" >> $@ $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ mrproper-y += $(sys-proto-generic) define gen-rule-sys-exec-tbl $(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) cat $$< | awk '/^__NR/{print \ "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ endef $(eval $(call map,gen-rule-sys-codes,$(sys-bits))) $(eval $(call map,gen-rule-sys-proto,$(sys-bits))) $(eval $(call map,gen-rule-sys-asm,$(sys-bits))) $(eval $(call map,gen-rule-sys-exec-tbl,$(sys-bits))) $(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(call msg-gen, $@) $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) std-headers-deps += $(call sys-codes,$(sys-bits)) std-headers-deps += $(call sys-proto,$(sys-bits)) std-headers-deps += $(call sys-asm,$(sys-bits)) std-headers-deps += $(call sys-exec-tbl,$(sys-bits)) std-headers-deps += $(sys-codes-generic) std-headers-deps += $(sys-proto-generic) std-headers-deps += $(sys-asm-types) mrproper-y += $(std-headers-deps) crac-criu-1.5.0/compel/arch/x86/plugins/std/syscalls/syscall-common-x86-32.S000066400000000000000000000010731471504326700262610ustar00rootroot00000000000000#include "common/asm/linkage.h" #define SYSCALL(name, opcode) \ 
ENTRY(name); \ movl $opcode, %eax; \ jmp __syscall_common; \ END(name) ENTRY(__syscall_common) pushl %ebx pushl %esi pushl %edi pushl %ebp #define __arg(n) (4 * (n) + 20)(%esp) movl __arg(0),%ebx movl __arg(1),%ecx movl __arg(2),%edx movl __arg(3),%esi movl __arg(4),%edi movl __arg(5),%ebp #undef __arg int $0x80 popl %ebp popl %edi popl %esi popl %ebx ret END(__syscall_common) ENTRY(__cr_restore_rt) movl $__NR_rt_sigreturn, %eax jmp __syscall_common END(__cr_restore_rt) crac-criu-1.5.0/compel/arch/x86/plugins/std/syscalls/syscall-common-x86-64.S000066400000000000000000000005051471504326700262650ustar00rootroot00000000000000#include "common/asm/linkage.h" #define SYSCALL(name, opcode) \ ENTRY(name); \ movl $opcode, %eax; \ jmp __syscall_common; \ END(name) .text .align 4 ENTRY(__syscall_common) movq %rcx, %r10 syscall ret END(__syscall_common) ENTRY(__cr_restore_rt) movq $__NR_rt_sigreturn, %rax syscall END(__cr_restore_rt) crac-criu-1.5.0/compel/arch/x86/plugins/std/syscalls/syscall32.c000066400000000000000000000060511471504326700243140ustar00rootroot00000000000000#include "asm/types.h" #include "syscall-32.h" #define SYS_SOCKET 1 /* sys_socket(2) */ #define SYS_BIND 2 /* sys_bind(2) */ #define SYS_CONNECT 3 /* sys_connect(2) */ #define SYS_SENDTO 11 /* sys_sendto(2) */ #define SYS_RECVFROM 12 /* sys_recvfrom(2) */ #define SYS_SHUTDOWN 13 /* sys_shutdown(2) */ #define SYS_SETSOCKOPT 14 /* sys_setsockopt(2) */ #define SYS_GETSOCKOPT 15 /* sys_getsockopt(2) */ #define SYS_SENDMSG 16 /* sys_sendmsg(2) */ #define SYS_RECVMSG 17 /* sys_recvmsg(2) */ long sys_socket(int domain, int type, int protocol) { uint32_t a[] = { (uint32_t)domain, (uint32_t)type, (uint32_t)protocol }; return sys_socketcall(SYS_SOCKET, (unsigned long *)a); } long sys_connect(int sockfd, struct sockaddr *addr, int addrlen) { uint32_t a[] = { (uint32_t)sockfd, (uint32_t)addr, (uint32_t)addrlen }; return sys_socketcall(SYS_CONNECT, (unsigned long *)a); } long sys_sendto(int sockfd, void *buff, size_t 
len, unsigned int flags, struct sockaddr *addr, int addr_len) { uint32_t a[] = { (uint32_t)sockfd, (uint32_t)buff, (uint32_t)len, (uint32_t)flags, (uint32_t)addr, (uint32_t)addr_len }; return sys_socketcall(SYS_SENDTO, (unsigned long *)a); } long sys_recvfrom(int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) { uint32_t a[] = { (uint32_t)sockfd, (uint32_t)ubuf, (uint32_t)size, (uint32_t)flags, (uint32_t)addr, (uint32_t)addr_len }; return sys_socketcall(SYS_RECVFROM, (unsigned long *)a); } long sys_sendmsg(int sockfd, const struct msghdr *msg, int flags) { uint32_t a[] = { (uint32_t)sockfd, (uint32_t)msg, (uint32_t)flags }; return sys_socketcall(SYS_SENDMSG, (unsigned long *)a); } long sys_recvmsg(int sockfd, struct msghdr *msg, int flags) { uint32_t a[] = { (uint32_t)sockfd, (uint32_t)msg, (uint32_t)flags }; return sys_socketcall(SYS_RECVMSG, (unsigned long *)a); } long sys_shutdown(int sockfd, int how) { uint32_t a[] = { (uint32_t)sockfd, (uint32_t)how }; return sys_socketcall(SYS_SHUTDOWN, (unsigned long *)a); } long sys_bind(int sockfd, const struct sockaddr *addr, int addrlen) { uint32_t a[] = { (uint32_t)sockfd, (uint32_t)addr, (uint32_t)addrlen }; return sys_socketcall(SYS_BIND, (unsigned long *)a); } long sys_setsockopt(int sockfd, int level, int optname, const void *optval, unsigned int optlen) { uint32_t a[] = { (uint32_t)sockfd, (uint32_t)level, (uint32_t)optname, (uint32_t)optval, (uint32_t)optlen }; return sys_socketcall(SYS_SETSOCKOPT, (unsigned long *)a); } long sys_getsockopt(int sockfd, int level, int optname, const void *optval, unsigned int *optlen) { uint32_t a[] = { (uint32_t)sockfd, (uint32_t)level, (uint32_t)optname, (uint32_t)optval, (uint32_t)optlen }; return sys_socketcall(SYS_GETSOCKOPT, (unsigned long *)a); } #define SHMAT 21 long sys_shmat(int shmid, void *shmaddr, int shmflag) { return sys_ipc(SHMAT, shmid, shmflag, 0, shmaddr, 0); } long sys_pread(unsigned int fd, char *ubuf, uint32_t count, 
uint64_t pos) { return sys_pread64(fd, ubuf, count, (uint32_t)(pos & 0xffffffffu), (uint32_t)(pos >> 32)); } crac-criu-1.5.0/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl000066400000000000000000000210431471504326700250100ustar00rootroot00000000000000# # System calls table, please make sure the table consist only the syscalls # really used somewhere in project. # # code name arguments # ------------------------------------------------------------------------------------------------------------------------------------------------------------- __NR_restart_syscall 0 sys_restart_syscall (void) __NR_exit 1 sys_exit (unsigned long error_code) __NR_read 3 sys_read (int fd, void *buf, unsigned long count) __NR_write 4 sys_write (int fd, const void *buf, unsigned long count) __NR_open 5 sys_open (const char *filename, int flags, unsigned int mode) __NR_close 6 sys_close (int fd) __NR_unlink 10 sys_unlink (char *pathname) __NR_lseek 19 sys_lseek (int fd, int32_t offset, unsigned int origin) __NR_getpid 20 sys_getpid (void) __NR_mount 21 sys_mount (const char *dev_name, const char *dir_name, const char *type, unsigned long flags, const void *data) __NR_ptrace 26 sys_ptrace (long request, pid_t pid, void *addr, void *data) __NR_kill 37 sys_kill (long pid, int sig) __NR_mkdir 39 sys_mkdir (const char *name, int mode) __NR_rmdir 40 sys_rmdir (const char *name) __NR_brk 45 sys_brk (void *addr) __NR_umount2 52 sys_umount2 (char *name, int flags) __NR_ioctl 54 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) __NR_fcntl 55 sys_fcntl (unsigned int fd, unsigned int cmd, unsigned long arg) __NR_umask 60 sys_umask (int mask) __NR_setrlimit 75 sys_setrlimit (unsigned int resource, struct krlimit *rlim) __NR_gettimeofday 78 sys_gettimeofday (struct timeval *tv, struct timezone *tz) __NR_munmap 91 sys_munmap (void *addr, unsigned long len) __NR_setpriority 97 sys_setpriority (int which, int who, int nice) __NR_socketcall 102 sys_socketcall (int call, unsigned long 
*args) __NR_setitimer 104 sys_setitimer (int which, struct itimerval *in, struct itimerval *out) __NR_getitimer 105 sys_getitimer (int which, struct itimerval *it) __NR_wait4 114 sys_wait4 (pid_t pid, int *stat_addr, int options, struct rusage *ru) __NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, void *ptr, long fifth) __NR_clone 120 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) __NR_mprotect 125 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) __NR_getpgid 132 sys_getpgid (pid_t pid) __NR_personality 136 sys_personality (unsigned int personality) __NR_flock 143 sys_flock (int fd, unsigned long cmd) __NR_getsid 147 sys_getsid (void) __NR_sched_setscheduler 156 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) __NR_nanosleep 162 sys_nanosleep (struct timespec *rqtp, struct timespec *rmtp) __NR_mremap 163 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) __NR_prctl 172 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) __NR_rt_sigreturn 173 sys_rt_sigreturn (void) __NR_rt_sigaction 174 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) __NR_rt_sigprocmask 175 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *oset, size_t sigsetsize) __NR_rt_sigqueueinfo 178 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *uinfo) __NR_pread64 180 sys_pread64 (unsigned int fd, char *ubuf, uint32_t count, uint32_t poslo, uint32_t poshi) __NR_capget 184 sys_capget (struct cap_header *h, struct cap_data *d) __NR_capset 185 sys_capset (struct cap_header *h, struct cap_data *d) __NR_sigaltstack 186 sys_sigaltstack (const void *uss_ptr, void *uoss_ptr) __NR_mmap2 192 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, 
unsigned long pgoff) __NR_getgroups32 205 sys_getgroups (int gsize, unsigned int *groups) __NR_setgroups32 206 sys_setgroups (int gsize, unsigned int *groups) __NR_setresuid32 208 sys_setresuid (int uid, int euid, int suid) __NR_getresuid32 209 sys_getresuid (int *uid, int *euid, int *suid) __NR_setresgid32 210 sys_setresgid (int gid, int egid, int sgid) __NR_getresgid32 211 sys_getresgid (int *gid, int *egid, int *sgid) __NR_setfsuid32 215 sys_setfsuid (int fsuid) __NR_setfsgid32 216 sys_setfsgid (int fsgid) __NR_mincore 218 sys_mincore (void *addr, unsigned long size, unsigned char *vec) __NR_madvise 219 sys_madvise (unsigned long start, size_t len, int behavior) __NR_gettid 224 sys_gettid (void) __NR_futex 240 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) __NR_set_thread_area 243 sys_set_thread_area (user_desc_t *info) __NR_get_thread_area 244 sys_get_thread_area (user_desc_t *info) __NR_io_setup 245 sys_io_setup (unsigned nr_reqs, aio_context_t *ctx32p) __NR_io_getevents 247 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout) __NR_io_submit 248 sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp) __NR_exit_group 252 sys_exit_group (int error_code) __NR_set_tid_address 258 sys_set_tid_address (int *tid_addr) __NR_timer_create 259 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) __NR_timer_settime 260 sys_timer_settime (kernel_timer_t timer_id, int flags, struct itimerspec *new, struct itimerspec *old) __NR_timer_gettime 261 sys_timer_gettime (int timer_id, struct itimerspec *setting) __NR_timer_getoverrun 262 sys_timer_getoverrun (int timer_id) __NR_timer_delete 263 sys_timer_delete (kernel_timer_t timer_id) __NR_clock_gettime 265 sys_clock_gettime (int which_clock, struct timespec *tp) __NR_waitid 284 sys_waitid (int which, pid_t pid, struct siginfo *infop, int 
options, struct rusage *ru) __NR_openat 295 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_readlinkat 305 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) __NR_set_robust_list 311 sys_set_robust_list (struct robust_list_head *head, size_t len) __NR_get_robust_list 312 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) __NR_vmsplice 316 sys_vmsplice (int fd, const struct iovec *iov, unsigned int nr_segs, unsigned int flags) __NR_signalfd 321 sys_signalfd (int ufd, const k_rtsigset_t *sigmask, size_t sigsetsize) __NR_fallocate 324 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) __NR_timerfd_settime 325 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) __NR_preadv 333 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) __NR_rt_tgsigqueueinfo 335 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *uinfo) __NR_fanotify_init 338 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) __NR_fanotify_mark 339 sys_fanotify_mark (int fanotify_fd, unsigned int flag, uint32_t mask, int dfd, const char *pathname) __NR_open_by_handle_at 342 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) __NR_setns 346 sys_setns (int fd, int nstype) __NR_kcmp 349 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) __NR_seccomp 354 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) __NR_memfd_create 356 sys_memfd_create (const char *name, unsigned int flags) __NR_userfaultfd 374 sys_userfaultfd (int flags) __NR_ppoll 309 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) __NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) __NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, 
const char *to_pathname, int flags) __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 386 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) __NR_membarrier 375 sys_membarrier (int cmd, unsigned int flags, int cpu_id) crac-criu-1.5.0/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl000066400000000000000000000230111471504326700250120ustar00rootroot00000000000000# # System calls table, please make sure the table consist only the syscalls # really used somewhere in project. 
# # __NR_name code name arguments # ------------------------------------------------------------------------------------------------------------------------------------------------------------- __NR_read 0 sys_read (int fd, void *buf, unsigned long count) __NR_write 1 sys_write (int fd, const void *buf, unsigned long count) __NR_open 2 sys_open (const char *filename, unsigned long flags, unsigned long mode) __NR_close 3 sys_close (int fd) __NR_lseek 8 sys_lseek (int fd, unsigned long offset, unsigned long origin) __NR_mmap 9 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) __NR_mprotect 10 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) __NR_munmap 11 sys_munmap (void *addr, unsigned long len) __NR_brk 12 sys_brk (void *addr) __NR_rt_sigaction 13 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) __NR_rt_sigprocmask 14 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) __NR_rt_sigreturn 15 sys_rt_sigreturn (void) __NR_ioctl 16 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) __NR_pread64 17 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) __NR_mremap 25 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) __NR_mincore 27 sys_mincore (void *addr, unsigned long size, unsigned char *vec) __NR_madvise 28 sys_madvise (unsigned long start, size_t len, int behavior) __NR_shmat 30 sys_shmat (int shmid, void *shmaddr, int shmflag) __NR_dup2 33 sys_dup2 (int oldfd, int newfd) __NR_nanosleep 35 sys_nanosleep (struct timespec *req, struct timespec *rem) __NR_getitimer 36 sys_getitimer (int which, const struct itimerval *val) __NR_setitimer 38 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) __NR_getpid 39 sys_getpid (void) __NR_socket 41 sys_socket (int domain, int type, int 
protocol) __NR_connect 42 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) __NR_sendto 44 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) __NR_recvfrom 45 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) __NR_sendmsg 46 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) __NR_recvmsg 47 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) __NR_shutdown 48 sys_shutdown (int sockfd, int how) __NR_bind 49 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) __NR_setsockopt 54 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) __NR_getsockopt 55 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) __NR_clone 56 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, void *child_tid, unsigned long new_tls) __NR_exit 60 sys_exit (unsigned long error_code) __NR_wait4 61 sys_wait4 (int pid, int *status, int options, struct rusage *ru) __NR_kill 62 sys_kill (long pid, int sig) __NR_fcntl 72 sys_fcntl (int fd, int type, long arg) __NR_flock 73 sys_flock (int fd, unsigned long cmd) __NR_mkdir 83 sys_mkdir (const char *name, int mode) __NR_rmdir 84 sys_rmdir (const char *name) __NR_unlink 87 sys_unlink (char *pathname) __NR_umask 95 sys_umask (int mask) __NR_gettimeofday 96 sys_gettimeofday (struct timeval *tv, struct timezone *tz) __NR_ptrace 101 sys_ptrace (long request, pid_t pid, void *addr, void *data) __NR_getgroups 115 sys_getgroups (int gsize, unsigned int *groups) __NR_setgroups 116 sys_setgroups (int gsize, unsigned int *groups) __NR_setresuid 117 sys_setresuid (int uid, int euid, int suid) __NR_getresuid 118 sys_getresuid (int *uid, int *euid, int *suid) __NR_setresgid 119 sys_setresgid (int gid, int egid, int sgid) __NR_getresgid 120 sys_getresgid (int *gid, int *egid, int *sgid) __NR_getpgid 121 sys_getpgid (pid_t pid) 
__NR_setfsuid 122 sys_setfsuid (int fsuid) __NR_setfsgid 123 sys_setfsgid (int fsgid) __NR_getsid 124 sys_getsid (void) __NR_capget 125 sys_capget (struct cap_header *h, struct cap_data *d) __NR_capset 126 sys_capset (struct cap_header *h, struct cap_data *d) __NR_rt_sigqueueinfo 129 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) __NR_sigaltstack 131 sys_sigaltstack (const void *uss, void *uoss) __NR_personality 135 sys_personality (unsigned int personality) __NR_setpriority 141 sys_setpriority (int which, int who, int nice) __NR_sched_setscheduler 144 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) __NR_prctl 157 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) __NR_arch_prctl 158 sys_arch_prctl (int option, unsigned long addr) __NR_setrlimit 160 sys_setrlimit (int resource, struct krlimit *rlim) __NR_mount 165 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) __NR_umount2 166 sys_umount2 (char *name, int flags) __NR_gettid 186 sys_gettid (void) __NR_futex 202 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) __NR_set_thread_area 205 sys_set_thread_area (user_desc_t *info) __NR_io_setup 206 sys_io_setup (unsigned nr_events, aio_context_t *ctx) __NR_io_getevents 208 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) __NR_io_submit 209 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) __NR_get_thread_area 211 sys_get_thread_area (user_desc_t *info) __NR_set_tid_address 218 sys_set_tid_address (int *tid_addr) __NR_restart_syscall 219 sys_restart_syscall (void) __NR_sys_timer_create 222 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) __NR_sys_timer_settime 223 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct 
itimerspec *old_setting) __NR_sys_timer_gettime 224 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 225 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 226 sys_timer_delete (kernel_timer_t timer_id) __NR_clock_gettime 228 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) __NR_exit_group 231 sys_exit_group (int error_code) __NR_openat 257 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_waitid 247 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_readlinkat 267 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) __NR_set_robust_list 273 sys_set_robust_list (struct robust_list_head *head, size_t len) __NR_get_robust_list 274 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) __NR_seccomp 317 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) __NR_vmsplice 278 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) __NR_fallocate 285 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) __NR_timerfd_settime 286 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) __NR_signalfd4 289 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) __NR_preadv 295 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) __NR_rt_tgsigqueueinfo 297 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) __NR_fanotify_init 300 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) __NR_fanotify_mark 301 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) __NR_open_by_handle_at 304 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) __NR_setns 308 sys_setns (int fd, int nstype) __NR_kcmp 312 sys_kcmp (pid_t pid1, pid_t pid2, int type, 
unsigned long idx1, unsigned long idx2) __NR_memfd_create 319 sys_memfd_create (const char *name, unsigned int flags) __NR_userfaultfd 323 sys_userfaultfd (int flags) __NR_ppoll 271 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) __NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) __NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 334 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) __NR_membarrier 324 sys_membarrier (int cmd, unsigned int flags, int cpu_id) crac-criu-1.5.0/compel/arch/x86/scripts/000077500000000000000000000000001471504326700177065ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/x86/scripts/compel-pack-compat.lds.S000066400000000000000000000007261471504326700242740ustar00rootroot00000000000000OUTPUT_ARCH(i386) TARGET(elf32-i386) EXTERN(__export_parasite_head_start) SECTIONS { .text : { *(.head.text) ASSERT(DEFINED(__export_parasite_head_start), "Symbol __export_parasite_head_start is missing"); *(.text*) *(.compel.exit) *(.compel.init) } .data : { *(.data*) *(.bss*) } .rodata : { *(.rodata*) *(.got*) } .toc : ALIGN(8) { *(.toc*) } /DISCARD/ : { *(.debug*) *(.comment*) *(.note*) *(.group*) *(.eh_frame*) } } 
crac-criu-1.5.0/compel/arch/x86/scripts/compel-pack.lds.S000066400000000000000000000007551471504326700230150ustar00rootroot00000000000000OUTPUT_ARCH(i386:x86-64) TARGET(elf64-x86-64) EXTERN(__export_parasite_head_start) SECTIONS { .text : { *(.head.text) ASSERT(DEFINED(__export_parasite_head_start), "Symbol __export_parasite_head_start is missing"); *(.text*) *(.compel.exit) *(.compel.init) } .data : ALIGN(0x1000) { *(.data*) *(.bss*) } .rodata : { *(.rodata*) *(.got*) } .toc : ALIGN(8) { *(.toc*) } /DISCARD/ : { *(.debug*) *(.comment*) *(.note*) *(.group*) *(.eh_frame*) } } crac-criu-1.5.0/compel/arch/x86/src/000077500000000000000000000000001471504326700170065ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/x86/src/lib/000077500000000000000000000000001471504326700175545ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/x86/src/lib/cpu.c000066400000000000000000000322261471504326700205140ustar00rootroot00000000000000#include #include #include "compel-cpu.h" #include "common/bitops.h" #include "common/compiler.h" #include "log.h" #include "common/bug.h" #undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_info; static void fetch_rt_cpuinfo(void) { static bool rt_info_done = false; if (!rt_info_done) { compel_cpuid(&rt_info); rt_info_done = true; } } /* * Although we spell it out in here, the Processor Trace * xfeature is completely unused. We use other mechanisms * to save/restore PT state in Linux. 
*/ static const char *const xfeature_names[] = { "x87 floating point registers", "SSE registers", "AVX registers", "MPX bounds registers", "MPX CSR", "AVX-512 opmask", "AVX-512 Hi256", "AVX-512 ZMM_Hi256", "Processor Trace", "Protection Keys User registers", "Hardware Duty Cycling", }; static short xsave_cpuid_features[] = { X86_FEATURE_FPU, X86_FEATURE_XMM, X86_FEATURE_AVX, X86_FEATURE_MPX, X86_FEATURE_MPX, X86_FEATURE_AVX512F, X86_FEATURE_AVX512F, X86_FEATURE_AVX512F, X86_FEATURE_INTEL_PT, X86_FEATURE_PKU, X86_FEATURE_HDC, }; void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { if (likely(feature < NCAPINTS_BITS)) set_bit(feature, (unsigned long *)c->x86_capability); } void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { if (likely(feature < NCAPINTS_BITS)) clear_bit(feature, (unsigned long *)c->x86_capability); } int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { if (likely(feature < NCAPINTS_BITS)) return test_bit(feature, (unsigned long *)c->x86_capability); return 0; } int compel_test_fpu_cap(compel_cpuinfo_t *c, unsigned int feature) { if (likely(feature < XFEATURE_MAX)) return (c->xfeatures_mask & (1UL << feature)); return 0; } static int compel_fpuid(compel_cpuinfo_t *c) { unsigned int last_good_offset; uint32_t eax, ebx, ecx, edx; size_t i; BUILD_BUG_ON(ARRAY_SIZE(xsave_cpuid_features) != ARRAY_SIZE(xfeature_names)); if (!compel_test_cpu_cap(c, X86_FEATURE_FPU)) { pr_err("fpu: No FPU detected\n"); return -1; } if (!compel_test_cpu_cap(c, X86_FEATURE_XSAVE)) { pr_info("fpu: x87 FPU will use %s\n", compel_test_cpu_cap(c, X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); return 0; } cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); c->xfeatures_mask = eax + ((uint64_t)edx << 32); if ((c->xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. 
*/ pr_err("fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx\n", (unsigned long long)c->xfeatures_mask); return -1; } /* * Clear XSAVE features that are disabled in the normal CPUID. */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { if (!compel_test_cpu_cap(c, xsave_cpuid_features[i])) c->xfeatures_mask &= ~(1 << i); } c->xfeatures_mask &= XFEATURE_MASK_USER; c->xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR; /* * xsaves is not enabled in userspace, so * xsaves is mostly for debug purpose. */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); c->xsave_size = ebx; c->xsave_size_max = ecx; cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); c->xsaves_size = ebx; pr_debug("fpu: xfeatures_mask 0x%llx xsave_size %u xsave_size_max %u xsaves_size %u\n", (unsigned long long)c->xfeatures_mask, c->xsave_size, c->xsave_size_max, c->xsaves_size); if (c->xsave_size_max > sizeof(struct xsave_struct)) pr_warn_once("fpu: max xsave frame exceed xsave_struct (%u %u)\n", c->xsave_size_max, (unsigned)sizeof(struct xsave_struct)); memset(c->xstate_offsets, 0xff, sizeof(c->xstate_offsets)); memset(c->xstate_sizes, 0xff, sizeof(c->xstate_sizes)); memset(c->xstate_comp_offsets, 0xff, sizeof(c->xstate_comp_offsets)); memset(c->xstate_comp_sizes, 0xff, sizeof(c->xstate_comp_sizes)); /* start at the beginning of the "extended state" */ last_good_offset = offsetof(struct xsave_struct, extended_state_area); /* * The FP xstates and SSE xstates are legacy states. They are always * in the fixed offsets in the xsave area in either compacted form * or standard form. */ c->xstate_offsets[0] = 0; c->xstate_sizes[0] = offsetof(struct i387_fxsave_struct, xmm_space); c->xstate_offsets[1] = c->xstate_sizes[0]; c->xstate_sizes[1] = FIELD_SIZEOF(struct i387_fxsave_struct, xmm_space); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (!(c->xfeatures_mask & (1UL << i))) continue; /* * If an xfeature is supervisor state, the offset * in EBX is invalid. 
We leave it to -1. * * SDM says: If state component 'i' is a user state component, * ECX[0] return 0; if state component i is a supervisor * state component, ECX[0] returns 1. */ cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); if (!(ecx & 1)) c->xstate_offsets[i] = ebx; c->xstate_sizes[i] = eax; /* * In our xstate size checks, we assume that the * highest-numbered xstate feature has the * highest offset in the buffer. Ensure it does. */ if (last_good_offset > c->xstate_offsets[i]) pr_warn_once("fpu: misordered xstate %d %d\n", last_good_offset, c->xstate_offsets[i]); last_good_offset = c->xstate_offsets[i]; } BUILD_BUG_ON(sizeof(c->xstate_offsets) != sizeof(c->xstate_sizes)); BUILD_BUG_ON(sizeof(c->xstate_comp_offsets) != sizeof(c->xstate_comp_sizes)); c->xstate_comp_offsets[0] = 0; c->xstate_comp_sizes[0] = offsetof(struct i387_fxsave_struct, xmm_space); c->xstate_comp_offsets[1] = c->xstate_comp_sizes[0]; c->xstate_comp_sizes[1] = FIELD_SIZEOF(struct i387_fxsave_struct, xmm_space); if (!compel_test_cpu_cap(c, X86_FEATURE_XSAVES)) { for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if ((c->xfeatures_mask & (1UL << i))) { c->xstate_comp_offsets[i] = c->xstate_offsets[i]; c->xstate_comp_sizes[i] = c->xstate_sizes[i]; } } } else { c->xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] = FXSAVE_SIZE + XSAVE_HDR_SIZE; for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if ((c->xfeatures_mask & (1UL << i))) c->xstate_comp_sizes[i] = c->xstate_sizes[i]; else c->xstate_comp_sizes[i] = 0; if (i > FIRST_EXTENDED_XFEATURE) { c->xstate_comp_offsets[i] = c->xstate_comp_offsets[i - 1] + c->xstate_comp_sizes[i - 1]; /* * The value returned by ECX[1] indicates the alignment * of state component 'i' when the compacted format * of the extended region of an XSAVE area is used: */ cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); if (ecx & 2) c->xstate_comp_offsets[i] = ALIGN(c->xstate_comp_offsets[i], 64); } } } if (!pr_quelled(COMPEL_LOG_DEBUG)) { for (i = 0; i < 
ARRAY_SIZE(c->xstate_offsets); i++) { if (!(c->xfeatures_mask & (1UL << i))) continue; pr_debug("fpu: %-32s xstate_offsets %6d / %-6d xstate_sizes %6d / %-6d\n", xfeature_names[i], c->xstate_offsets[i], c->xstate_comp_offsets[i], c->xstate_sizes[i], c->xstate_comp_sizes[i]); } } return 0; } int compel_cpuid(compel_cpuinfo_t *c) { uint32_t eax, ebx, ecx, edx; /* * See cpu_detect() in the kernel, also * read cpuid specs not only from general * SDM but for extended instructions set * reference. */ /* Get vendor name */ cpuid(0x00000000, (unsigned int *)&c->cpuid_level, (unsigned int *)&c->x86_vendor_id[0], (unsigned int *)&c->x86_vendor_id[8], (unsigned int *)&c->x86_vendor_id[4]); if (!strcmp(c->x86_vendor_id, "GenuineIntel")) { c->x86_vendor = X86_VENDOR_INTEL; } else if (!strcmp(c->x86_vendor_id, "AuthenticAMD") || !strcmp(c->x86_vendor_id, "HygonGenuine")) { c->x86_vendor = X86_VENDOR_AMD; } else { pr_err("Unsupported CPU vendor %s\n", c->x86_vendor_id); return -1; } c->x86_family = 4; /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { cpuid(0x00000001, &eax, &ebx, &ecx, &edx); c->x86_family = (eax >> 8) & 0xf; c->x86_model = (eax >> 4) & 0xf; c->x86_mask = eax & 0xf; if (c->x86_family == 0xf) c->x86_family += (eax >> 20) & 0xff; if (c->x86_family >= 0x6) c->x86_model += ((eax >> 16) & 0xf) << 4; c->x86_capability[CPUID_1_EDX] = edx; c->x86_capability[CPUID_1_ECX] = ecx; } /* Thermal and Power Management Leaf: level 0x00000006 (eax) */ if (c->cpuid_level >= 0x00000006) c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006); /* Additional Intel-defined flags: level 0x00000007 */ if (c->cpuid_level >= 0x00000007) { cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); c->x86_capability[CPUID_7_0_EBX] = ebx; c->x86_capability[CPUID_7_0_ECX] = ecx; c->x86_capability[CPUID_7_0_EDX] = edx; } /* Extended state features: level 0x0000000d */ if (c->cpuid_level >= 0x0000000d) { cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx); 
c->x86_capability[CPUID_D_1_EAX] = eax; } /* Additional Intel-defined flags: level 0x0000000F */ if (c->cpuid_level >= 0x0000000F) { /* QoS sub-leaf, EAX=0Fh, ECX=0 */ cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx); c->x86_capability[CPUID_F_0_EDX] = edx; if (compel_test_cpu_cap(c, X86_FEATURE_CQM_LLC)) { /* QoS sub-leaf, EAX=0Fh, ECX=1 */ cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); c->x86_capability[CPUID_F_1_EDX] = edx; } } /* AMD-defined flags: level 0x80000001 */ eax = cpuid_eax(0x80000000); c->extended_cpuid_level = eax; if ((eax & 0xffff0000) == 0x80000000) { if (eax >= 0x80000001) { cpuid(0x80000001, &eax, &ebx, &ecx, &edx); c->x86_capability[CPUID_8000_0001_ECX] = ecx; c->x86_capability[CPUID_8000_0001_EDX] = edx; } } /* * We're don't care about scattered features for now, * otherwise look into init_scattered_cpuid_features() * in kernel. * * Same applies to speculation control. Look into * init_speculation_control() otherwise. */ if (c->extended_cpuid_level >= 0x80000004) { unsigned int *v; char *p, *q; v = (unsigned int *)c->x86_model_id; cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); c->x86_model_id[48] = 0; /* * Intel chips right-justify this string for some dumb reason; * undo that brain damage: */ p = q = &c->x86_model_id[0]; while (*p == ' ') p++; if (p != q) { while (*p) *q++ = *p++; while (q <= &c->x86_model_id[48]) *q++ = '\0'; /* Zero-pad the rest */ } } if (c->extended_cpuid_level >= 0x80000007) { cpuid(0x80000007, &eax, &ebx, &ecx, &edx); c->x86_capability[CPUID_8000_0007_EBX] = ebx; c->x86_power = edx; } if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); if (c->extended_cpuid_level >= 0x80000008) c->x86_capability[CPUID_8000_0008_EBX] = cpuid_ebx(0x80000008); /* On x86-64 CPUID is always present */ compel_set_cpu_cap(c, X86_FEATURE_CPUID); /* On x86-64 NOP is always present */ 
compel_set_cpu_cap(c, X86_FEATURE_NOPL); /* * On x86-64 syscalls32 are enabled but we don't * set it yet for backward compatibility reason */ //compel_set_cpu_cap(c, X86_FEATURE_SYSCALL32); /* See filter_cpuid_features in kernel */ if ((int32_t)c->cpuid_level < (int32_t)0x0000000d) compel_clear_cpu_cap(c, X86_FEATURE_XSAVE); /* * We only care about small subset from c_early_init: * early_init_amd and early_init_intel */ switch (c->x86_vendor) { case X86_VENDOR_INTEL: /* * Strictly speaking we need to read MSR_IA32_MISC_ENABLE * here but on ring3 it's impossible. */ if (c->x86_family == 15) { compel_clear_cpu_cap(c, X86_FEATURE_REP_GOOD); compel_clear_cpu_cap(c, X86_FEATURE_ERMS); } else if (c->x86_family == 6) { /* On x86-64 rep is fine */ compel_set_cpu_cap(c, X86_FEATURE_REP_GOOD); } break; case X86_VENDOR_AMD: /* * Bit 31 in normal CPUID used for nonstandard 3DNow ID; * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ compel_clear_cpu_cap(c, 0 * 32 + 31); if (c->x86_family >= 0x10) compel_set_cpu_cap(c, X86_FEATURE_REP_GOOD); if (c->x86_family == 0xf) { uint32_t level; /* On C+ stepping K8 rep microcode works well for copy/memset */ level = cpuid_eax(1); if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) compel_set_cpu_cap(c, X86_FEATURE_REP_GOOD); } break; } pr_debug("x86_family %u x86_vendor_id %s x86_model_id %s\n", c->x86_family, c->x86_vendor_id, c->x86_model_id); return compel_fpuid(c); } bool compel_cpu_has_feature(unsigned int feature) { fetch_rt_cpuinfo(); return compel_test_cpu_cap(&rt_info, feature); } bool compel_fpu_has_feature(unsigned int feature) { fetch_rt_cpuinfo(); return compel_test_fpu_cap(&rt_info, feature); } uint32_t compel_fpu_feature_size(unsigned int feature) { fetch_rt_cpuinfo(); if (feature >= FIRST_EXTENDED_XFEATURE && feature < XFEATURE_MAX) return rt_info.xstate_sizes[feature]; return 0; } uint32_t compel_fpu_feature_offset(unsigned int feature) { fetch_rt_cpuinfo(); if (feature >= FIRST_EXTENDED_XFEATURE && 
feature < XFEATURE_MAX) return rt_info.xstate_offsets[feature]; return 0; } void compel_cpu_clear_feature(unsigned int feature) { fetch_rt_cpuinfo(); return compel_clear_cpu_cap(&rt_info, feature); } void compel_cpu_copy_cpuinfo(compel_cpuinfo_t *c) { fetch_rt_cpuinfo(); memcpy(c, &rt_info, sizeof(rt_info)); } crac-criu-1.5.0/compel/arch/x86/src/lib/handle-elf-host.c000077700000000000000000000000001471504326700250302handle-elf.custar00rootroot00000000000000crac-criu-1.5.0/compel/arch/x86/src/lib/handle-elf.c000066400000000000000000000007761471504326700217310ustar00rootroot00000000000000#include #include #include "handle-elf.h" #include "piegen.h" #include "log.h" static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; int handle_binary(void *mem, size_t size) { if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) return handle_elf_x86_64(mem, size); pr_err("Unsupported Elf format detected\n"); return -EINVAL; } crac-criu-1.5.0/compel/arch/x86/src/lib/include/000077500000000000000000000000001471504326700211775ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/x86/src/lib/include/cpu.h000066400000000000000000000024261471504326700221430ustar00rootroot00000000000000#ifndef __COMPEL_ASM_CPU_H__ #define __COMPEL_ASM_CPU_H__ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. 
*/ asm volatile("cpuid" : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) : "0"(*eax), "2"(*ecx) : "memory"); } static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { *eax = op; *ecx = 0; native_cpuid(eax, ebx, ecx, edx); } static inline void cpuid_count(unsigned int op, int count, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { *eax = op; *ecx = count; native_cpuid(eax, ebx, ecx, edx); } static inline unsigned int cpuid_eax(unsigned int op) { unsigned int eax, ebx, ecx, edx; cpuid(op, &eax, &ebx, &ecx, &edx); return eax; } static inline unsigned int cpuid_ebx(unsigned int op) { unsigned int eax, ebx, ecx, edx; cpuid(op, &eax, &ebx, &ecx, &edx); return ebx; } static inline unsigned int cpuid_ecx(unsigned int op) { unsigned int eax, ebx, ecx, edx; cpuid(op, &eax, &ebx, &ecx, &edx); return ecx; } static inline unsigned int cpuid_edx(unsigned int op) { unsigned int eax, ebx, ecx, edx; cpuid(op, &eax, &ebx, &ecx, &edx); return edx; } #endif crac-criu-1.5.0/compel/arch/x86/src/lib/include/handle-elf.h000066400000000000000000000007721471504326700233550ustar00rootroot00000000000000#ifndef COMPEL_HANDLE_ELF_H__ #define COMPEL_HANDLE_ELF_H__ #include "elf64-types.h" #define ELF_X86_64 #ifndef R_X86_64_GOTPCRELX #define R_X86_64_GOTPCRELX 41 #endif #ifndef R_X86_64_REX_GOTPCRELX #define R_X86_64_REX_GOTPCRELX 42 #endif #define __handle_elf handle_elf_x86_64 #define arch_is_machine_supported(e_machine) (e_machine == EM_X86_64) extern int handle_elf_x86_32(void *mem, size_t size); extern int handle_elf_x86_64(void *mem, size_t size); #endif /* COMPEL_HANDLE_ELF_H__ */ crac-criu-1.5.0/compel/arch/x86/src/lib/include/syscall.h000066400000000000000000000006511471504326700230240ustar00rootroot00000000000000#ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ #define __NR(syscall, compat) ((compat) ? 
__NR32_##syscall : __NR_##syscall) /* * For x86_32 __NR_mmap inside the kernel represents old_mmap system * call, but since we didn't use it yet lets go further and simply * define own alias for __NR_mmap2 which would allow us to unify code * between 32 and 64 bits version. */ #define __NR32_mmap __NR32_mmap2 #endif crac-criu-1.5.0/compel/arch/x86/src/lib/include/uapi/000077500000000000000000000000001471504326700221355ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/x86/src/lib/include/uapi/asm/000077500000000000000000000000001471504326700227155ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/x86/src/lib/include/uapi/asm/.gitignore000066400000000000000000000000001471504326700246730ustar00rootroot00000000000000crac-criu-1.5.0/compel/arch/x86/src/lib/include/uapi/asm/breakpoints.h000066400000000000000000000003211471504326700254030ustar00rootroot00000000000000#ifndef __COMPEL_BREAKPOINTS_H__ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP SI_KERNEL extern int ptrace_set_breakpoint(pid_t pid, void *addr); extern int ptrace_flush_breakpoints(pid_t pid); #endif crac-criu-1.5.0/compel/arch/x86/src/lib/include/uapi/asm/cpu.h000066400000000000000000000512011471504326700236540ustar00rootroot00000000000000#ifndef __CR_ASM_CPU_H__ #define __CR_ASM_CPU_H__ #include #include /* * Adopted from linux kernel and enhanced from Intel/AMD manuals. * Note these bits are not ABI for linux kernel but they _are_ * for us, so make sure they are at proper position between * versions. * * In particular since we already used leaf 11 we have * to keep it here, since it's an ABI now. 
*/ enum cpuid_leafs { CPUID_1_EDX = 0, CPUID_8000_0001_EDX = 1, CPUID_8086_0001_EDX = 2, CPUID_LNX_1 = 3, CPUID_1_ECX = 4, CPUID_C000_0001_EDX = 5, CPUID_8000_0001_ECX = 6, CPUID_LNX_2 = 7, CPUID_LNX_3 = 8, CPUID_7_0_EBX = 9, CPUID_D_1_EAX = 10, CPUID_7_0_ECX = 11, CPUID_F_1_EDX = 12, CPUID_8000_0008_EBX = 13, CPUID_6_EAX = 14, CPUID_8000_000A_EDX = 15, CPUID_F_0_EDX = 16, CPUID_8000_0007_EBX = 17, CPUID_7_0_EDX = 18, }; #define NCAPINTS_V1 12 #define NCAPINTS_V2 19 #define NCAPINTS (NCAPINTS_V2) /* N 32-bit words worth of info */ #define NCAPINTS_BITS (NCAPINTS * 32) /* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */ #define X86_FEATURE_FPU (0 * 32 + 0) /* Onboard FPU */ #define X86_FEATURE_VME (0 * 32 + 1) /* Virtual Mode Extensions */ #define X86_FEATURE_DE (0 * 32 + 2) /* Debugging Extensions */ #define X86_FEATURE_PSE (0 * 32 + 3) /* Page Size Extensions */ #define X86_FEATURE_TSC (0 * 32 + 4) /* Time Stamp Counter */ #define X86_FEATURE_MSR (0 * 32 + 5) /* Model-Specific Registers */ #define X86_FEATURE_PAE (0 * 32 + 6) /* Physical Address Extensions */ #define X86_FEATURE_MCE (0 * 32 + 7) /* Machine Check Exception */ #define X86_FEATURE_CX8 (0 * 32 + 8) /* CMPXCHG8 instruction */ #define X86_FEATURE_APIC (0 * 32 + 9) /* Onboard APIC */ #define X86_FEATURE_SEP (0 * 32 + 11) /* SYSENTER/SYSEXIT */ #define X86_FEATURE_MTRR (0 * 32 + 12) /* Memory Type Range Registers */ #define X86_FEATURE_PGE (0 * 32 + 13) /* Page Global Enable */ #define X86_FEATURE_MCA (0 * 32 + 14) /* Machine Check Architecture */ #define X86_FEATURE_CMOV (0 * 32 + 15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ #define X86_FEATURE_PAT (0 * 32 + 16) /* Page Attribute Table */ #define X86_FEATURE_PSE36 (0 * 32 + 17) /* 36-bit PSEs */ #define X86_FEATURE_PN (0 * 32 + 18) /* Processor serial number */ #define X86_FEATURE_CLFLUSH (0 * 32 + 19) /* CLFLUSH instruction */ #define X86_FEATURE_DS (0 * 32 + 21) /* "dts" Debug Store */ #define X86_FEATURE_ACPI (0 * 32 + 
22) /* ACPI via MSR */ #define X86_FEATURE_MMX (0 * 32 + 23) /* Multimedia Extensions */ #define X86_FEATURE_FXSR (0 * 32 + 24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ #define X86_FEATURE_XMM (0 * 32 + 25) /* "sse" */ #define X86_FEATURE_XMM2 (0 * 32 + 26) /* "sse2" */ #define X86_FEATURE_SELFSNOOP (0 * 32 + 27) /* "ss" CPU self snoop */ #define X86_FEATURE_HT (0 * 32 + 28) /* Hyper-Threading */ #define X86_FEATURE_ACC (0 * 32 + 29) /* "tm" Automatic clock control */ #define X86_FEATURE_IA64 (0 * 32 + 30) /* IA-64 processor */ #define X86_FEATURE_PBE (0 * 32 + 31) /* Pending Break Enable */ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ /* Don't duplicate feature flags which are redundant with Intel! */ #define X86_FEATURE_SYSCALL (1 * 32 + 11) /* SYSCALL/SYSRET */ #define X86_FEATURE_MP (1 * 32 + 19) /* MP Capable */ #define X86_FEATURE_NX (1 * 32 + 20) /* Execute Disable */ #define X86_FEATURE_MMXEXT (1 * 32 + 22) /* AMD MMX extensions */ #define X86_FEATURE_FXSR_OPT (1 * 32 + 25) /* FXSAVE/FXRSTOR optimizations */ #define X86_FEATURE_GBPAGES (1 * 32 + 26) /* "pdpe1gb" GB pages */ #define X86_FEATURE_RDTSCP (1 * 32 + 27) /* RDTSCP */ #define X86_FEATURE_LM (1 * 32 + 29) /* Long Mode (x86-64, 64-bit support) */ #define X86_FEATURE_3DNOWEXT (1 * 32 + 30) /* AMD 3DNow extensions */ #define X86_FEATURE_3DNOW (1 * 32 + 31) /* 3DNow */ /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ #define X86_FEATURE_RECOVERY (2 * 32 + 0) /* CPU in recovery mode */ #define X86_FEATURE_LONGRUN (2 * 32 + 1) /* Longrun power control */ #define X86_FEATURE_LRTI (2 * 32 + 3) /* LongRun table interface */ /* Other features, Linux-defined mapping, word 3 */ /* This range is used for feature bits which conflict or are synthesized */ #define X86_FEATURE_CXMMX (3 * 32 + 0) /* Cyrix MMX extensions */ #define X86_FEATURE_K6_MTRR (3 * 32 + 1) /* AMD K6 nonstandard MTRRs */ #define X86_FEATURE_CYRIX_ARR (3 * 32 + 2) /* Cyrix ARRs (= MTRRs) */ #define 
X86_FEATURE_CENTAUR_MCR (3 * 32 + 3) /* Centaur MCRs (= MTRRs) */ /* CPU types for specific tunings: */ #define X86_FEATURE_K8 (3 * 32 + 4) /* "" Opteron, Athlon64 */ #define X86_FEATURE_K7 (3 * 32 + 5) /* "" Athlon */ #define X86_FEATURE_P3 (3 * 32 + 6) /* "" P3 */ #define X86_FEATURE_P4 (3 * 32 + 7) /* "" P4 */ #define X86_FEATURE_CONSTANT_TSC (3 * 32 + 8) /* TSC ticks at a constant rate */ #define X86_FEATURE_UP (3 * 32 + 9) /* SMP kernel running on UP */ #define X86_FEATURE_ART (3 * 32 + 10) /* Always running timer (ART) */ #define X86_FEATURE_ARCH_PERFMON (3 * 32 + 11) /* Intel Architectural PerfMon */ #define X86_FEATURE_PEBS (3 * 32 + 12) /* Precise-Event Based Sampling */ #define X86_FEATURE_BTS (3 * 32 + 13) /* Branch Trace Store */ #define X86_FEATURE_SYSCALL32 (3 * 32 + 14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 (3 * 32 + 15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD (3 * 32 + 16) /* REP microcode works well */ #define X86_FEATURE_MFENCE_RDTSC (3 * 32 + 17) /* "" MFENCE synchronizes RDTSC */ #define X86_FEATURE_LFENCE_RDTSC (3 * 32 + 18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER (3 * 32 + 19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL (3 * 32 + 20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_ALWAYS (3 * 32 + 21) /* "" Always-present feature */ #define X86_FEATURE_XTOPOLOGY (3 * 32 + 22) /* CPU topology enum extensions */ #define X86_FEATURE_TSC_RELIABLE (3 * 32 + 23) /* TSC is known to be reliable */ #define X86_FEATURE_NONSTOP_TSC (3 * 32 + 24) /* TSC does not stop in C states */ #define X86_FEATURE_CPUID (3 * 32 + 25) /* CPU has CPUID instruction itself */ #define X86_FEATURE_EXTD_APICID (3 * 32 + 26) /* Extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM (3 * 32 + 27) /* AMD multi-node processor */ #define X86_FEATURE_APERFMPERF (3 * 32 + 28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ #define 
X86_FEATURE_NONSTOP_TSC_S3 (3 * 32 + 30) /* TSC doesn't stop in S3 state */ #define X86_FEATURE_TSC_KNOWN_FREQ (3 * 32 + 31) /* TSC has known frequency */ /* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */ #define X86_FEATURE_XMM3 (4 * 32 + 0) /* "pni" SSE-3 */ #define X86_FEATURE_PCLMULQDQ (4 * 32 + 1) /* PCLMULQDQ instruction */ #define X86_FEATURE_DTES64 (4 * 32 + 2) /* 64-bit Debug Store */ #define X86_FEATURE_MWAIT (4 * 32 + 3) /* "monitor" MONITOR/MWAIT support */ #define X86_FEATURE_DSCPL (4 * 32 + 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ #define X86_FEATURE_VMX (4 * 32 + 5) /* Hardware virtualization */ #define X86_FEATURE_SMX (4 * 32 + 6) /* Safer Mode eXtensions */ #define X86_FEATURE_EST (4 * 32 + 7) /* Enhanced SpeedStep */ #define X86_FEATURE_TM2 (4 * 32 + 8) /* Thermal Monitor 2 */ #define X86_FEATURE_SSSE3 (4 * 32 + 9) /* Supplemental SSE-3 */ #define X86_FEATURE_CID (4 * 32 + 10) /* Context ID */ #define X86_FEATURE_SDBG (4 * 32 + 11) /* Silicon Debug */ #define X86_FEATURE_FMA (4 * 32 + 12) /* Fused multiply-add */ #define X86_FEATURE_CX16 (4 * 32 + 13) /* CMPXCHG16B instruction */ #define X86_FEATURE_XTPR (4 * 32 + 14) /* Send Task Priority Messages */ #define X86_FEATURE_PDCM (4 * 32 + 15) /* Perf/Debug Capabilities MSR */ #define X86_FEATURE_PCID (4 * 32 + 17) /* Process Context Identifiers */ #define X86_FEATURE_DCA (4 * 32 + 18) /* Direct Cache Access */ #define X86_FEATURE_XMM4_1 (4 * 32 + 19) /* "sse4_1" SSE-4.1 */ #define X86_FEATURE_XMM4_2 (4 * 32 + 20) /* "sse4_2" SSE-4.2 */ #define X86_FEATURE_X2APIC (4 * 32 + 21) /* X2APIC */ #define X86_FEATURE_MOVBE (4 * 32 + 22) /* MOVBE instruction */ #define X86_FEATURE_POPCNT (4 * 32 + 23) /* POPCNT instruction */ #define X86_FEATURE_TSC_DEADLINE_TIMER (4 * 32 + 24) /* TSC deadline timer */ #define X86_FEATURE_AES (4 * 32 + 25) /* AES instructions */ #define X86_FEATURE_XSAVE (4 * 32 + 26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */ #define X86_FEATURE_OSXSAVE 
(4 * 32 + 27) /* "" XSAVE instruction enabled in the OS */ #define X86_FEATURE_AVX (4 * 32 + 28) /* Advanced Vector Extensions */ #define X86_FEATURE_F16C (4 * 32 + 29) /* 16-bit FP conversions */ #define X86_FEATURE_RDRAND (4 * 32 + 30) /* RDRAND instruction */ #define X86_FEATURE_HYPERVISOR (4 * 32 + 31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ #define X86_FEATURE_XSTORE (5 * 32 + 2) /* "rng" RNG present (xstore) */ #define X86_FEATURE_XSTORE_EN (5 * 32 + 3) /* "rng_en" RNG enabled */ #define X86_FEATURE_XCRYPT (5 * 32 + 6) /* "ace" on-CPU crypto (xcrypt) */ #define X86_FEATURE_XCRYPT_EN (5 * 32 + 7) /* "ace_en" on-CPU crypto enabled */ #define X86_FEATURE_ACE2 (5 * 32 + 8) /* Advanced Cryptography Engine v2 */ #define X86_FEATURE_ACE2_EN (5 * 32 + 9) /* ACE v2 enabled */ #define X86_FEATURE_PHE (5 * 32 + 10) /* PadLock Hash Engine */ #define X86_FEATURE_PHE_EN (5 * 32 + 11) /* PHE enabled */ #define X86_FEATURE_PMM (5 * 32 + 12) /* PadLock Montgomery Multiplier */ #define X86_FEATURE_PMM_EN (5 * 32 + 13) /* PMM enabled */ /* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */ #define X86_FEATURE_LAHF_LM (6 * 32 + 0) /* LAHF/SAHF in long mode */ #define X86_FEATURE_CMP_LEGACY (6 * 32 + 1) /* If yes HyperThreading not valid */ #define X86_FEATURE_SVM (6 * 32 + 2) /* Secure Virtual Machine */ #define X86_FEATURE_EXTAPIC (6 * 32 + 3) /* Extended APIC space */ #define X86_FEATURE_CR8_LEGACY (6 * 32 + 4) /* CR8 in 32-bit mode */ #define X86_FEATURE_ABM (6 * 32 + 5) /* Advanced bit manipulation */ #define X86_FEATURE_SSE4A (6 * 32 + 6) /* SSE-4A */ #define X86_FEATURE_MISALIGNSSE (6 * 32 + 7) /* Misaligned SSE mode */ #define X86_FEATURE_3DNOWPREFETCH (6 * 32 + 8) /* 3DNow prefetch instructions */ #define X86_FEATURE_OSVW (6 * 32 + 9) /* OS Visible Workaround */ #define X86_FEATURE_IBS (6 * 32 + 10) /* Instruction Based Sampling */ #define X86_FEATURE_XOP (6 * 32 + 11) /* extended AVX 
instructions */ #define X86_FEATURE_SKINIT (6 * 32 + 12) /* SKINIT/STGI instructions */ #define X86_FEATURE_WDT (6 * 32 + 13) /* Watchdog timer */ #define X86_FEATURE_LWP (6 * 32 + 15) /* Light Weight Profiling */ #define X86_FEATURE_FMA4 (6 * 32 + 16) /* 4 operands MAC instructions */ #define X86_FEATURE_TCE (6 * 32 + 17) /* Translation Cache Extension */ #define X86_FEATURE_NODEID_MSR (6 * 32 + 19) /* NodeId MSR */ #define X86_FEATURE_TBM (6 * 32 + 21) /* Trailing Bit Manipulations */ #define X86_FEATURE_TOPOEXT (6 * 32 + 22) /* Topology extensions CPUID leafs */ #define X86_FEATURE_PERFCTR_CORE (6 * 32 + 23) /* Core performance counter extensions */ #define X86_FEATURE_PERFCTR_NB (6 * 32 + 24) /* NB performance counter extensions */ #define X86_FEATURE_BPEXT (6 * 32 + 26) /* Data breakpoint extension */ #define X86_FEATURE_PTSC (6 * 32 + 27) /* Performance time-stamp counter */ #define X86_FEATURE_PERFCTR_LLC (6 * 32 + 28) /* Last Level Cache performance counter extensions */ #define X86_FEATURE_MWAITX (6 * 32 + 29) /* MWAIT extension (MONITORX/MWAITX instructions) */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE (9 * 32 + 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ #define X86_FEATURE_TSC_ADJUST (9 * 32 + 1) /* TSC adjustment MSR 0x3B */ #define X86_FEATURE_BMI1 (9 * 32 + 3) /* 1st group bit manipulation extensions */ #define X86_FEATURE_HLE (9 * 32 + 4) /* Hardware Lock Elision */ #define X86_FEATURE_AVX2 (9 * 32 + 5) /* AVX2 instructions */ #define X86_FEATURE_SMEP (9 * 32 + 7) /* Supervisor Mode Execution Protection */ #define X86_FEATURE_BMI2 (9 * 32 + 8) /* 2nd group bit manipulation extensions */ #define X86_FEATURE_ERMS (9 * 32 + 9) /* Enhanced REP MOVSB/STOSB instructions */ #define X86_FEATURE_INVPCID (9 * 32 + 10) /* Invalidate Processor Context ID */ #define X86_FEATURE_RTM (9 * 32 + 11) /* Restricted Transactional Memory */ #define X86_FEATURE_CQM (9 * 32 + 12) /* Cache QoS 
Monitoring */ #define X86_FEATURE_MPX (9 * 32 + 14) /* Memory Protection Extension */ #define X86_FEATURE_RDT_A (9 * 32 + 15) /* Resource Director Technology Allocation */ #define X86_FEATURE_AVX512F (9 * 32 + 16) /* AVX-512 Foundation */ #define X86_FEATURE_AVX512DQ (9 * 32 + 17) /* AVX-512 DQ (Double/Quad granular) Instructions */ #define X86_FEATURE_RDSEED (9 * 32 + 18) /* RDSEED instruction */ #define X86_FEATURE_ADX (9 * 32 + 19) /* ADCX and ADOX instructions */ #define X86_FEATURE_SMAP (9 * 32 + 20) /* Supervisor Mode Access Prevention */ #define X86_FEATURE_AVX512IFMA (9 * 32 + 21) /* AVX-512 Integer Fused Multiply-Add instructions */ #define X86_FEATURE_CLFLUSHOPT (9 * 32 + 23) /* CLFLUSHOPT instruction */ #define X86_FEATURE_CLWB (9 * 32 + 24) /* CLWB instruction */ #define X86_FEATURE_INTEL_PT (9 * 32 + 25) /* Intel Processor Trace */ #define X86_FEATURE_AVX512PF (9 * 32 + 26) /* AVX-512 Prefetch */ #define X86_FEATURE_AVX512ER (9 * 32 + 27) /* AVX-512 Exponential and Reciprocal */ #define X86_FEATURE_AVX512CD (9 * 32 + 28) /* AVX-512 Conflict Detection */ #define X86_FEATURE_SHA_NI (9 * 32 + 29) /* SHA1/SHA256 Instruction Extensions */ #define X86_FEATURE_AVX512BW (9 * 32 + 30) /* AVX-512 BW (Byte/Word granular) Instructions */ #define X86_FEATURE_AVX512VL (9 * 32 + 31) /* AVX-512 VL (128/256 Vector Length) Extensions */ /* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */ #define X86_FEATURE_XSAVEOPT (10 * 32 + 0) /* XSAVEOPT instruction */ #define X86_FEATURE_XSAVEC (10 * 32 + 1) /* XSAVEC instruction */ #define X86_FEATURE_XGETBV1 (10 * 32 + 2) /* XGETBV with ECX = 1 instruction */ #define X86_FEATURE_XSAVES (10 * 32 + 3) /* XSAVES/XRSTORS instructions */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 11 */ #define X86_FEATURE_PREFETCHWT1 (11 * 32 + 0) /* PREFETCHWT1 Intel® Xeon PhiTM only */ #define X86_FEATURE_AVX512VBMI (11 * 32 + 1) /* AVX512 Vector Bit Manipulation instructions*/ #define X86_FEATURE_UMIP (11 
* 32 + 2) /* User Mode Instruction Protection */ #define X86_FEATURE_PKU (11 * 32 + 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (11 * 32 + 4) /* OS Protection Keys Enable */ #define X86_FEATURE_AVX512_VBMI2 (11 * 32 + 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ #define X86_FEATURE_GFNI (11 * 32 + 8) /* Galois Field New Instructions */ #define X86_FEATURE_VAES (11 * 32 + 9) /* Vector AES */ #define X86_FEATURE_VPCLMULQDQ (11 * 32 + 10) /* Carry-Less Multiplication Double Quadword */ #define X86_FEATURE_AVX512_VNNI (11 * 32 + 11) /* Vector Neural Network Instructions */ #define X86_FEATURE_AVX512_BITALG (11 * 32 + 12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ #define X86_FEATURE_TME (11 * 32 + 13) /* Intel Total Memory Encryption */ #define X86_FEATURE_AVX512_VPOPCNTDQ (11 * 32 + 14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_LA57 (11 * 32 + 16) /* 5-level page tables */ #define X86_FEATURE_RDPID (11 * 32 + 22) /* RDPID instruction */ #define X86_FEATURE_CLDEMOTE (11 * 32 + 25) /* CLDEMOTE instruction */ /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */ #define X86_FEATURE_CQM_OCCUP_LLC (12 * 32 + 0) /* LLC occupancy monitoring */ #define X86_FEATURE_CQM_MBM_TOTAL (12 * 32 + 1) /* LLC Total MBM monitoring */ #define X86_FEATURE_CQM_MBM_LOCAL (12 * 32 + 2) /* LLC Local MBM monitoring */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13 * 32 + 0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13 * 32 + 1) /* Instructions Retired Count */ #define X86_FEATURE_XSAVEERPTR (13 * 32 + 2) /* Always save/restore FP error pointers */ #define X86_FEATURE_IBPB (13 * 32 + 12) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_IBRS (13 * 32 + 14) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_STIBP (13 * 32 + 15) /* Single Thread Indirect Branch Predictors */ /* Thermal and Power Management Leaf, 
CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14 * 32 + 0) /* Digital Thermal Sensor */ #define X86_FEATURE_IDA (14 * 32 + 1) /* Intel Dynamic Acceleration */ #define X86_FEATURE_ARAT (14 * 32 + 2) /* Always Running APIC Timer */ #define X86_FEATURE_PLN (14 * 32 + 4) /* Intel Power Limit Notification */ #define X86_FEATURE_PTS (14 * 32 + 6) /* Intel Package Thermal Status */ #define X86_FEATURE_HWP (14 * 32 + 7) /* Intel Hardware P-states */ #define X86_FEATURE_HWP_NOTIFY (14 * 32 + 8) /* HWP Notification */ #define X86_FEATURE_HWP_ACT_WINDOW (14 * 32 + 9) /* HWP Activity Window */ #define X86_FEATURE_HWP_EPP (14 * 32 + 10) /* HWP Energy Perf. Preference */ #define X86_FEATURE_HWP_PKG_REQ (14 * 32 + 11) /* HWP Package Level Request */ #define X86_FEATURE_HDC (14 * 32 + 13) /* HDC base registers present */ /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ #define X86_FEATURE_NPT (15 * 32 + 0) /* Nested Page Table support */ #define X86_FEATURE_LBRV (15 * 32 + 1) /* LBR Virtualization support */ #define X86_FEATURE_SVML (15 * 32 + 2) /* "svm_lock" SVM locking MSR */ #define X86_FEATURE_NRIPS (15 * 32 + 3) /* "nrip_save" SVM next_rip save */ #define X86_FEATURE_TSCRATEMSR (15 * 32 + 4) /* "tsc_scale" TSC scaling support */ #define X86_FEATURE_VMCBCLEAN (15 * 32 + 5) /* "vmcb_clean" VMCB clean bits support */ #define X86_FEATURE_FLUSHBYASID (15 * 32 + 6) /* flush-by-ASID support */ #define X86_FEATURE_DECODEASSISTS (15 * 32 + 7) /* Decode Assists support */ #define X86_FEATURE_PAUSEFILTER (15 * 32 + 10) /* filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD (15 * 32 + 12) /* pause filter threshold */ #define X86_FEATURE_AVIC (15 * 32 + 13) /* Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15 * 32 + 15) /* Virtual VMSAVE VMLOAD */ #define X86_FEATURE_VGIF (15 * 32 + 16) /* Virtual GIF */ /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 16 */ #define X86_FEATURE_CQM_LLC 
(16 * 32 + 1) /* LLC QoS if 1 */ /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17 * 32 + 0) /* MCA overflow recovery support */ #define X86_FEATURE_SUCCOR (17 * 32 + 1) /* Uncorrectable error containment and recovery */ #define X86_FEATURE_SMCA (17 * 32 + 3) /* Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18 * 32 + 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18 * 32 + 3) /* AVX-512 Multiply Accumulation Single precision */ #define X86_FEATURE_PCONFIG (18 * 32 + 18) /* Intel PCONFIG */ #define X86_FEATURE_SPEC_CTRL (18 * 32 + 26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18 * 32 + 27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ARCH_CAPABILITIES (18 * 32 + 29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ #define X86_FEATURE_SPEC_CTRL_SSBD (18 * 32 + 31) /* "" Speculative Store Bypass Disable */ enum { X86_VENDOR_INTEL = 0, X86_VENDOR_AMD = 1, X86_VENDOR_MAX }; struct cpuinfo_x86 { /* cpu context */ uint8_t x86_family; uint8_t x86_vendor; uint8_t x86_model; uint8_t x86_mask; uint32_t x86_capability[NCAPINTS]; uint32_t x86_power; uint32_t extended_cpuid_level; int cpuid_level; char x86_vendor_id[16]; char x86_model_id[64]; /* fpu context */ uint64_t xfeatures_mask; uint32_t xsave_size_max; uint32_t xsave_size; uint32_t xstate_offsets[XFEATURE_MAX]; uint32_t xstate_sizes[XFEATURE_MAX]; uint32_t xsaves_size; uint32_t xstate_comp_offsets[XFEATURE_MAX]; uint32_t xstate_comp_sizes[XFEATURE_MAX]; }; typedef struct cpuinfo_x86 compel_cpuinfo_t; #endif /* __CR_ASM_CPU_H__ */ crac-criu-1.5.0/compel/arch/x86/src/lib/include/uapi/asm/fpu.h000066400000000000000000000214611471504326700236640ustar00rootroot00000000000000#ifndef __CR_ASM_FPU_H__ #define __CR_ASM_FPU_H__ #include #include #include #include #define FP_MIN_ALIGN_BYTES 64 #define 
FXSAVE_ALIGN_BYTES 16 #define FP_XSTATE_MAGIC1 0x46505853U #define FP_XSTATE_MAGIC2 0x46505845U #ifndef FP_XSTATE_MAGIC2_SIZE #define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2) #endif #define XSTATE_FP 0x1 #define XSTATE_SSE 0x2 #define XSTATE_YMM 0x4 #define FXSAVE_SIZE 512 /* * This used to be 4096 (one page). There is a comment below concerning * this size: * "One page should be enough for the whole xsave state ;-)" * Which is kind of funny as it is no longer enough ;-) * * Older CPUs: * # cpuid -1 -l 0xd -s 0 * ... * bytes required by XSAVE/XRSTOR area = 0x00000988 (2440) * * Newer CPUs (Sapphire Rapids): * # cpuid -1 -l 0xd -s 0 * ... * bytes required by XSAVE/XRSTOR area = 0x00002b00 (11008) * * So one page is no longer enough... But: * * Four pages should be enough for the whole xsave state ;-) */ #define XSAVE_SIZE 4*4096 #define XSAVE_HDR_SIZE 64 #define XSAVE_HDR_OFFSET FXSAVE_SIZE #define XSAVE_YMM_SIZE 256 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) /* * List of XSAVE features Linux knows about: */ enum xfeature { XFEATURE_FP, XFEATURE_SSE, /* * Values above here are "legacy states". * Those below are "extended states". 
*/ XFEATURE_YMM, XFEATURE_BNDREGS, XFEATURE_BNDCSR, XFEATURE_OPMASK, XFEATURE_ZMM_Hi256, XFEATURE_Hi16_ZMM, XFEATURE_PT, XFEATURE_PKRU, XFEATURE_HDC, XFEATURE_MAX, }; #define XSTATE_CPUID 0x0000000d #define XFEATURE_MASK_FP (1 << XFEATURE_FP) #define XFEATURE_MASK_SSE (1 << XFEATURE_SSE) #define XFEATURE_MASK_YMM (1 << XFEATURE_YMM) #define XFEATURE_MASK_BNDREGS (1 << XFEATURE_BNDREGS) #define XFEATURE_MASK_BNDCSR (1 << XFEATURE_BNDCSR) #define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK) #define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256) #define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) #define XFEATURE_MASK_PT (1 << XFEATURE_PT) #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) #define XFEATURE_MASK_HDC (1 << XFEATURE_HDC) #define XFEATURE_MASK_MAX (1 << XFEATURE_MAX) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) #define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) #define FIRST_EXTENDED_XFEATURE XFEATURE_YMM /* Supervisor features */ #define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT | XFEATURE_HDC) /* All currently supported features */ #define XFEATURE_MASK_USER \ (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM | XFEATURE_MASK_PKRU | XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR) /* xsave structure features which is safe to fill with garbage (see validate_random_xstate()) */ #define XFEATURE_MASK_FAULTINJ \ (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM) struct fpx_sw_bytes { uint32_t magic1; uint32_t extended_size; uint64_t xstate_bv; uint32_t xstate_size; uint32_t padding[7]; }; struct i387_fxsave_struct { uint16_t cwd; /* Control Word */ uint16_t swd; /* Status Word */ uint16_t twd; /* Tag Word */ uint16_t fop; /* Last Instruction Opcode */ union { struct { uint64_t rip; /* Instruction Pointer */ uint64_t 
rdp; /* Data Pointer */ }; struct { uint32_t fip; /* FPU IP Offset */ uint32_t fcs; /* FPU IP Selector */ uint32_t foo; /* FPU Operand Offset */ uint32_t fos; /* FPU Operand Selector */ }; }; uint32_t mxcsr; /* MXCSR Register State */ uint32_t mxcsr_mask; /* MXCSR Mask */ /* 8*16 bytes for each FP-reg = 128 bytes */ uint32_t st_space[32]; /* 16*16 bytes for each XMM-reg = 256 bytes */ uint32_t xmm_space[64]; uint32_t padding[12]; union { uint32_t padding1[12]; uint32_t sw_reserved[12]; }; } __aligned(FXSAVE_ALIGN_BYTES); struct xsave_hdr_struct { uint64_t xstate_bv; uint64_t xcomp_bv; uint64_t reserved[6]; } __packed; /* * xstate_header.xcomp_bv[63] indicates that the extended_state_area * is in compacted format. */ #define XCOMP_BV_COMPACTED_FORMAT ((uint64_t)1 << 63) /* * State component 2: * * There are 16x 256-bit AVX registers named YMM0-YMM15. * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15) * and are stored in 'struct fxregs_state::xmm_space[]' in the * "legacy" area. * * The high 128 bits are stored here. */ struct ymmh_struct { uint32_t ymmh_space[64]; } __packed; /* Intel MPX support: */ struct mpx_bndreg { uint64_t lower_bound; uint64_t upper_bound; } __packed; /* * State component 3 is used for the 4 128-bit bounds registers */ struct mpx_bndreg_state { struct mpx_bndreg bndreg[4]; } __packed; /* * State component 4 is used for the 64-bit user-mode MPX * configuration register BNDCFGU and the 64-bit MPX status * register BNDSTATUS. We call the pair "BNDCSR". */ struct mpx_bndcsr { uint64_t bndcfgu; uint64_t bndstatus; } __packed; /* * The BNDCSR state is padded out to be 64-bytes in size. */ struct mpx_bndcsr_state { union { struct mpx_bndcsr bndcsr; uint8_t pad_to_64_bytes[64]; }; } __packed; /* AVX-512 Components: */ /* * State component 5 is used for the 8 64-bit opmask registers * k0-k7 (opmask state). 
*/ struct avx_512_opmask_state { uint64_t opmask_reg[8]; } __packed; /* * State component 6 is used for the upper 256 bits of the * registers ZMM0-ZMM15. These 16 256-bit values are denoted * ZMM0_H-ZMM15_H (ZMM_Hi256 state). */ struct avx_512_zmm_uppers_state { uint64_t zmm_upper[16 * 4]; } __packed; /* * State component 7 is used for the 16 512-bit registers * ZMM16-ZMM31 (Hi16_ZMM state). */ struct avx_512_hi16_state { uint64_t hi16_zmm[16 * 8]; } __packed; /* * State component 9: 32-bit PKRU register. The state is * 8 bytes long but only 4 bytes is used currently. */ struct pkru_state { uint32_t pkru; uint32_t pad; } __packed; /* * This is our most modern FPU state format, as saved by the XSAVE * and restored by the XRSTOR instructions. * * It consists of a legacy fxregs portion, an xstate header and * subsequent areas as defined by the xstate header. Not all CPUs * support all the extensions, so the size of the extended area * can vary quite a bit between CPUs. * * * One page should be enough for the whole xsave state ;-) * * Of course it was not ;-) Now using four pages... * */ #define EXTENDED_STATE_AREA_SIZE (XSAVE_SIZE - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct)) /* * cpu requires it to be 64 byte aligned */ struct xsave_struct { struct i387_fxsave_struct i387; struct xsave_hdr_struct xsave_hdr; union { /* * This ymmh is unndeed, for * backward compatibility. */ struct ymmh_struct ymmh; uint8_t extended_state_area[EXTENDED_STATE_AREA_SIZE]; }; } __aligned(FP_MIN_ALIGN_BYTES) __packed; struct xsave_struct_ia32 { struct i387_fxsave_struct i387; struct xsave_hdr_struct xsave_hdr; union { /* * This ymmh is unndeed, for * backward compatibility. */ struct ymmh_struct ymmh; uint8_t extended_state_area[EXTENDED_STATE_AREA_SIZE]; }; }; typedef struct { /* * The FPU xsave area must be continuous and FP_MIN_ALIGN_BYTES * aligned, thus make sure the compiler won't insert any hole here. 
*/ union { struct xsave_struct xsave; uint8_t __pad[sizeof(struct xsave_struct) + FP_XSTATE_MAGIC2_SIZE]; }; uint8_t has_fpu; } fpu_state_64_t; struct user_i387_ia32_struct { uint32_t cwd; /* FPU Control Word */ uint32_t swd; /* FPU Status Word */ uint32_t twd; /* FPU Tag Word */ uint32_t fip; /* FPU IP Offset */ uint32_t fcs; /* FPU IP Selector */ uint32_t foo; /* FPU Operand Pointer Offset */ uint32_t fos; /* FPU Operand Pointer Selector */ uint32_t st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ }; typedef struct { struct { struct user_i387_ia32_struct i387_ia32; /* Software status information [not touched by FSAVE]: */ uint32_t status; } fregs_state; union { struct xsave_struct_ia32 xsave; uint8_t __pad[sizeof(struct xsave_struct) + FP_XSTATE_MAGIC2_SIZE]; } __aligned(FXSAVE_ALIGN_BYTES); } __aligned(FXSAVE_ALIGN_BYTES) fpu_state_ia32_t; /* * This one is used in restorer. */ typedef struct { union { fpu_state_64_t fpu_state_64; struct { /* fpu_state_ia32->xsave has to be 64-byte aligned. */ uint32_t __pad[2]; fpu_state_ia32_t fpu_state_ia32; }; }; uint8_t has_fpu; } fpu_state_t; extern void compel_convert_from_fxsr(struct user_i387_ia32_struct *env, struct i387_fxsave_struct *fxsave); #endif /* __CR_ASM_FPU_H__ */ crac-criu-1.5.0/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h000066400000000000000000000072201471504326700255010ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_TYPES_H__ #define UAPI_COMPEL_ASM_TYPES_H__ #include #include #include #include #define SIGMAX 64 #define SIGMAX_OLD 31 #define ARCH_HAS_PTRACE_GET_THREAD_AREA /* * Linux preserves three TLS segments in GDT. * Offsets in GDT differ between 32-bit and 64-bit machines. * For 64-bit x86 those GDT offsets are the same * for native and compat tasks. 
*/ #define GDT_ENTRY_TLS_MIN 12 #define GDT_ENTRY_TLS_MAX 14 #define GDT_ENTRY_TLS_NUM 3 typedef struct { user_desc_t desc[GDT_ENTRY_TLS_NUM]; } tls_t; struct thread_ctx; struct parasite_ctl; struct parasite_thread_ctl; extern int __compel_arch_fetch_thread_area(int tid, struct thread_ctx *th); extern int compel_arch_fetch_thread_area(struct parasite_thread_ctl *tctl); extern void compel_arch_get_tls_thread(struct parasite_thread_ctl *tctl, tls_t *out); extern void compel_arch_get_tls_task(struct parasite_ctl *ctl, tls_t *out); typedef struct { uint64_t r15; uint64_t r14; uint64_t r13; uint64_t r12; uint64_t bp; uint64_t bx; uint64_t r11; uint64_t r10; uint64_t r9; uint64_t r8; uint64_t ax; uint64_t cx; uint64_t dx; uint64_t si; uint64_t di; uint64_t orig_ax; uint64_t ip; uint64_t cs; uint64_t flags; uint64_t sp; uint64_t ss; uint64_t fs_base; uint64_t gs_base; uint64_t ds; uint64_t es; uint64_t fs; uint64_t gs; } user_regs_struct64; typedef struct { uint32_t bx; uint32_t cx; uint32_t dx; uint32_t si; uint32_t di; uint32_t bp; uint32_t ax; uint32_t ds; uint32_t es; uint32_t fs; uint32_t gs; uint32_t orig_ax; uint32_t ip; uint32_t cs; uint32_t flags; uint32_t sp; uint32_t ss; } user_regs_struct32; /* * To be sure that we rely on inited reg->__is_native, this member * is (short int) instead of initial (bool). The right way to * check if regs are native or compat is to use user_regs_native() macro. * This should cost nothing, as *usually* sizeof(bool) == sizeof(short) */ typedef struct { union { user_regs_struct64 native; user_regs_struct32 compat; }; short __is_native; /* use user_regs_native macro to check it */ } user_regs_struct_t; #define NATIVE_MAGIC 0x0A #define COMPAT_MAGIC 0x0C static inline bool user_regs_native(user_regs_struct_t *pregs) { return pregs->__is_native == NATIVE_MAGIC; } #define get_user_reg(pregs, name) ((user_regs_native(pregs)) ? 
((pregs)->native.name) : ((pregs)->compat.name)) #define set_user_reg(pregs, name, val) \ ((user_regs_native(pregs)) ? ((pregs)->native.name = (val)) : ((pregs)->compat.name = (val))) #if 0 typedef struct { unsigned short cwd; unsigned short swd; unsigned short twd; /* Note this is not the same as the 32bit/x87/FSAVE twd */ unsigned short fop; u64 rip; u64 rdp; u32 mxcsr; u32 mxcsr_mask; u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ u32 padding[24]; } user_fpregs_struct_t; #endif typedef struct xsave_struct user_fpregs_struct_t; #define REG_RES(regs) get_user_reg(®s, ax) #define REG_IP(regs) get_user_reg(®s, ip) #define SET_REG_IP(regs, val) set_user_reg(®s, ip, val) #define REG_SP(regs) get_user_reg(®s, sp) #define REG_SYSCALL_NR(regs) get_user_reg(®s, orig_ax) #define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall) /* * For x86_32 __NR_mmap inside the kernel represents old_mmap system * call, but since we didn't use it yet lets go further and simply * define own alias for __NR_mmap2 which would allow us to unify code * between 32 and 64 bits version. 
*/ #define __NR32_mmap __NR32_mmap2 #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ crac-criu-1.5.0/compel/arch/x86/src/lib/include/uapi/asm/processor-flags.h000066400000000000000000000021701471504326700261770ustar00rootroot00000000000000#ifndef __CR_PROCESSOR_FLAGS_H__ #define __CR_PROCESSOR_FLAGS_H__ /* Taken from linux kernel headers */ /* * EFLAGS bits */ #define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ #define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */ #define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ #define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */ #define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ #define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ #define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ #define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ #define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ #define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ #define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ #define X86_EFLAGS_NT 0x00004000 /* Nested Task */ #define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ #define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ #define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ #define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ #define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ #define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ #endif /* __CR_PROCESSOR_FLAGS_H__ */ crac-criu-1.5.0/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h000066400000000000000000000114761471504326700246740ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ #include #include #include #include #include #define SIGFRAME_MAX_OFFSET 8 struct rt_sigcontext { uint64_t r8; uint64_t r9; uint64_t r10; uint64_t r11; uint64_t r12; uint64_t r13; uint64_t r14; uint64_t r15; uint64_t rdi; uint64_t rsi; uint64_t rbp; uint64_t rbx; uint64_t rdx; uint64_t rax; uint64_t rcx; uint64_t rsp; uint64_t rip; uint64_t eflags; uint16_t cs; uint16_t gs; uint16_t fs; uint16_t ss; 
uint64_t err; uint64_t trapno; uint64_t oldmask; uint64_t cr2; uint64_t fpstate; uint64_t reserved1[8]; }; struct rt_sigcontext_32 { uint32_t gs; uint32_t fs; uint32_t es; uint32_t ds; uint32_t di; uint32_t si; uint32_t bp; uint32_t sp; uint32_t bx; uint32_t dx; uint32_t cx; uint32_t ax; uint32_t trapno; uint32_t err; uint32_t ip; uint32_t cs; uint32_t flags; uint32_t sp_at_signal; uint32_t ss; uint32_t fpstate; uint32_t oldmask; uint32_t cr2; }; #include /* * XXX: move declarations to generic sigframe.h or sigframe-compat.h * when (if) other architectures will support compatible C/R */ typedef uint32_t compat_uptr_t; typedef uint32_t compat_size_t; typedef uint32_t compat_sigset_word; typedef struct compat_siginfo { int si_signo; int si_errno; int si_code; int _pad[128 / sizeof(int) - 3]; } compat_siginfo_t; typedef struct compat_sigaltstack { compat_uptr_t ss_sp; int ss_flags; compat_size_t ss_size; } compat_stack_t; #define _COMPAT_NSIG 64 #define _COMPAT_NSIG_BPW 32 #define _COMPAT_NSIG_WORDS (_COMPAT_NSIG / _COMPAT_NSIG_BPW) typedef struct { compat_sigset_word sig[_COMPAT_NSIG_WORDS]; } compat_sigset_t; struct ucontext_ia32 { unsigned int uc_flags; unsigned int uc_link; compat_stack_t uc_stack; struct rt_sigcontext_32 uc_mcontext; compat_sigset_t uc_sigmask; /* mask last for extensibility */ }; struct rt_sigframe_ia32 { uint32_t pretcode; int32_t sig; uint32_t pinfo; uint32_t puc; compat_siginfo_t info; struct ucontext_ia32 uc; char retcode[8]; /* fp state follows here */ fpu_state_t fpu_state; }; struct rt_sigframe_64 { char *pretcode; struct rt_ucontext uc; struct rt_siginfo info; /* fp state follows here */ fpu_state_t fpu_state; }; struct rt_sigframe { union { struct rt_sigframe_ia32 compat; struct rt_sigframe_64 native; }; bool is_native; }; static inline void rt_sigframe_copy_sigset(struct rt_sigframe *to, k_rtsigset_t *from) { size_t sz = sizeof(k_rtsigset_t); BUILD_BUG_ON(sz != sizeof(compat_sigset_t)); if (to->is_native) 
memcpy(&to->native.uc.uc_sigmask, from, sz); else memcpy(&to->compat.uc.uc_sigmask, from, sz); } static inline void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) { size_t sz = sizeof(k_rtsigset_t); if (sigframe->is_native) memset(&sigframe->native.uc.uc_sigmask, 0, sz); else memset(&sigframe->compat.uc.uc_sigmask, 0, sz); } #define RT_SIGFRAME_REGIP(rt_sigframe) \ ((rt_sigframe->is_native) ? (rt_sigframe)->native.uc.uc_mcontext.rip : (rt_sigframe)->compat.uc.uc_mcontext.ip) #define RT_SIGFRAME_FPU(rt_sigframe) \ ((rt_sigframe->is_native) ? (&(rt_sigframe)->native.fpu_state) : (&(rt_sigframe)->compat.fpu_state)) #define RT_SIGFRAME_HAS_FPU(rt_sigframe) (RT_SIGFRAME_FPU(rt_sigframe)->has_fpu) /* * Sigframe offset is different for native/compat tasks. * Offsets calculations one may see at kernel: * - compatible is in sys32_rt_sigreturn at arch/x86/ia32/ia32_signal.c * - native is in sys_rt_sigreturn at arch/x86/kernel/signal.c */ #define RT_SIGFRAME_OFFSET(rt_sigframe) (((rt_sigframe)->is_native) ? 
8 : 4) #define USER32_CS 0x23 /* clang-format off */ #define ARCH_RT_SIGRETURN_NATIVE(new_sp) \ asm volatile( \ "movq %0, %%rax \n" \ "movq %%rax, %%rsp \n" \ "movl $"__stringify(__NR_rt_sigreturn)", %%eax \n" \ "syscall \n" \ : \ : "r"(new_sp) \ : "rax","memory") #define ARCH_RT_SIGRETURN_COMPAT(new_sp) \ asm volatile( \ "pushq $"__stringify(USER32_CS)" \n" \ "xor %%rax, %%rax \n" \ "movl $1f, %%eax \n" \ "pushq %%rax \n" \ "lretq \n" \ "1: \n" \ ".code32 \n" \ "movl %%edi, %%esp \n" \ "movl $"__stringify(__NR32_rt_sigreturn)",%%eax \n" \ "int $0x80 \n" \ ".code64 \n" \ : \ : "rdi"(new_sp) \ : "eax", "r8", "r9", "r10", "r11", "memory") #define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ do { \ if ((rt_sigframe)->is_native) \ ARCH_RT_SIGRETURN_NATIVE(new_sp); \ else \ ARCH_RT_SIGRETURN_COMPAT(new_sp); \ } while (0) /* clang-format off */ int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe); #endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ crac-criu-1.5.0/compel/arch/x86/src/lib/infect.c000066400000000000000000000434451471504326700212020ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "asm/cpu.h" #include #include #include "errno.h" #include #include #include "common/err.h" #include "asm/infect-types.h" #include "ptrace.h" #include "infect.h" #include "infect-priv.h" #include "log.h" #ifndef NT_X86_XSTATE #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ #endif #ifndef NT_PRSTATUS #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ #endif /* * Injected syscall instruction */ const char code_syscall[] = { 0x0f, 0x05, /* syscall */ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */ }; const char code_int_80[] = { 0xcd, 0x80, /* int $0x80 */ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... 
*/ }; static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); static const int code_int_80_aligned = round_up(sizeof(code_syscall), sizeof(long)); static inline __always_unused void __check_code_syscall(void) { BUILD_BUG_ON(code_int_80_aligned != BUILTIN_SYSCALL_SIZE); BUILD_BUG_ON(code_syscall_aligned != BUILTIN_SYSCALL_SIZE); BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); } /* 10-byte legacy floating point register */ struct fpreg { uint16_t significand[4]; uint16_t exponent; }; /* 16-byte floating point register */ struct fpxreg { uint16_t significand[4]; uint16_t exponent; uint16_t padding[3]; }; #define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n)*16) #define FP_EXP_TAG_VALID 0 #define FP_EXP_TAG_ZERO 1 #define FP_EXP_TAG_SPECIAL 2 #define FP_EXP_TAG_EMPTY 3 static inline uint32_t twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) { struct fpxreg *st; uint32_t tos = (fxsave->swd >> 11) & 7; uint32_t twd = (unsigned long)fxsave->twd; uint32_t tag; uint32_t ret = 0xffff0000u; int i; for (i = 0; i < 8; i++, twd >>= 1) { if (twd & 0x1) { st = FPREG_ADDR(fxsave, (i - tos) & 7); switch (st->exponent & 0x7fff) { case 0x7fff: tag = FP_EXP_TAG_SPECIAL; break; case 0x0000: if (!st->significand[0] && !st->significand[1] && !st->significand[2] && !st->significand[3]) tag = FP_EXP_TAG_ZERO; else tag = FP_EXP_TAG_SPECIAL; break; default: if (st->significand[3] & 0x8000) tag = FP_EXP_TAG_VALID; else tag = FP_EXP_TAG_SPECIAL; break; } } else { tag = FP_EXP_TAG_EMPTY; } ret |= tag << (2 * i); } return ret; } void compel_convert_from_fxsr(struct user_i387_ia32_struct *env, struct i387_fxsave_struct *fxsave) { struct fpxreg *from = (struct fpxreg *)&fxsave->st_space[0]; struct fpreg *to = (struct fpreg *)env->st_space; int i; env->cwd = fxsave->cwd | 0xffff0000u; env->swd = fxsave->swd | 0xffff0000u; env->twd = twd_fxsr_to_i387(fxsave); env->fip = fxsave->rip; env->foo = fxsave->rdp; /* * should be actually ds/cs at fpu exception time, but * that 
information is not available in 64bit mode. */ env->fcs = 0x23; /* __USER32_CS */ env->fos = 0x2b; /* __USER32_DS */ env->fos |= 0xffff0000; for (i = 0; i < 8; ++i) memcpy(&to[i], &from[i], sizeof(to[0])); } int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { bool is_native = user_regs_native(regs); fpu_state_t *fpu_state = is_native ? &sigframe->native.fpu_state : &sigframe->compat.fpu_state; if (is_native) { #define cpreg64_native(d, s) sigframe->native.uc.uc_mcontext.d = regs->native.s cpreg64_native(rdi, di); cpreg64_native(rsi, si); cpreg64_native(rbp, bp); cpreg64_native(rsp, sp); cpreg64_native(rbx, bx); cpreg64_native(rdx, dx); cpreg64_native(rcx, cx); cpreg64_native(rip, ip); cpreg64_native(rax, ax); cpreg64_native(r8, r8); cpreg64_native(r9, r9); cpreg64_native(r10, r10); cpreg64_native(r11, r11); cpreg64_native(r12, r12); cpreg64_native(r13, r13); cpreg64_native(r14, r14); cpreg64_native(r15, r15); cpreg64_native(cs, cs); cpreg64_native(eflags, flags); sigframe->is_native = true; #undef cpreg64_native } else { #define cpreg32_compat(d) sigframe->compat.uc.uc_mcontext.d = regs->compat.d cpreg32_compat(gs); cpreg32_compat(fs); cpreg32_compat(es); cpreg32_compat(ds); cpreg32_compat(di); cpreg32_compat(si); cpreg32_compat(bp); cpreg32_compat(sp); cpreg32_compat(bx); cpreg32_compat(dx); cpreg32_compat(cx); cpreg32_compat(ip); cpreg32_compat(ax); cpreg32_compat(cs); cpreg32_compat(ss); cpreg32_compat(flags); #undef cpreg32_compat sigframe->is_native = false; } fpu_state->has_fpu = true; if (is_native) { memcpy(&fpu_state->fpu_state_64.xsave, fpregs, sizeof(*fpregs)); } else { memcpy(&fpu_state->fpu_state_ia32.xsave, fpregs, sizeof(*fpregs)); compel_convert_from_fxsr(&fpu_state->fpu_state_ia32.fregs_state.i387_ia32, &fpu_state->fpu_state_ia32.xsave.i387); } return 0; } int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { fpu_state_t *fpu_state = 
(sigframe->is_native) ? &rsigframe->native.fpu_state : &rsigframe->compat.fpu_state; if (sigframe->is_native) { unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_64.xsave; if ((addr % 64ul)) { pr_err("Unaligned address passed: %lx (native %d)\n", addr, sigframe->is_native); return -1; } sigframe->native.uc.uc_mcontext.fpstate = (uint64_t)addr; } else if (!sigframe->is_native) { sigframe->compat.uc.uc_mcontext.fpstate = (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; } return 0; } #define get_signed_user_reg(pregs, name) \ ((user_regs_native(pregs)) ? (int64_t)((pregs)->native.name) : (int32_t)((pregs)->compat.name)) static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) { if (ptrace(PTRACE_GETFPREGS, pid, NULL, xsave)) { pr_perror("Can't obtain FPU registers for %d", pid); return -1; } return 0; } static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) { struct iovec iov; iov.iov_base = xsave; iov.iov_len = sizeof(*xsave); if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_XSTATE, &iov) < 0) { pr_perror("Can't obtain FPU registers for %d", pid); return -1; } if ((xsave->xsave_hdr.xstate_bv & 3) != 3) { // Due to init-optimisation [1] x87 FPU or SSE state may not be filled in. // Since those are restored unconditionally, make sure the init values are // filled by retrying with old PTRACE_GETFPREGS. // // [1] Intel® 64 and IA-32 Architectures Software Developer's // Manual Volume 1: Basic Architecture // Section 13.6: Processor tracking of XSAVE-managed state return get_task_fpregs(pid, xsave); } return 0; } static inline void fixup_mxcsr(struct xsave_struct *xsave) { /* * Right now xsave->i387.mxcsr filled with the random garbage, * let's make it valid by applying mask which allows all * features, except the denormals-are-zero feature bit. 
* * See also fpu__init_system_mxcsr function: * https://github.com/torvalds/linux/blob/8cb1ae19/arch/x86/kernel/fpu/init.c#L117 */ xsave->i387.mxcsr &= 0x0000ffbf; } /* See arch/x86/kernel/fpu/xstate.c */ static void validate_random_xstate(struct xsave_struct *xsave) { struct xsave_hdr_struct *hdr = &xsave->xsave_hdr; unsigned int i; /* No unknown or supervisor features may be set */ hdr->xstate_bv &= XFEATURE_MASK_USER; hdr->xstate_bv &= ~XFEATURE_MASK_SUPERVISOR; hdr->xstate_bv &= XFEATURE_MASK_FAULTINJ; for (i = 0; i < XFEATURE_MAX; i++) { if (!compel_fpu_has_feature(i)) hdr->xstate_bv &= ~(1 << i); } /* Userspace must use the uncompacted format */ hdr->xcomp_bv = 0; /* * If 'reserved' is shrunken to add a new field, make sure to validate * that new field here! */ BUILD_BUG_ON(sizeof(hdr->reserved) != 48); /* No reserved bits may be set */ memset(&hdr->reserved, 0, sizeof(hdr->reserved)); } /* * TODO: Put fault-injection under CONFIG_* and move * extended regset corruption to generic code */ static int corrupt_extregs(pid_t pid) { bool use_xsave = compel_cpu_has_feature(X86_FEATURE_OSXSAVE); user_fpregs_struct_t ext_regs; int *rand_to = (int *)&ext_regs; unsigned int seed, init_seed; size_t i; init_seed = seed = time(NULL); for (i = 0; i < sizeof(ext_regs) / sizeof(int); i++) *rand_to++ = rand_r(&seed); /* * Error log-level as: * - not intended to be used outside of testing; * - zdtm.py will grep it auto-magically from logs * (and the seed will be known from automatic testing). */ pr_err("Corrupting %s for %d, seed %u\n", use_xsave ? 
"xsave" : "fpuregs", pid, init_seed); fixup_mxcsr(&ext_regs); if (!use_xsave) { if (ptrace(PTRACE_SETFPREGS, pid, NULL, &ext_regs)) { pr_perror("Can't set FPU registers for %d", pid); return -1; } } else { struct iovec iov; validate_random_xstate((void *)&ext_regs); iov.iov_base = &ext_regs; iov.iov_len = sizeof(ext_regs); if (ptrace(PTRACE_SETREGSET, pid, (unsigned int)NT_X86_XSTATE, &iov) < 0) { pr_perror("Can't set xstate for %d", pid); return -1; } } return 0; } int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, unsigned long flags) { user_fpregs_struct_t xsave = {}, *xs = ext_regs ? ext_regs : &xsave; int ret = -1; pr_info("Dumping general registers for %d in %s mode\n", pid, user_regs_native(regs) ? "native" : "compat"); /* Did we come from a system call? */ if (get_signed_user_reg(regs, orig_ax) >= 0) { /* Restart the system call */ switch (get_signed_user_reg(regs, ax)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: set_user_reg(regs, ax, get_user_reg(regs, orig_ax)); set_user_reg(regs, ip, get_user_reg(regs, ip) - 2); break; case -ERESTART_RESTARTBLOCK: pr_warn("Will restore %d with interrupted system call\n", pid); set_user_reg(regs, ax, -EINTR); break; } } if (!compel_cpu_has_feature(X86_FEATURE_FPU)) goto out; /* * FPU fetched either via fxsave or via xsave, * thus decode it accordingly. 
*/ pr_info("Dumping GP/FPU registers for %d\n", pid); if (!compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) { ret = get_task_fpregs(pid, xs); } else if (unlikely(flags & INFECT_X86_PTRACE_MXCSR_BUG)) { /* * get_task_fpregs() will fill FP state, * get_task_xsave() will overwrite rightly sse/mmx/etc */ pr_warn("Skylake xsave fpu bug workaround used\n"); ret = get_task_fpregs(pid, xs); if (!ret) ret = get_task_xsave(pid, xs); } else { ret = get_task_xsave(pid, xs); } if (!ret && unlikely(flags & INFECT_CORRUPT_EXTREGS)) ret = corrupt_extregs(pid); if (ret) goto err; out: ret = save(arg, regs, xs); err: return ret; } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) { struct iovec iov; pr_info("Restoring GP/FPU registers for %d\n", pid); if (!compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) { if (ptrace(PTRACE_SETFPREGS, pid, NULL, ext_regs)) { pr_perror("Can't set FPU registers for %d", pid); return -1; } return 0; } iov.iov_base = ext_regs; iov.iov_len = sizeof(*ext_regs); if (ptrace(PTRACE_SETREGSET, pid, (unsigned int)NT_X86_XSTATE, &iov) < 0) { pr_perror("Can't set FPU registers for %d", pid); return -1; } return 0; } int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { user_regs_struct_t regs = ctl->orig.regs; bool native = user_regs_native(®s); int err; if (native) { user_regs_struct64 *r = ®s.native; r->ax = (uint64_t)nr; r->di = arg1; r->si = arg2; r->dx = arg3; r->r10 = arg4; r->r8 = arg5; r->r9 = arg6; err = compel_execute_syscall(ctl, ®s, code_syscall); } else { user_regs_struct32 *r = ®s.compat; r->ax = (uint32_t)nr; r->bx = arg1; r->cx = arg2; r->dx = arg3; r->si = arg4; r->di = arg5; r->bp = arg6; err = compel_execute_syscall(ctl, ®s, code_int_80); } *ret = native ? 
(long)get_user_reg(®s, ax) : (int)get_user_reg(®s, ax); return err; } void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { long map; int err; bool compat_task = !user_regs_native(&ctl->orig.regs); err = compel_syscall(ctl, __NR(mmap, compat_task), &map, (unsigned long)addr, length, prot, flags, fd, offset); if (err < 0) return NULL; if (map == -EACCES && (prot & PROT_WRITE) && (prot & PROT_EXEC)) { pr_warn("mmap(PROT_WRITE | PROT_EXEC) failed for %d, " "check selinux execmem policy\n", ctl->rpid); return NULL; } if (IS_ERR_VALUE(map)) { pr_err("remote mmap() failed: %s\n", strerror(-map)); return NULL; } /* * For compat tasks the address in foreign process * must lay inside 4 bytes. */ if (compat_task) map &= 0xfffffffful; return (void *)map; } /* * regs must be inited when calling this function from original context */ void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { set_user_reg(regs, ip, new_ip); if (stack) set_user_reg(regs, sp, (unsigned long)stack); /* Avoid end of syscall processing */ set_user_reg(regs, orig_ax, -1); /* Make sure flags are in known state */ set_user_reg(regs, flags, get_user_reg(regs, flags) & ~(X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_IF)); } #define USER32_CS 0x23 #define USER_CS 0x33 static bool ldt_task_selectors(pid_t pid) { unsigned long cs; errno = 0; /* * Offset of register must be from 64-bit set even for * compatible tasks. 
Fix this to support native i386 tasks */ cs = ptrace(PTRACE_PEEKUSER, pid, offsetof(user_regs_struct64, cs), 0); if (errno != 0) { pr_perror("Can't get CS register for %d", pid); return -1; } return cs != USER_CS && cs != USER32_CS; } static int arch_task_compatible(pid_t pid) { user_regs_struct_t r; int ret = ptrace_get_regs(pid, &r); if (ret) return -1; return !user_regs_native(&r); } bool arch_can_dump_task(struct parasite_ctl *ctl) { pid_t pid = ctl->rpid; int ret; ret = arch_task_compatible(pid); if (ret < 0) return false; if (ret && !(ctl->ictx.flags & INFECT_COMPATIBLE)) { pr_err("Can't dump task %d running in 32-bit mode\n", pid); return false; } if (ldt_task_selectors(pid)) { pr_err("Can't dump task %d with LDT descriptors\n", pid); return false; } return true; } int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) { int native = compel_mode_native(ctl); void *where = native ? (void *)&s->native.uc.uc_stack : (void *)&s->compat.uc.uc_stack; long ret; int err; err = compel_syscall(ctl, __NR(sigaltstack, !native), &ret, 0, (unsigned long)where, 0, 0, 0, 0); return err ? err : ret; } /* Copied from the gdb header gdb/nat/x86-dregs.h */ /* Debug registers' indices. */ #define DR_FIRSTADDR 0 #define DR_LASTADDR 3 #define DR_NADDR 4 /* The number of debug address registers. */ #define DR_STATUS 6 /* Index of debug status register (DR6). */ #define DR_CONTROL 7 /* Index of debug control register (DR7). */ #define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit. */ #define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit. */ #define DR_ENABLE_SIZE 2 /* Two enable bits per debug register. */ /* Locally enable the break/watchpoint in the I'th debug register. 
*/ #define X86_DR_LOCAL_ENABLE(i) (1 << (DR_LOCAL_ENABLE_SHIFT + DR_ENABLE_SIZE * (i))) int ptrace_set_breakpoint(pid_t pid, void *addr) { k_rtsigset_t block; int ret; /* Set a breakpoint */ if (ptrace(PTRACE_POKEUSER, pid, offsetof(struct user, u_debugreg[DR_FIRSTADDR]), addr)) { pr_perror("Unable to setup a breakpoint into %d", pid); return -1; } /* Enable the breakpoint */ if (ptrace(PTRACE_POKEUSER, pid, offsetof(struct user, u_debugreg[DR_CONTROL]), X86_DR_LOCAL_ENABLE(DR_FIRSTADDR))) { pr_perror("Unable to enable the breakpoint for %d", pid); return -1; } /* * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler * will be reset to the default one. */ ksigfillset(&block); ksigdelset(&block, SIGTRAP); if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { pr_perror("Can't block signals for %d", pid); return -1; } ret = ptrace(PTRACE_CONT, pid, NULL, NULL); if (ret) { pr_perror("Unable to restart the stopped tracee process %d", pid); return -1; } return 1; } int ptrace_flush_breakpoints(pid_t pid) { /* Disable the breakpoint */ if (ptrace(PTRACE_POKEUSER, pid, offsetof(struct user, u_debugreg[DR_CONTROL]), 0)) { pr_perror("Unable to disable the breakpoint for %d", pid); return -1; } return 0; } int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs) { struct iovec iov; int ret; iov.iov_base = ®s->native; iov.iov_len = sizeof(user_regs_struct64); ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); if (ret == -1) { pr_perror("PTRACE_GETREGSET failed"); return -1; } if (iov.iov_len == sizeof(regs->native)) { regs->__is_native = NATIVE_MAGIC; return ret; } if (iov.iov_len == sizeof(regs->compat)) { regs->__is_native = COMPAT_MAGIC; return ret; } pr_err("PTRACE_GETREGSET read %zu bytes for pid %d, but native/compat regs sizes are %zu/%zu bytes\n", iov.iov_len, pid, sizeof(regs->native), sizeof(regs->compat)); return -1; } int ptrace_set_regs(pid_t pid, user_regs_struct_t *regs) { struct iovec iov; if (user_regs_native(regs)) { 
iov.iov_base = ®s->native; iov.iov_len = sizeof(user_regs_struct64); } else { iov.iov_base = ®s->compat; iov.iov_len = sizeof(user_regs_struct32); } return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); } #define TASK_SIZE ((1UL << 47) - PAGE_SIZE) /* * Task size may be limited to 3G but we need a * higher limit, because it's backward compatible. */ #define TASK_SIZE_IA32 (0xffffe000) unsigned long compel_task_size(void) { return TASK_SIZE; } crac-criu-1.5.0/compel/arch/x86/src/lib/thread_area.c000066400000000000000000000044041471504326700221610ustar00rootroot00000000000000#include #include #include #include #include "log.h" #include "asm/infect-types.h" #include "infect.h" #include "infect-priv.h" #ifndef PTRACE_GET_THREAD_AREA #define PTRACE_GET_THREAD_AREA 25 #endif /* * For 64-bit applications, TLS (fs_base for Glibc) is in MSR, * which are dumped with the help of ptrace() and restored with * arch_prctl(ARCH_SET_FS/ARCH_SET_GS). * * But SET_FS_BASE will update GDT if base pointer fits in 4 bytes. * Otherwise it will set only MSR, which allows for mixed 64/32-bit * code to use: 2 MSRs as TLS base _and_ 3 GDT entries. * Having in sum 5 TLS pointers, 3 of which are four bytes and * other two eight bytes: * struct thread_struct { * struct desc_struct tls_array[3]; * ... * #ifdef CONFIG_X86_64 * unsigned long fsbase; * unsigned long gsbase; * #endif * ... * }; * * Most x86_64 applications don't use GDT, but mixed code (i.e. Wine) * can use it. Be pessimistic and dump it for 64-bit applications too. 
*/ int __compel_arch_fetch_thread_area(int tid, struct thread_ctx *th) { bool native_mode = user_regs_native(&th->regs); tls_t *ptls = &th->tls; int err, i; /* Initialise as not present by default */ for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { user_desc_t *d = &ptls->desc[i]; memset(d, 0, sizeof(user_desc_t)); d->seg_not_present = 1; d->entry_number = GDT_ENTRY_TLS_MIN + i; } for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { user_desc_t *d = &ptls->desc[i]; err = ptrace(PTRACE_GET_THREAD_AREA, tid, GDT_ENTRY_TLS_MIN + i, d); if (err) { /* * Ignoring absent syscall on !CONFIG_IA32_EMULATION * where such mixed code can't run. * XXX: Add compile CONFIG_X86_IGNORE_64BIT_TLS * (for x86_64 systems with CONFIG_IA32_EMULATION) */ if (errno == EIO && native_mode) return 0; pr_perror("get_thread_area failed for %d", tid); return err; } } return 0; } int compel_arch_fetch_thread_area(struct parasite_thread_ctl *tctl) { return __compel_arch_fetch_thread_area(tctl->tid, &tctl->th); } void compel_arch_get_tls_task(struct parasite_ctl *ctl, tls_t *out) { memcpy(out, &ctl->orig.tls, sizeof(tls_t)); } void compel_arch_get_tls_thread(struct parasite_thread_ctl *tctl, tls_t *out) { memcpy(out, &tctl->th.tls, sizeof(tls_t)); } crac-criu-1.5.0/compel/compel-host000077500000000000000000000003641471504326700170400ustar00rootroot00000000000000#!/bin/sh # # A wrapper to use compel-host right from the source dir # (i.e. when it is not yet installed). 
COMPEL_UNINSTALLED_ROOTDIR=$(dirname "$0") export COMPEL_UNINSTALLED_ROOTDIR exec "${COMPEL_UNINSTALLED_ROOTDIR}/compel-host-bin" "$@" crac-criu-1.5.0/compel/include/000077500000000000000000000000001471504326700163005ustar00rootroot00000000000000crac-criu-1.5.0/compel/include/compel-cpu.h000066400000000000000000000006401471504326700205150ustar00rootroot00000000000000#ifndef __COMPEL_CPU_H__ #define __COMPEL_CPU_H__ #include #include "asm/cpu.h" extern void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature); extern void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature); extern int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature); extern int compel_test_fpu_cap(compel_cpuinfo_t *c, unsigned int feature); #endif crac-criu-1.5.0/compel/include/elf32-types.h000066400000000000000000000005631471504326700205320ustar00rootroot00000000000000#ifndef COMPEL_ELF32_TYPES_H__ #define COMPEL_ELF32_TYPES_H__ #define Elf_Ehdr Elf32_Ehdr #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Rel Elf32_Rel #define Elf_Rela Elf32_Rela #define ELF_ST_TYPE ELF32_ST_TYPE #define ELF_ST_BIND ELF32_ST_BIND #define ELF_R_SYM ELF32_R_SYM #define ELF_R_TYPE ELF32_R_TYPE #endif /* COMPEL_ELF32_TYPES_H__ */ crac-criu-1.5.0/compel/include/elf64-types.h000066400000000000000000000005631471504326700205370ustar00rootroot00000000000000#ifndef COMPEL_ELF64_TYPES_H__ #define COMPEL_ELF64_TYPES_H__ #define Elf_Ehdr Elf64_Ehdr #define Elf_Shdr Elf64_Shdr #define Elf_Sym Elf64_Sym #define Elf_Rel Elf64_Rel #define Elf_Rela Elf64_Rela #define ELF_ST_TYPE ELF64_ST_TYPE #define ELF_ST_BIND ELF64_ST_BIND #define ELF_R_SYM ELF64_R_SYM #define ELF_R_TYPE ELF64_R_TYPE #endif /* COMPEL_ELF64_TYPES_H__ */ crac-criu-1.5.0/compel/include/errno.h000066400000000000000000000003261471504326700175770ustar00rootroot00000000000000#ifndef __COMPEL_ERRNO_H__ #define __COMPEL_ERRNO_H__ #define ERESTARTSYS 512 #define ERESTARTNOINTR 513 #define ERESTARTNOHAND 514 
#define ERESTART_RESTARTBLOCK 516 #endif /* __CR_ERRNO_H__ */ crac-criu-1.5.0/compel/include/infect-priv.h000066400000000000000000000047321471504326700207050ustar00rootroot00000000000000#ifndef __COMPEL_INFECT_PRIV_H__ #define __COMPEL_INFECT_PRIV_H__ #include #define BUILTIN_SYSCALL_SIZE 8 struct thread_ctx { k_rtsigset_t sigmask; user_regs_struct_t regs; #ifdef ARCH_HAS_PTRACE_GET_THREAD_AREA tls_t tls; #endif user_fpregs_struct_t ext_regs; }; /* parasite control block */ struct parasite_ctl { int rpid; /* Real pid of the victim */ void *remote_map; void *local_map; void *sigreturn_addr; /* A place for the breakpoint */ unsigned long map_length; struct infect_ctx ictx; /* thread leader data */ bool daemonized; struct thread_ctx orig; void *rstack; /* thread leader stack*/ struct rt_sigframe *sigframe; struct rt_sigframe *rsigframe; /* address in a parasite */ void *r_thread_stack; /* stack for non-leader threads */ unsigned long parasite_ip; /* service routine start ip */ unsigned int *cmd; /* address for command */ void *args; /* address for arguments */ unsigned long args_size; int tsock; /* transport socket for transferring fds */ struct parasite_blob_desc pblob; }; struct parasite_thread_ctl { int tid; struct parasite_ctl *ctl; struct thread_ctx th; }; #define MEMFD_FNAME "CRIUMFD" #define MEMFD_FNAME_SZ sizeof(MEMFD_FNAME) struct ctl_msg; int parasite_wait_ack(int sockfd, unsigned int cmd, struct ctl_msg *m); extern void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs); extern void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset); extern bool arch_can_dump_task(struct parasite_ctl *ctl); /* * @regs: general purpose registers * @ext_regs: extended register set (fpu/mmx/sse/etc) * for task that is NULL, restored by sigframe on rt_sigreturn() * @save: callback to dump all info * @flags: see INFECT_* in infect_ctx::flags * @pid: mystery */ extern int 
compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, unsigned long flags); extern int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs); extern int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s); extern int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs); extern int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe); extern int compel_execute_syscall(struct parasite_ctl *ctl, user_regs_struct_t *regs, const char *code_syscall); #endif crac-criu-1.5.0/compel/include/log.h000066400000000000000000000033601471504326700172340ustar00rootroot00000000000000#ifndef COMPEL_LOG_H__ #define COMPEL_LOG_H__ #include #include #include "uapi/compel/log.h" #ifndef LOG_PREFIX #define LOG_PREFIX #endif static inline int pr_quelled(unsigned int loglevel) { return compel_log_get_loglevel() < loglevel && loglevel != COMPEL_LOG_MSG; } extern void compel_print_on_level(unsigned int loglevel, const char *format, ...) __attribute__((__format__(__printf__, 2, 3))); #define pr_msg(fmt, ...) compel_print_on_level(COMPEL_LOG_MSG, fmt, ##__VA_ARGS__) #define pr_info(fmt, ...) compel_print_on_level(COMPEL_LOG_INFO, LOG_PREFIX fmt, ##__VA_ARGS__) #define pr_err(fmt, ...) \ compel_print_on_level(COMPEL_LOG_ERROR, "Error (%s:%d): " LOG_PREFIX fmt, __FILE__, __LINE__, ##__VA_ARGS__) #define pr_err_once(fmt, ...) \ do { \ static bool __printed; \ if (!__printed) { \ pr_err(fmt, ##__VA_ARGS__); \ __printed = 1; \ } \ } while (0) #define pr_warn(fmt, ...) \ compel_print_on_level(COMPEL_LOG_WARN, "Warn (%s:%d): " LOG_PREFIX fmt, __FILE__, __LINE__, ##__VA_ARGS__) #define pr_warn_once(fmt, ...) \ do { \ static bool __printed; \ if (!__printed) { \ pr_warn(fmt, ##__VA_ARGS__); \ __printed = 1; \ } \ } while (0) #define pr_debug(fmt, ...) 
compel_print_on_level(COMPEL_LOG_DEBUG, LOG_PREFIX fmt, ##__VA_ARGS__) #define pr_perror(fmt, ...) pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) #define pr_pwarn(fmt, ...) \ pr_warn(fmt ": %m\n", ##__VA_ARGS__) #endif /* COMPEL_LOG_H__ */ crac-criu-1.5.0/compel/include/piegen.h000066400000000000000000000011231471504326700177150ustar00rootroot00000000000000#ifndef COMPEL_PIEGEN_H__ #define COMPEL_PIEGEN_H__ #include #include #include #include "common/compiler.h" typedef struct { char *input_filename; char *output_filename; char *prefix; FILE *fout; } piegen_opt_t; extern piegen_opt_t opts; #define pr_out(fmt, ...) \ do { \ if (opts.fout) \ fprintf(opts.fout, fmt, ##__VA_ARGS__); \ } while (0) extern int handle_binary(void *mem, size_t size); #endif /* COMPEL_PIEGEN_H__ */ crac-criu-1.5.0/compel/include/ptrace.h000066400000000000000000000006211471504326700177260ustar00rootroot00000000000000#ifndef COMPEL_PTRACE_H__ #define COMPEL_PTRACE_H__ #include #include #include #define PTRACE_SYSCALL_TRAP 0x80 #define PTRACE_SI_EVENT(_si_code) (((_si_code)&0xFFFF) >> 8) extern int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs); extern int ptrace_set_regs(pid_t pid, user_regs_struct_t *regs); #endif /* COMPEL_PTRACE_H__ */ crac-criu-1.5.0/compel/include/rpc-pie-priv.h000066400000000000000000000022501471504326700207650ustar00rootroot00000000000000#ifndef __COMPEL_RPC_H__ #define __COMPEL_RPC_H__ struct ctl_msg { uint32_t cmd; /* command itself */ uint32_t ack; /* ack on command */ int32_t err; /* error code on reply */ }; #define ctl_msg_cmd(_cmd) \ (struct ctl_msg) \ { \ .cmd = _cmd, \ } #define ctl_msg_ack(_cmd, _err) \ (struct ctl_msg) \ { \ .cmd = _cmd, .ack = _cmd, .err = _err, \ } /* * NOTE: each command's args should be arch-independed sized. 
* If you want to use one of the standard types, declare * alternative type for compatible tasks in parasite-compat.h */ enum { PARASITE_CMD_IDLE = 0, PARASITE_CMD_ACK, PARASITE_CMD_INIT_DAEMON, /* * This must be greater than INITs. */ PARASITE_CMD_FINI, __PARASITE_END_CMDS, }; struct parasite_init_args { int32_t h_addr_len; struct sockaddr_un h_addr; int32_t log_level; uint64_t sigreturn_addr; uint64_t sigframe; /* pointer to sigframe */ futex_t daemon_connected; #ifdef ARCH_HAS_LONG_PAGES uint32_t page_size; #endif }; struct parasite_unmap_args { uint64_t parasite_start; uint64_t parasite_len; }; #endif crac-criu-1.5.0/compel/include/shmem.h000066400000000000000000000003031471504326700175560ustar00rootroot00000000000000#ifndef __COMPEL_PLUGIN_SHMEM_PRIV_H__ #define __COMPEL_PLUGIN_SHMEM_PRIV_H__ struct shmem_plugin_msg { unsigned long start; unsigned long len; }; #endif /* __COMPEL_PLUGIN_SHMEM_PRIV_H__ */ crac-criu-1.5.0/compel/include/uapi/000077500000000000000000000000001471504326700172365ustar00rootroot00000000000000crac-criu-1.5.0/compel/include/uapi/asm000077700000000000000000000000001471504326700223702../asm/uapi/asmustar00rootroot00000000000000crac-criu-1.5.0/compel/include/uapi/common000077700000000000000000000000001471504326700241432../../../include/commonustar00rootroot00000000000000crac-criu-1.5.0/compel/include/uapi/compel000077700000000000000000000000001471504326700205132.ustar00rootroot00000000000000crac-criu-1.5.0/compel/include/uapi/cpu.h000066400000000000000000000011261471504326700201760ustar00rootroot00000000000000#ifndef UAPI_COMPEL_CPU_H__ #define UAPI_COMPEL_CPU_H__ #include #include #include extern int /* TODO: __must_check */ compel_cpuid(compel_cpuinfo_t *info); extern bool compel_cpu_has_feature(unsigned int feature); extern bool compel_fpu_has_feature(unsigned int feature); extern uint32_t compel_fpu_feature_size(unsigned int feature); extern uint32_t compel_fpu_feature_offset(unsigned int feature); extern void 
compel_cpu_clear_feature(unsigned int feature); extern void compel_cpu_copy_cpuinfo(compel_cpuinfo_t *c); #endif /* UAPI_COMPEL_CPU_H__ */ crac-criu-1.5.0/compel/include/uapi/handle-elf.h000066400000000000000000000010741471504326700214100ustar00rootroot00000000000000#ifndef __COMPEL_UAPI_HANDLE_ELF__ #define __COMPEL_UAPI_HANDLE_ELF__ #define COMPEL_TYPE_INT (1u << 0) #define COMPEL_TYPE_LONG (1u << 1) #define COMPEL_TYPE_GOTPCREL (1u << 2) #ifdef CONFIG_MIPS #define COMPEL_TYPE_MIPS_26 (1u << 3) #define COMPEL_TYPE_MIPS_HI16 (1u << 4) #define COMPEL_TYPE_MIPS_LO16 (1u << 5) #define COMPEL_TYPE_MIPS_HIGHER (1u << 6) #define COMPEL_TYPE_MIPS_HIGHEST (1u << 7) #define COMPEL_TYPE_MIPS_64 (1u << 8) #endif typedef struct { unsigned int offset; unsigned int type; long addend; long value; } compel_reloc_t; #endif crac-criu-1.5.0/compel/include/uapi/infect-rpc.h000066400000000000000000000010261471504326700214400ustar00rootroot00000000000000#ifndef __COMPEL_INFECT_RPC_H__ #define __COMPEL_INFECT_RPC_H__ #include #include #include #include struct parasite_ctl; extern int __must_check compel_rpc_sync(unsigned int cmd, struct parasite_ctl *ctl); extern int __must_check compel_rpc_call(unsigned int cmd, struct parasite_ctl *ctl); extern int __must_check compel_rpc_call_sync(unsigned int cmd, struct parasite_ctl *ctl); extern int compel_rpc_sock(struct parasite_ctl *ctl); #define PARASITE_USER_CMDS 64 #endif crac-criu-1.5.0/compel/include/uapi/infect-util.h000066400000000000000000000007031471504326700216320ustar00rootroot00000000000000#ifndef __COMPEL_INFECT_UTIL_H__ #define __COMPEL_INFECT_UTIL_H__ #include "common/compiler.h" /* * compel_run_id is a unique value of the current run. It can be used to * generate resource ID-s to avoid conflicts with other processes. 
*/ extern uint64_t compel_run_id; struct parasite_ctl; extern int __must_check compel_util_send_fd(struct parasite_ctl *ctl, int fd); extern int compel_util_recv_fd(struct parasite_ctl *ctl, int *pfd); #endif crac-criu-1.5.0/compel/include/uapi/infect.h000066400000000000000000000147031471504326700206640ustar00rootroot00000000000000#ifndef __COMPEL_INFECT_H__ #define __COMPEL_INFECT_H__ #include #include #include #include #include #include #include "common/compiler.h" #define PARASITE_START_AREA_MIN (4096) extern int __must_check compel_interrupt_task(int pid); struct seize_task_status { unsigned long long sigpnd; unsigned long long shdpnd; unsigned long long sigblk; char state; int vpid; int ppid; int seccomp_mode; }; extern int __must_check compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_task_status *, void *data), void (*free_status)(int pid, struct seize_task_status *, void *data), struct seize_task_status *st, void *data); extern int __must_check compel_stop_task(int pid); extern int __must_check compel_parse_stop_signo(int pid); extern int compel_resume_task(pid_t pid, int orig_state, int state); extern int compel_resume_task_sig(pid_t pid, int orig_state, int state, int stop_signo); struct parasite_ctl; struct parasite_thread_ctl; extern struct parasite_ctl __must_check *compel_prepare(int pid); extern struct parasite_ctl __must_check *compel_prepare_noctx(int pid); extern int __must_check compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size); extern int __must_check compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size); extern struct parasite_thread_ctl __must_check *compel_prepare_thread(struct parasite_ctl *ctl, int pid); extern void compel_release_thread(struct parasite_thread_ctl *); extern int __must_check compel_start_daemon(struct parasite_ctl *ctl); extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl); extern int 
__must_check compel_cure_remote(struct parasite_ctl *ctl); extern int __must_check compel_cure_local(struct parasite_ctl *ctl); extern int __must_check compel_cure(struct parasite_ctl *ctl); #define PARASITE_ARG_SIZE_MIN (1 << 12) #define compel_parasite_args(ctl, type) \ ({ \ void *___ret; \ BUILD_BUG_ON(sizeof(type) > PARASITE_ARG_SIZE_MIN); \ ___ret = compel_parasite_args_p(ctl); \ ___ret; \ }) extern void *compel_parasite_args_p(struct parasite_ctl *ctl); extern void *compel_parasite_args_s(struct parasite_ctl *ctl, unsigned long args_size); extern int __must_check compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6); extern int __must_check compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd); extern int __must_check compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t *ret_regs); /* * The PTRACE_SYSCALL will trap task twice -- on * enter into and on exit from syscall. If we trace * a single task, we may skip half of all getregs * calls -- on exit we don't need them. */ enum trace_flags { TRACE_ALL, TRACE_ENTER, TRACE_EXIT, }; extern int __must_check compel_stop_on_syscall(int tasks, int sys_nr, int sys_nr_compat); extern int __must_check compel_stop_pie(pid_t pid, void *addr, bool no_bp); extern int __must_check compel_unmap(struct parasite_ctl *ctl, unsigned long addr); extern int compel_mode_native(struct parasite_ctl *ctl); extern k_rtsigset_t *compel_task_sigmask(struct parasite_ctl *ctl); extern k_rtsigset_t *compel_thread_sigmask(struct parasite_thread_ctl *tctl); struct rt_sigframe; typedef int (*open_proc_fn)(int pid, int mode, const char *fmt, ...) 
__attribute__((__format__(__printf__, 3, 4))); typedef int (*save_regs_t)(void *, user_regs_struct_t *, user_fpregs_struct_t *); typedef int (*make_sigframe_t)(void *, struct rt_sigframe *, struct rt_sigframe *, k_rtsigset_t *); struct infect_ctx { int sock; /* * Regs manipulation context. */ save_regs_t save_regs; make_sigframe_t make_sigframe; void *regs_arg; unsigned long task_size; unsigned long syscall_ip; /* entry point of infection */ unsigned long flags; /* fine-tune (e.g. faults) */ void (*child_handler)(int, siginfo_t *, void *); /* handler for SIGCHLD deaths */ struct sigaction orig_handler; open_proc_fn open_proc; int log_fd; /* fd for parasite code to send messages to */ }; extern struct infect_ctx *compel_infect_ctx(struct parasite_ctl *); /* Don't use memfd() */ #define INFECT_NO_MEMFD (1UL << 0) /* Make parasite connect() fail */ #define INFECT_FAIL_CONNECT (1UL << 1) /* No breakpoints in pie tracking */ #define INFECT_NO_BREAKPOINTS (1UL << 2) /* Can run parasite inside compat tasks */ #define INFECT_COMPATIBLE (1UL << 3) /* Workaround for ptrace bug on Skylake CPUs with kernels older than v4.14 */ #define INFECT_X86_PTRACE_MXCSR_BUG (1UL << 4) /* After infecting - corrupt extended registers (fault-injection) */ #define INFECT_CORRUPT_EXTREGS (1UL << 5) /* * There are several ways to describe a blob to compel * library. The simplest one derived from criu is to * provide it from .h files. 
*/ #define COMPEL_BLOB_CHEADER 0x1 struct parasite_blob_desc { unsigned parasite_type; union { struct { const void *mem; size_t bsize; unsigned long parasite_ip_off; unsigned long cmd_off; unsigned long args_ptr_off; unsigned long got_off; unsigned long args_off; unsigned long data_off; compel_reloc_t *relocs; unsigned int nr_relocs; } hdr; }; }; extern struct parasite_blob_desc *compel_parasite_blob_desc(struct parasite_ctl *); extern int __must_check compel_get_thread_regs(struct parasite_thread_ctl *, save_regs_t, void *); extern void compel_relocs_apply(void *mem, void *vbase, struct parasite_blob_desc *pbd); extern void compel_relocs_apply_mips(void *mem, void *vbase, struct parasite_blob_desc *pbd); extern unsigned long compel_task_size(void); extern uint64_t compel_get_leader_sp(struct parasite_ctl *ctl); extern uint64_t compel_get_thread_sp(struct parasite_thread_ctl *tctl); extern uint64_t compel_get_leader_ip(struct parasite_ctl *ctl); extern uint64_t compel_get_thread_ip(struct parasite_thread_ctl *tctl); void compel_set_leader_ip(struct parasite_ctl *ctl, uint64_t v); void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v); extern void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack); #endif crac-criu-1.5.0/compel/include/uapi/ksigset.h000066400000000000000000000012161471504326700210600ustar00rootroot00000000000000#ifndef __COMPEL_KSIGSET_H__ #define __COMPEL_KSIGSET_H__ #include static inline void ksigfillset(k_rtsigset_t *set) { int i; for (i = 0; i < _KNSIG_WORDS; i++) set->sig[i] = (unsigned long)-1; } static inline void ksigemptyset(k_rtsigset_t *set) { int i; for (i = 0; i < _KNSIG_WORDS; i++) set->sig[i] = 0; } static inline void ksigaddset(k_rtsigset_t *set, int _sig) { int sig = _sig - 1; set->sig[sig / _NSIG_BPW] |= 1UL << (sig % _NSIG_BPW); } static inline void ksigdelset(k_rtsigset_t *set, int _sig) { int sig = _sig - 1; set->sig[sig / _NSIG_BPW] &= ~(1UL << (sig % _NSIG_BPW)); } #endif 
crac-criu-1.5.0/compel/include/uapi/log.h000066400000000000000000000005031471504326700201660ustar00rootroot00000000000000#ifndef __COMPEL_UAPI_LOG_H__ #define __COMPEL_UAPI_LOG_H__ #include #include typedef void (*compel_log_fn)(unsigned int lvl, const char *fmt, va_list parms); extern void compel_log_init(compel_log_fn log_fn, unsigned int level); extern unsigned int compel_log_get_loglevel(void); #endif crac-criu-1.5.0/compel/include/uapi/loglevels.h000066400000000000000000000010421471504326700214000ustar00rootroot00000000000000#ifndef UAPI_COMPEL_LOGLEVELS_H__ #define UAPI_COMPEL_LOGLEVELS_H__ /* * Log levels used by compel itself (see compel_log_init()), * also by log functions in the std plugin. */ enum __compel_log_levels { COMPEL_LOG_MSG, /* Print message regardless of log level */ COMPEL_LOG_ERROR, /* Errors only, when we're in trouble */ COMPEL_LOG_WARN, /* Warnings */ COMPEL_LOG_INFO, /* Informative, everything is fine */ COMPEL_LOG_DEBUG, /* Debug only */ COMPEL_DEFAULT_LOGLEVEL = COMPEL_LOG_WARN }; #endif /* UAPI_COMPEL_LOGLEVELS_H__ */ crac-criu-1.5.0/compel/include/uapi/plugins000077700000000000000000000000001471504326700252502../../plugins/include/uapiustar00rootroot00000000000000crac-criu-1.5.0/compel/include/uapi/plugins.h000066400000000000000000000021771471504326700210770ustar00rootroot00000000000000#ifndef UAPI_COMPEL_PLUGIN_H__ #define UAPI_COMPEL_PLUGIN_H__ #define __init __attribute__((__used__)) __attribute__((__section__(".compel.init"))) #define __exit __attribute__((__used__)) __attribute__((__section__(".compel.exit"))) #ifndef __ASSEMBLY__ typedef struct { const char *name; int (*init)(void); void (*exit)(void); } plugin_init_t; #define plugin_register(___desc) static const plugin_init_t *const ___ptr__##___desc __init = &___desc; #define PLUGIN_REGISTER(___id, ___name, ___init, ___exit) \ static const plugin_init_t __plugin_desc_##___id = { \ .name = ___name, \ .init = ___init, \ .exit = ___exit, \ }; \ 
plugin_register(__plugin_desc_##___id); #define PLUGIN_REGISTER_DUMMY(___id) \ static const plugin_init_t __plugin_desc_##___id = { \ .name = #___id, \ }; \ plugin_register(__plugin_desc_##___id); #endif /* __ASSEMBLY__ */ #endif /* UAPI_COMPEL_PLUGIN_H__ */ crac-criu-1.5.0/compel/include/uapi/ptrace.h000066400000000000000000000045531471504326700206740ustar00rootroot00000000000000#ifndef UAPI_COMPEL_PTRACE_H__ #define UAPI_COMPEL_PTRACE_H__ #include "common/compiler.h" /* * We'd want to include both sys/ptrace.h and linux/ptrace.h, * hoping that most definitions come from either one or another. * Alas, on Alpine/musl both files declare struct ptrace_peeksiginfo_args, * so there is no way they can be used together. Let's rely on libc one. */ #include #include #include /* * Some constants for ptrace that might be missing from the * standard library includes due to being (relatively) new. */ #ifndef PTRACE_SEIZE #define PTRACE_SEIZE 0x4206 #endif #ifndef PTRACE_O_SUSPEND_SECCOMP #define PTRACE_O_SUSPEND_SECCOMP (1 << 21) #endif #ifndef PTRACE_INTERRUPT #define PTRACE_INTERRUPT 0x4207 #endif #ifndef PTRACE_PEEKSIGINFO #define PTRACE_PEEKSIGINFO 0x4209 /* Read signals from a shared (process wide) queue */ #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) #endif #ifndef PTRACE_GETREGSET #define PTRACE_GETREGSET 0x4204 #define PTRACE_SETREGSET 0x4205 #endif #ifndef PTRACE_GETSIGMASK #define PTRACE_GETSIGMASK 0x420a #define PTRACE_SETSIGMASK 0x420b #endif #ifndef PTRACE_SECCOMP_GET_FILTER #define PTRACE_SECCOMP_GET_FILTER 0x420c #endif #ifndef PTRACE_SECCOMP_GET_METADATA #define PTRACE_SECCOMP_GET_METADATA 0x420d #endif /* PTRACE_SECCOMP_GET_METADATA */ /* * struct seccomp_metadata is not yet * settled down well in headers so use * own identical definition for a while. 
*/ typedef struct { uint64_t filter_off; /* Input: which filter */ uint64_t flags; /* Output: filter's flags */ } seccomp_metadata_t; #ifndef PTRACE_GET_RSEQ_CONFIGURATION #define PTRACE_GET_RSEQ_CONFIGURATION 0x420f struct __ptrace_rseq_configuration { uint64_t rseq_abi_pointer; uint32_t rseq_abi_size; uint32_t signature; uint32_t flags; uint32_t pad; }; #endif #ifdef PTRACE_EVENT_STOP #if PTRACE_EVENT_STOP == 7 /* Bad value from Linux 3.1-3.3, fixed in 3.4 */ #undef PTRACE_EVENT_STOP #endif #endif #ifndef PTRACE_EVENT_STOP #define PTRACE_EVENT_STOP 128 #endif extern int ptrace_suspend_seccomp(pid_t pid); extern int __must_check ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); extern int __must_check ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes); extern int __must_check ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes); #endif /* UAPI_COMPEL_PTRACE_H__ */ crac-criu-1.5.0/compel/include/uapi/sigframe-common.h000066400000000000000000000031131471504326700224700ustar00rootroot00000000000000/* * Don't include it directly but use "arch-sigframe.h" instead. 
*/ #ifndef UAPI_COMPEL_SIGFRAME_COMMON_H__ #define UAPI_COMPEL_SIGFRAME_COMMON_H__ #ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #error "Direct inclusion is forbidden, use instead" #endif #include "common/compiler.h" #include #include struct rt_sigframe; #ifndef SIGFRAME_MAX_OFFSET #define SIGFRAME_MAX_OFFSET RT_SIGFRAME_OFFSET(0) #endif #define RESTORE_STACK_ALIGN(x, a) (((x) + (a)-1) & ~((a)-1)) /* sigframe should be aligned on 64 byte for x86 and 8 bytes for arm */ #define RESTORE_STACK_SIGFRAME RESTORE_STACK_ALIGN(sizeof(struct rt_sigframe) + SIGFRAME_MAX_OFFSET, 64) #ifndef __ARCH_SI_PREAMBLE_SIZE #define __ARCH_SI_PREAMBLE_SIZE (3 * sizeof(int)) #endif #define SI_MAX_SIZE 128 #ifndef SI_PAD_SIZE #define SI_PAD_SIZE ((SI_MAX_SIZE - __ARCH_SI_PREAMBLE_SIZE) / sizeof(int)) #endif typedef struct rt_siginfo { int si_signo; int si_errno; int si_code; int _pad[SI_PAD_SIZE]; } rt_siginfo_t; typedef struct rt_sigaltstack { void *ss_sp; int ss_flags; size_t ss_size; } rt_stack_t; struct rt_ucontext { unsigned long uc_flags; struct rt_ucontext *uc_link; rt_stack_t uc_stack; struct rt_sigcontext uc_mcontext; k_rtsigset_t uc_sigmask; /* mask last for extensibility */ int _unused[32 - (sizeof(k_rtsigset_t) / sizeof(int))]; unsigned long uc_regspace[128] __attribute__((aligned(8))); }; extern int __must_check sigreturn_prep_fpu_frame(struct rt_sigframe *frame, struct rt_sigframe *rframe); #endif /* UAPI_COMPEL_SIGFRAME_COMMON_H__ */ crac-criu-1.5.0/compel/include/uapi/task-state.h000066400000000000000000000007071471504326700214730ustar00rootroot00000000000000#ifndef __COMPEL_UAPI_TASK_STATE_H__ #define __COMPEL_UAPI_TASK_STATE_H__ /* * Task state, as returned by compel_wait_task() * and used in arguments to compel_resume_task(). */ enum __compel_task_state { COMPEL_TASK_ALIVE = 0x01, COMPEL_TASK_DEAD = 0x02, COMPEL_TASK_STOPPED = 0x03, COMPEL_TASK_ZOMBIE = 0x06, /* Don't ever change the above values, they are used by CRIU! 
*/ COMPEL_TASK_MAX = 0x7f }; #endif /* __COMPEL_UAPI_TASK_STATE_H__ */ crac-criu-1.5.0/compel/plugins/000077500000000000000000000000001471504326700163365ustar00rootroot00000000000000crac-criu-1.5.0/compel/plugins/Makefile000066400000000000000000000065511471504326700200050ustar00rootroot00000000000000CFLAGS := $(filter-out -pg $(CFLAGS-GCOV) $(CFLAGS-ASAN),$(CFLAGS)) CFLAGS += -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 CFLAGS += -Wp,-U_FORTIFY_SOURCE -Wp,-D_FORTIFY_SOURCE=0 PLUGIN_ARCH_DIR := compel/arch/$(ARCH)/plugins # # CFLAGS, ASFLAGS, LDFLAGS # Required for pie code ccflags-y += $(CFLAGS_PIE) # UAPI inclusion, referred as ccflags-y += -I compel/include/uapi asflags-y += -I compel/include/uapi # General compel includes ccflags-y += -iquote compel/include ifeq ($(ARCH),mips) ccflags-y += -mno-abicalls -fno-pic -fno-stack-protector else ccflags-y += -fpie -fno-stack-protector endif # General compel/plugins includes ccflags-y += -iquote $(obj)/include asflags-y += -iquote $(obj)/include # Arch compel/plugins includes ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/include asflags-y += -iquote $(PLUGIN_ARCH_DIR)/include asflags-y += -iquote $(PLUGIN_ARCH_DIR) # General flags for assembly ifeq ($(ARCH),mips) asflags-y += -mno-abicalls -fno-pic -Wstrict-prototypes else asflags-y += -fpie -Wstrict-prototypes endif asflags-y += -nostdlib -fomit-frame-pointer asflags-y += -fno-stack-protector ldflags-y += -z noexecstack # # Shmem plugin target += shmem shmem-lib-y += shmem/shmem.o # # STD plugin target += std std-lib-y += std/std.o std-lib-y += std/fds.o std-lib-y += std/log.o std-lib-y += std/string.o std-lib-y += std/infect.o std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/parasite-head.o # # FDS plugin target += fds fds-lib-y += fds/fds.o ifeq ($(ARCH),x86) std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/memcpy.o endif ifeq ($(ARCH),mips) std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/memcpy.o endif ifeq ($(ARCH),ppc64) std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/memcpy.o std-lib-y += 
./$(PLUGIN_ARCH_DIR)/std/memcmp.o endif include ./$(PLUGIN_ARCH_DIR)/std/syscalls/Makefile.syscalls define syscall-priority $(addprefix $(obj)/,$($(1):%.o=%.d)): | $($(2)) $(addprefix $(obj)/,$($(1):%.o=%.i)): | $($(2)) $(addprefix $(obj)/,$($(1):%.o=%.s)): | $($(2)) $(addprefix $(obj)/,$($(1))): | $($(2)) endef # # Almost all plugins depen on syscall headers # and definitions so we have to order their # generation manually. $(foreach t,$(target),$(eval $(call syscall-priority,$(t)-lib-y,std-headers-deps))) # # FIXME syscall-types.h should be setup earlier # install: compel/plugins/std.lib.a compel/plugins/fds.lib.a $(E) " INSTALL " compel plugins $(Q) mkdir -p $(DESTDIR)$(LIBEXECDIR)/compel/ $(Q) install -m 0644 $^ $(DESTDIR)$(LIBEXECDIR)/compel/ $(Q) mkdir -p $(DESTDIR)$(LIBEXECDIR)/compel/scripts $(Q) install -m 0644 compel/arch/$(ARCH)/scripts/compel-pack.lds.S $(DESTDIR)$(LIBEXECDIR)/compel/scripts $(E) " INSTALL " compel plugins uapi $(Q) mkdir -p $(DESTDIR)$(INCLUDEDIR)/crac-criu/compel/plugins/std/asm $(Q) cp -fL compel/plugins/include/uapi/*.h $(DESTDIR)$(INCLUDEDIR)/crac-criu/compel/plugins/ $(Q) cp -fL compel/plugins/include/uapi/std/*.h $(DESTDIR)$(INCLUDEDIR)/crac-criu/compel/plugins/std/ $(Q) cp -fL compel/plugins/include/uapi/std/asm/*.h $(DESTDIR)$(INCLUDEDIR)/crac-criu/compel/plugins/std/asm/ .PHONY: install uninstall: $(E) " UNINSTALL" compel plugins $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBEXECDIR)/compel/,*.lib.a) $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBEXECDIR)/compel/scripts/,compel-pack.lds.S) $(E) " UNINSTALL" compel and plugins uapi $(Q) $(RM) -rf $(addprefix $(DESTDIR)$(INCLUDEDIR)/,crac-criu/compel/plugins) .PHONY: uninstall crac-criu-1.5.0/compel/plugins/fds/000077500000000000000000000000001471504326700171125ustar00rootroot00000000000000crac-criu-1.5.0/compel/plugins/fds/fds.c000066400000000000000000000006621471504326700200360ustar00rootroot00000000000000#include #include "uapi/plugins.h" #include "uapi/plugins/std.h" #include #define 
pr_err(fmt, ...) #include "common/compiler.h" #include "common/bug.h" #define __sys(foo) sys_##foo #define __sys_err(ret) ret #include "common/scm.h" int fds_send_fd(int fd) { return send_fd(parasite_get_rpc_sock(), NULL, 0, fd); } int fds_recv_fd(void) { return recv_fd(parasite_get_rpc_sock()); } crac-criu-1.5.0/compel/plugins/include/000077500000000000000000000000001471504326700177615ustar00rootroot00000000000000crac-criu-1.5.0/compel/plugins/include/std-priv.h000066400000000000000000000002251471504326700217010ustar00rootroot00000000000000#ifndef __COMPEL_PLUGIN_STD_PRIV_H__ #define __COMPEL_PLUGIN_STD_PRIV_H__ extern int std_ctl_sock(void); #endif /* __COMPEL_PLUGIN_STD_PRIV_H__ */ crac-criu-1.5.0/compel/plugins/include/uapi/000077500000000000000000000000001471504326700207175ustar00rootroot00000000000000crac-criu-1.5.0/compel/plugins/include/uapi/plugin-fds.h000066400000000000000000000002701471504326700231370ustar00rootroot00000000000000#ifndef COMPEL_PLUGIN_STD_STD_H__ #define COMPEL_PLUGIN_STD_STD_H__ extern int __must_check fds_send_fd(int fd); extern int fds_recv_fd(void); #endif /* COMPEL_PLUGIN_STD_STD_H__ */ crac-criu-1.5.0/compel/plugins/include/uapi/shmem.h000066400000000000000000000007271471504326700222070ustar00rootroot00000000000000#ifndef __COMPEL_PLUGIN_SHMEM_H__ #define __COMPEL_PLUGIN_SHMEM_H__ /* * Creates local shmem mapping and announces it * to the peer. Peer can later "receive" one. The * local area should be munmap()-ed at the end. */ extern void *shmem_create(unsigned long size); /* * "Receives" shmem from peer and maps it. 
The * locally mapped area should be munmap()-ed at * the end */ extern void *shmem_receive(unsigned long *size); #endif /* __COMPEL_PLUGIN_SHMEM_H__ */ crac-criu-1.5.0/compel/plugins/include/uapi/std.h000066400000000000000000000005071471504326700216640ustar00rootroot00000000000000#ifndef COMPEL_PLUGIN_STD_STD_H__ #define COMPEL_PLUGIN_STD_STD_H__ #include #include #include #include #include #include #endif /* COMPEL_PLUGIN_STD_STD_H__ */ crac-criu-1.5.0/compel/plugins/include/uapi/std/000077500000000000000000000000001471504326700215115ustar00rootroot00000000000000crac-criu-1.5.0/compel/plugins/include/uapi/std/asm/000077500000000000000000000000001471504326700222715ustar00rootroot00000000000000crac-criu-1.5.0/compel/plugins/include/uapi/std/asm/.gitignore000066400000000000000000000000471471504326700242620ustar00rootroot00000000000000# Dear git, please keep this directory crac-criu-1.5.0/compel/plugins/include/uapi/std/fds.h000066400000000000000000000002401471504326700224320ustar00rootroot00000000000000#ifndef COMPEL_PLUGIN_STD_FDS_H__ #define COMPEL_PLUGIN_STD_FDS_H__ #include #include #endif /* COMPEL_PLUGIN_STD_FDS_H__ */ crac-criu-1.5.0/compel/plugins/include/uapi/std/infect.h000066400000000000000000000012521471504326700231320ustar00rootroot00000000000000#ifndef COMPEL_PLUGIN_STD_INFECT_H__ #define COMPEL_PLUGIN_STD_INFECT_H__ #include "common/compiler.h" extern int parasite_get_rpc_sock(void); extern unsigned int __export_parasite_service_cmd; extern void *__export_parasite_service_args_ptr; extern int __must_check parasite_service(void); /* * Must be supplied by user plugins. */ extern int __must_check parasite_daemon_cmd(int cmd, void *args); extern int __must_check parasite_trap_cmd(int cmd, void *args); extern void parasite_cleanup(void); /* * FIXME: Should be supplied by log module. 
*/ extern void log_set_fd(int fd); extern void log_set_loglevel(unsigned int level); #endif /* COMPEL_PLUGIN_STD_INFECT_H__ */ crac-criu-1.5.0/compel/plugins/include/uapi/std/log.h000066400000000000000000000023111471504326700224400ustar00rootroot00000000000000#ifndef COMPEL_PLUGIN_STD_LOG_H__ #define COMPEL_PLUGIN_STD_LOG_H__ #include "compel/loglevels.h" #include "common/compiler.h" #define STD_LOG_SIMPLE_CHUNK 256 extern void std_log_set_fd(int fd); extern void std_log_set_loglevel(enum __compel_log_levels level); extern void std_log_set_start(struct timeval *tv); /* * Provides a function to get time *in the infected task* for log timings. * Expected use-case: address on the vdso page to get time. * If not set or called with NULL - compel will use raw syscall, * which requires enter in the kernel and as a result affects performance. */ typedef int (*gettimeofday_t)(struct timeval *tv, struct timezone *tz); extern void std_log_set_gettimeofday(gettimeofday_t gtod); /* std plugin helper to get time (hopefully, efficiently) */ extern int std_gettimeofday(struct timeval *tv, struct timezone *tz); extern int std_vprint_num(char *buf, int blen, int num, char **ps); extern void std_sprintf(char output[STD_LOG_SIMPLE_CHUNK], const char *format, ...) __attribute__((__format__(__printf__, 2, 3))); extern void print_on_level(unsigned int loglevel, const char *format, ...) __attribute__((__format__(__printf__, 2, 3))); #endif /* COMPEL_PLUGIN_STD_LOG_H__ */ crac-criu-1.5.0/compel/plugins/include/uapi/std/string.h000066400000000000000000000022321471504326700231670ustar00rootroot00000000000000#ifndef COMPEL_PLUGIN_STD_STRING_H__ #define COMPEL_PLUGIN_STD_STRING_H__ #include #include #include /* Standard file descriptors. */ #define STDIN_FILENO 0 /* Standard input. */ #define STDOUT_FILENO 1 /* Standard output. */ #define STDERR_FILENO 2 /* Standard error output. 
*/ extern void std_dputc(int fd, char c); extern void std_dputs(int fd, const char *s); extern void std_vdprintf(int fd, const char *format, va_list args); extern void std_dprintf(int fd, const char *format, ...) __attribute__((__format__(__printf__, 2, 3))); #define std_printf(fmt, ...) std_dprintf(STDOUT_FILENO, fmt, ##__VA_ARGS__) #define std_puts(s) std_dputs(STDOUT_FILENO, s) #define std_putchar(c) std_dputc(STDOUT_FILENO, c) extern unsigned long std_strtoul(const char *nptr, char **endptr, int base); extern int std_strcmp(const char *cs, const char *ct); extern int std_strncmp(const char *cs, const char *ct, size_t n); extern void *memcpy(void *dest, const void *src, size_t n); extern int memcmp(const void *s1, const void *s2, size_t n); extern void *memset(void *s, int c, size_t n); #endif /* COMPEL_PLUGIN_STD_STRING_H__ */ crac-criu-1.5.0/compel/plugins/include/uapi/std/syscall-types.h000066400000000000000000000026041471504326700245000ustar00rootroot00000000000000/* * Please add here type definitions if * syscall prototypes need them. */ #ifndef COMPEL_SYSCALL_TYPES_H__ #define COMPEL_SYSCALL_TYPES_H__ #include #include #include #include #include #include #include #include #include "common/bitsperlong.h" struct cap_header { uint32_t version; int pid; }; struct cap_data { uint32_t eff; uint32_t prm; uint32_t inh; }; struct robust_list_head; struct file_handle; struct itimerspec; struct io_event; struct sockaddr; struct timespec; struct siginfo; struct msghdr; struct rusage; struct iocb; struct pollfd; struct clone_args; struct open_how; typedef unsigned long aio_context_t; #ifndef F_GETFD #define F_GETFD 1 #endif struct krlimit { unsigned long rlim_cur; unsigned long rlim_max; }; /* Type of timers in the kernel. 
*/ typedef int kernel_timer_t; #include extern long sys_preadv_raw(int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h); static inline long sys_preadv(int fd, struct iovec *iov, unsigned long nr, off_t off) { #if BITS_PER_LONG == 64 return sys_preadv_raw(fd, iov, nr, off, 0); #elif BITS_PER_LONG == 32 return sys_preadv_raw(fd, iov, nr, off, ((uint64_t)off) >> 32); #else #error "BITS_PER_LONG isn't defined" #endif } #endif /* COMPEL_SYSCALL_TYPES_H__ */ crac-criu-1.5.0/compel/plugins/shmem/000077500000000000000000000000001471504326700174475ustar00rootroot00000000000000crac-criu-1.5.0/compel/plugins/shmem/shmem.c000066400000000000000000000013221471504326700207220ustar00rootroot00000000000000#include #include #include #include #include "shmem.h" #include "std-priv.h" void *shmem_create(unsigned long size) { int ret; void *mem; struct shmem_plugin_msg spi; mem = (void *)sys_mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); if (mem == MAP_FAILED) return NULL; spi.start = (unsigned long)mem; spi.len = size; ret = sys_write(std_ctl_sock(), &spi, sizeof(spi)); if (ret != sizeof(spi)) { sys_munmap(mem, size); return NULL; } return mem; } void *shmem_receive(unsigned long *size) { /* master -> parasite not implemented yet */ return NULL; } PLUGIN_REGISTER_DUMMY(shmem) crac-criu-1.5.0/compel/plugins/std/000077500000000000000000000000001471504326700171305ustar00rootroot00000000000000crac-criu-1.5.0/compel/plugins/std/fds.c000066400000000000000000000004221471504326700200460ustar00rootroot00000000000000#include #include #include #include "std-priv.h" #define pr_err(fmt, ...) 
#include "common/compiler.h" #include "common/bug.h" #define __sys(foo) sys_##foo #define __sys_err(ret) ret #include "common/scm-code.c" crac-criu-1.5.0/compel/plugins/std/infect.c000066400000000000000000000106211471504326700205440ustar00rootroot00000000000000#include #include "common/scm.h" #include "common/compiler.h" #include "common/lock.h" #include "common/page.h" #define pr_err(fmt, ...) print_on_level(1, fmt, ##__VA_ARGS__) #define pr_info(fmt, ...) print_on_level(3, fmt, ##__VA_ARGS__) #define pr_debug(fmt, ...) print_on_level(4, fmt, ##__VA_ARGS__) #include "common/bug.h" #include "uapi/compel/asm/sigframe.h" #include "uapi/compel/infect-rpc.h" #include "rpc-pie-priv.h" static int tsock = -1; static struct rt_sigframe *sigframe; #ifdef ARCH_HAS_LONG_PAGES /* * XXX: Make it compel's std plugin global variable. Drop parasite_size(). * Hint: compel on aarch64 shall learn relocs for that. */ static unsigned __page_size; unsigned long __attribute((weak)) page_size(void) { return __page_size; } #endif int parasite_get_rpc_sock(void) { return tsock; } /* RPC helpers */ static int __parasite_daemon_reply_ack(unsigned int cmd, int err) { struct ctl_msg m; int ret; m = ctl_msg_ack(cmd, err); ret = sys_sendto(tsock, &m, sizeof(m), 0, NULL, 0); if (ret != sizeof(m)) { pr_err("Sent only %d bytes while %zu expected\n", ret, sizeof(m)); return -1; } pr_debug("__sent ack msg: %d %d %d\n", m.cmd, m.ack, m.err); return 0; } static int __parasite_daemon_wait_msg(struct ctl_msg *m) { int ret; pr_debug("Daemon waits for command\n"); while (1) { *m = (struct ctl_msg){}; ret = sys_recvfrom(tsock, m, sizeof(*m), MSG_WAITALL, NULL, 0); if (ret != sizeof(*m)) { pr_err("Trimmed message received (%d/%d)\n", (int)sizeof(*m), ret); return -1; } pr_debug("__fetched msg: %d %d %d\n", m->cmd, m->ack, m->err); return 0; } return -1; } /* Core infect code */ static noinline void fini_sigreturn(unsigned long new_sp) { ARCH_RT_SIGRETURN(new_sp, sigframe); } static int fini(void) { unsigned 
long new_sp; parasite_cleanup(); new_sp = (long)sigframe + RT_SIGFRAME_OFFSET(sigframe); pr_debug("%ld: new_sp=%lx ip %lx\n", sys_gettid(), new_sp, RT_SIGFRAME_REGIP(sigframe)); sys_close(tsock); std_log_set_fd(-1); fini_sigreturn(new_sp); BUG(); return -1; } static noinline __used int noinline parasite_daemon(void *args) { struct ctl_msg m; int ret = -1; pr_debug("Running daemon thread leader\n"); /* Reply we're alive */ if (__parasite_daemon_reply_ack(PARASITE_CMD_INIT_DAEMON, 0)) goto out; ret = 0; while (1) { if (__parasite_daemon_wait_msg(&m)) break; if (ret && m.cmd != PARASITE_CMD_FINI) { pr_err("Command rejected\n"); continue; } if (m.cmd == PARASITE_CMD_FINI) goto out; ret = parasite_daemon_cmd(m.cmd, args); if (__parasite_daemon_reply_ack(m.cmd, ret)) break; if (ret) { pr_err("Close the control socket for writing\n"); sys_shutdown(tsock, SHUT_WR); } } out: fini(); return 0; } static noinline __used int parasite_init_daemon(void *data) { struct parasite_init_args *args = data; int ret; args->sigreturn_addr = (uint64_t)(uintptr_t)fini_sigreturn; sigframe = (void *)(uintptr_t)args->sigframe; #ifdef ARCH_HAS_LONG_PAGES __page_size = args->page_size; #endif ret = tsock = sys_socket(PF_UNIX, SOCK_SEQPACKET, 0); if (tsock < 0) { pr_err("Can't create socket: %d\n", tsock); goto err; } ret = sys_connect(tsock, (struct sockaddr *)&args->h_addr, args->h_addr_len); if (ret < 0) { pr_err("Can't connect the control socket\n"); goto err; } futex_set_and_wake(&args->daemon_connected, 1); ret = recv_fd(tsock); if (ret >= 0) { std_log_set_fd(ret); std_log_set_loglevel(args->log_level); ret = 0; } else goto err; parasite_daemon(data); err: futex_set_and_wake(&args->daemon_connected, ret); fini(); BUG(); return -1; } #ifndef __parasite_entry #define __parasite_entry #endif /* * __export_parasite_service_{cmd,args} serve as arguments to the * parasite_service() function. We use these global variables to make it * easier to pass arguments when invoking from ptrace. 
* * We need the linker to allocate these variables. Hence the dummy * initialization. Otherwise, we end up with COMMON symbols. */ unsigned int __export_parasite_service_cmd = 0; void *__export_parasite_service_args_ptr = NULL; int __used __parasite_entry parasite_service(void) { unsigned int cmd = __export_parasite_service_cmd; void *args = __export_parasite_service_args_ptr; pr_info("Parasite cmd %d/%x process\n", cmd, cmd); if (cmd == PARASITE_CMD_INIT_DAEMON) return parasite_init_daemon(args); return parasite_trap_cmd(cmd, args); } crac-criu-1.5.0/compel/plugins/std/log.c000066400000000000000000000156101471504326700200600ustar00rootroot00000000000000#include #include "common/bitsperlong.h" #include #include #include #include struct simple_buf { char buf[STD_LOG_SIMPLE_CHUNK]; char *bp; int prefix_len; void (*flush)(struct simple_buf *b); }; static int logfd = -1; static int cur_loglevel = COMPEL_DEFAULT_LOGLEVEL; static struct timeval start; static gettimeofday_t __std_gettimeofday; static void sbuf_log_flush(struct simple_buf *b); static inline void timediff(struct timeval *from, struct timeval *to) { to->tv_sec -= from->tv_sec; if (to->tv_usec >= from->tv_usec) to->tv_usec -= from->tv_usec; else { to->tv_sec--; to->tv_usec += 1000000 - from->tv_usec; } } static inline void pad_num(char **s, int *n, int nr) { while (*n < nr) { (*s)--; (*n)++; **s = '0'; } } static void sbuf_log_init(struct simple_buf *b) { char pbuf[12], *s; int n; /* * Format: * * (time)pie: pid: string-itself */ b->bp = b->buf; if (start.tv_sec != 0) { struct timeval now; std_gettimeofday(&now, NULL); timediff(&start, &now); /* Seconds */ n = std_vprint_num(pbuf, sizeof(pbuf), (unsigned)now.tv_sec, &s); pad_num(&s, &n, 2); b->bp[0] = '('; memcpy(b->bp + 1, s, n); b->bp[n + 1] = '.'; b->bp += n + 2; /* Mu-seconds */ n = std_vprint_num(pbuf, sizeof(pbuf), (unsigned)now.tv_usec, &s); pad_num(&s, &n, 6); memcpy(b->bp, s, n); b->bp[n++] = ')'; b->bp[n++] = ' '; b->bp += n; } n = 
std_vprint_num(pbuf, sizeof(pbuf), sys_gettid(), &s); b->bp[0] = 'p'; b->bp[1] = 'i'; b->bp[2] = 'e'; b->bp[3] = ':'; b->bp[4] = ' '; memcpy(b->bp + 5, s, n); b->bp[n + 5] = ':'; b->bp[n + 6] = ' '; b->bp += n + 7; b->prefix_len = b->bp - b->buf; b->flush = sbuf_log_flush; } static void sbuf_log_flush(struct simple_buf *b) { if (b->bp == b->buf + b->prefix_len) return; sys_write(logfd, b->buf, b->bp - b->buf); b->bp = b->buf + b->prefix_len; } static void sbuf_putc(struct simple_buf *b, char c) { /* TODO: maybe some warning or error here? */ if (b->bp - b->buf >= STD_LOG_SIMPLE_CHUNK) return; *b->bp = c; b->bp++; if (b->bp - b->buf >= STD_LOG_SIMPLE_CHUNK - 2) { b->bp[0] = '>'; b->bp[1] = '\n'; b->bp += 2; if (b->flush) b->flush(b); } } void std_log_set_fd(int fd) { sys_close(logfd); logfd = fd; } void std_log_set_loglevel(enum __compel_log_levels level) { cur_loglevel = level; } void std_log_set_start(struct timeval *s) { start = *s; } void std_log_set_gettimeofday(gettimeofday_t gtod) { __std_gettimeofday = gtod; } int std_gettimeofday(struct timeval *tv, struct timezone *tz) { if (__std_gettimeofday != NULL) return __std_gettimeofday(tv, tz); return sys_gettimeofday(tv, tz); } static void print_string(const char *msg, struct simple_buf *b) { while (*msg) { sbuf_putc(b, *msg); msg++; } } int std_vprint_num(char *buf, int blen, int num, char **ps) { int neg = 0; char *s; s = &buf[blen - 1]; *s-- = 0; /* make sure the returned string is NULL terminated */ if (num < 0) { neg = 1; num = -num; } else if (num == 0) { *s = '0'; s--; goto done; } while (num > 0) { *s = (num % 10) + '0'; s--; num /= 10; } if (neg) { *s = '-'; s--; } done: s++; *ps = s; return blen - (s - buf) - 1; } static void print_num(int num, struct simple_buf *b) { char buf[12], *s; std_vprint_num(buf, sizeof(buf), num, &s); print_string(s, b); } static void print_num_l(long num, struct simple_buf *b) { int neg = 0; char buf[22], *s; buf[21] = '\0'; s = &buf[20]; if (num < 0) { neg = 1; num = -num; } 
else if (num == 0) { *s = '0'; s--; goto done; } while (num > 0) { *s = (num % 10) + '0'; s--; num /= 10; } if (neg) { *s = '-'; s--; } done: s++; print_string(s, b); } static void print_num_u(unsigned long num, struct simple_buf *b) { char buf[22], *s; buf[21] = '\0'; s = &buf[21]; do { s--; *s = (num % 10) + '0'; num /= 10; } while (num > 0); print_string(s, b); } static void hexdigit(unsigned int v, char *to, char **z) { *to = "0123456789abcdef"[v & 0xf]; if (*to != '0') *z = to; } static void print_hex(unsigned int num, struct simple_buf *b) { char buf[11], *z = &buf[9]; buf[10] = '\0'; hexdigit(num >> 0, &buf[9], &z); hexdigit(num >> 4, &buf[8], &z); hexdigit(num >> 8, &buf[7], &z); hexdigit(num >> 12, &buf[6], &z); hexdigit(num >> 16, &buf[5], &z); hexdigit(num >> 20, &buf[4], &z); hexdigit(num >> 24, &buf[3], &z); hexdigit(num >> 28, &buf[2], &z); z -= 2; z[0] = '0'; z[1] = 'x'; print_string(z, b); } static void print_hex_l(unsigned long num, struct simple_buf *b) { char buf[19], *z = &buf[17]; buf[18] = '\0'; hexdigit(num >> 0, &buf[17], &z); hexdigit(num >> 4, &buf[16], &z); hexdigit(num >> 8, &buf[15], &z); hexdigit(num >> 12, &buf[14], &z); hexdigit(num >> 16, &buf[13], &z); hexdigit(num >> 20, &buf[12], &z); hexdigit(num >> 24, &buf[11], &z); hexdigit(num >> 28, &buf[10], &z); #if BITS_PER_LONG == 64 hexdigit(num >> 32, &buf[9], &z); hexdigit(num >> 36, &buf[8], &z); hexdigit(num >> 40, &buf[7], &z); hexdigit(num >> 44, &buf[6], &z); hexdigit(num >> 48, &buf[5], &z); hexdigit(num >> 52, &buf[4], &z); hexdigit(num >> 56, &buf[3], &z); hexdigit(num >> 60, &buf[2], &z); #endif z -= 2; z[0] = '0'; z[1] = 'x'; print_string(z, b); } static void sbuf_printf(struct simple_buf *b, const char *format, va_list args) { const char *s = format; while (1) { int along = 0; if (*s == '\0') break; if (*s != '%') { sbuf_putc(b, *s); s++; continue; } s++; if (*s == 'l') { along = 1; s++; if (*s == 'l') s++; } else if (*s == 'z') { along = (sizeof(size_t) > sizeof(int)); 
s++; } switch (*s) { case 's': print_string(va_arg(args, char *), b); break; case 'd': if (along) print_num_l(va_arg(args, long), b); else print_num(va_arg(args, int), b); break; case 'x': if (along) print_hex_l(va_arg(args, long), b); else print_hex(va_arg(args, unsigned int), b); break; case 'p': print_hex_l((unsigned long)va_arg(args, void *), b); break; case 'u': if (along) print_num_u(va_arg(args, unsigned long), b); else print_num_u(va_arg(args, unsigned), b); break; default: print_string("\nError: Unknown printf format %", b); sbuf_putc(b, *s); sbuf_putc(b, '\n'); return; } s++; } } void print_on_level(unsigned int loglevel, const char *format, ...) { va_list args; struct simple_buf b; if (loglevel > cur_loglevel) return; sbuf_log_init(&b); va_start(args, format); sbuf_printf(&b, format, args); va_end(args); sbuf_log_flush(&b); } void std_sprintf(char output[STD_LOG_SIMPLE_CHUNK], const char *format, ...) { va_list args; struct simple_buf b; char *p; b.bp = b.buf; b.flush = NULL; va_start(args, format); sbuf_printf(&b, format, args); va_end(args); *b.bp = 0; for (p = b.buf; p <= b.bp; p++) output[p - b.buf] = *p; } crac-criu-1.5.0/compel/plugins/std/std.c000066400000000000000000000026441471504326700200740ustar00rootroot00000000000000#include #include #include #include "asm/prologue.h" static struct prologue_init_args *init_args; static int ctl_socket = -1; int std_ctl_sock(void) { return ctl_socket; } static int init_socket(struct prologue_init_args *args) { int ret; ctl_socket = sys_socket(PF_UNIX, SOCK_SEQPACKET, 0); if (ctl_socket < 0) return ctl_socket; ret = sys_connect(ctl_socket, (struct sockaddr *)&args->ctl_sock_addr, args->ctl_sock_addr_len); if (ret < 0) return ret; return 0; } static int fini_socket(void) { char buf[32]; int ret = 0; ret = sys_shutdown(ctl_socket, SHUT_WR); if (ret) goto err; ret = sys_recv(ctl_socket, buf, sizeof(buf), MSG_WAITALL); if (ret) goto err; err: sys_close(ctl_socket); ctl_socket = -1; return ret; } #define 
plugin_init_count(size) ((size) / (sizeof(plugin_init_t *))) int __export_std_compel_start(struct prologue_init_args *args, const plugin_init_t *const *init_array, size_t init_size) { unsigned int i; int ret = 0; init_args = args; ret = init_socket(args); if (ret) return ret; for (i = 0; i < plugin_init_count(init_size); i++) { const plugin_init_t *d = init_array[i]; if (d && d->init) { ret = d->init(); if (ret) break; } } for (; i > 0; i--) { const plugin_init_t *d = init_array[i - 1]; if (d && d->exit) d->exit(); } fini_socket(); return ret; } PLUGIN_REGISTER_DUMMY(std) crac-criu-1.5.0/compel/plugins/std/string.c000066400000000000000000000117701471504326700206100ustar00rootroot00000000000000#include #include #include #include #include #include "features.h" static const char conv_tab[] = "0123456789abcdefghijklmnopqrstuvwxyz"; void std_dputc(int fd, char c) { sys_write(fd, &c, 1); } void std_dputs(int fd, const char *s) { for (; *s; s++) std_dputc(fd, *s); } static size_t __std_vprint_long_hex(char *buf, size_t blen, unsigned long num, char **ps) { char *s = &buf[blen - 2]; buf[blen - 1] = '\0'; if (num == 0) { *s = '0', s--; goto done; } while (num > 0) { *s = conv_tab[num % 16], s--; num /= 16; } done: s++; *ps = s; return blen - (s - buf); } static size_t __std_vprint_long(char *buf, size_t blen, long num, char **ps) { char *s = &buf[blen - 2]; int neg = 0; buf[blen - 1] = '\0'; if (num < 0) { neg = 1; num = -num; } else if (num == 0) { *s = '0'; s--; goto done; } while (num > 0) { *s = (num % 10) + '0'; s--; num /= 10; } if (neg) { *s = '-'; s--; } done: s++; *ps = s; return blen - (s - buf); } void std_vdprintf(int fd, const char *format, va_list args) { const char *s = format; for (; *s != '\0'; s++) { char buf[32], *t; int along = 0; if (*s != '%') { std_dputc(fd, *s); continue; } s++; if (*s == 'l') { along = 1; s++; if (*s == 'l') s++; } switch (*s) { case 's': std_dputs(fd, va_arg(args, char *)); break; case 'd': __std_vprint_long(buf, sizeof(buf), along 
? va_arg(args, long) : (long)va_arg(args, int), &t); std_dputs(fd, t); break; case 'x': __std_vprint_long_hex(buf, sizeof(buf), along ? va_arg(args, long) : (long)va_arg(args, int), &t); std_dputs(fd, t); break; } } } void std_dprintf(int fd, const char *format, ...) { va_list args; va_start(args, format); std_vdprintf(fd, format, args); va_end(args); } static inline bool __isspace(unsigned char c) { return c == ' ' || c == '\f' || c == '\n' || c == '\r' || c == '\t' || c == '\v'; } static unsigned char __tolower(unsigned char c) { return (c <= 'Z' && c >= 'A') ? c - 'A' + 'a' : c; } static inline bool __isalpha(unsigned char c) { return ((c <= 'Z' && c >= 'A') || (c <= 'z' && c >= 'a')); } static inline bool __isdigit(unsigned char c) { return (c <= '9' && c >= '0'); } static inline bool __isalnum(unsigned char c) { return (__isalpha(c) || __isdigit(c)); } static unsigned int __conv_val(unsigned char c) { if (__isdigit(c)) return c - '0'; else if (__isalpha(c)) /** * If we want the value of something which __isalpha() == true * it has to be base > 10. 'A' = 10, 'B' = 11 ... 'Z' = 35 */ return __tolower(c) - 'a' + 10; return -1u; } unsigned long std_strtoul(const char *nptr, char **endptr, int base) { const char *s = nptr; bool neg = false; unsigned int v; long num = 0; if (base < 0 || base == 1 || base > 36) goto fin; while (__isspace(*s)) s++; if (!*s) goto fin; if (*s == '-') neg = true, s++; if (base == 0) { if (s[0] == '0') { unsigned char p = __tolower(s[1]); switch (p) { case 'b': base = 2, s += 2; break; case 'x': base = 16, s += 2; break; default: base = 8, s += 1; break; } } else base = 10; } else if (base == 16) { if (s[0] == '0' && __tolower(s[1]) == 'x') s += 2; } for (; *s; s++) { if (__isspace(*s)) continue; if (!__isalnum(*s)) goto fin; v = __conv_val(*s); if (v == -1u || v > base) goto fin; num *= base; num += v; } fin: if (endptr) *endptr = (char *)s; return neg ? 
(unsigned long)-num : (unsigned long)num; } /* * C compiler is free to insert implicit calls to memcmp, memset, * memcpy and memmove, assuming they are available during linking. * As the parasite code is not linked with libc, it must provide * our own implementations of the above functions. * Surely, these functions can also be called explicitly. * * Note: for now, not having memmove() seems OK for both gcc and clang. */ #ifndef ARCH_HAS_MEMCPY void *memcpy(void *to, const void *from, size_t n) { size_t i; unsigned char *cto = to; const unsigned char *cfrom = from; for (i = 0; i < n; ++i, ++cto, ++cfrom) *cto = *cfrom; return to; } #endif #ifndef ARCH_HAS_MEMCMP int memcmp(const void *cs, const void *ct, size_t count) { const unsigned char *su1, *su2; int res = 0; for (su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--) if ((res = *su1 - *su2) != 0) break; return res; } #endif #ifndef ARCH_HAS_MEMSET void *memset(void *s, const int c, size_t count) { volatile char *dest = s; size_t i = 0; while (i < count) dest[i++] = (char)c; return s; } #endif int std_strcmp(const char *cs, const char *ct) { unsigned char c1, c2; while (1) { c1 = *cs++; c2 = *ct++; if (c1 != c2) return c1 < c2 ? -1 : 1; if (!c1) break; } return 0; } int std_strncmp(const char *cs, const char *ct, size_t count) { size_t i; for (i = 0; i < count; i++) { if (cs[i] != ct[i]) return cs[i] < ct[i] ? 
-1 : 1; if (!cs[i]) break; } return 0; } crac-criu-1.5.0/compel/src/000077500000000000000000000000001471504326700154445ustar00rootroot00000000000000crac-criu-1.5.0/compel/src/lib/000077500000000000000000000000001471504326700162125ustar00rootroot00000000000000crac-criu-1.5.0/compel/src/lib/handle-elf-host.c000077700000000000000000000000001471504326700234662handle-elf.custar00rootroot00000000000000crac-criu-1.5.0/compel/src/lib/handle-elf.c000066400000000000000000000562511471504326700203660ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "handle-elf.h" #include "piegen.h" #include "log.h" #ifdef CONFIG_MIPS #include "ldsodefs.h" #endif /* Check if pointer is out-of-bound */ static bool __ptr_oob(const uintptr_t ptr, const uintptr_t start, const size_t size) { uintptr_t end = start + size; return ptr >= end || ptr < start; } /* Check if pointed structure's end is out-of-bound */ static bool __ptr_struct_end_oob(const uintptr_t ptr, const size_t struct_size, const uintptr_t start, const size_t size) { /* the last byte of the structure should be inside [begin, end) */ return __ptr_oob(ptr + struct_size - 1, start, size); } /* Check if pointed structure is out-of-bound */ static bool __ptr_struct_oob(const uintptr_t ptr, const size_t struct_size, const uintptr_t start, const size_t size) { return __ptr_oob(ptr, start, size) || __ptr_struct_end_oob(ptr, struct_size, start, size); } static bool test_pointer(const void *ptr, const void *start, const size_t size, const char *name, const char *file, const int line) { if (__ptr_oob((const uintptr_t)ptr, (const uintptr_t)start, size)) { pr_err("Corrupted pointer %p (%s) at %s:%d\n", ptr, name, file, line); return true; } return false; } #define ptr_func_exit(__ptr) \ do { \ if (test_pointer((__ptr), mem, size, #__ptr, __FILE__, __LINE__)) { \ free(sec_hdrs); \ return -1; \ } \ } while (0) #ifdef ELF_PPC64 static int do_relative_toc(long 
value, uint16_t *location, unsigned long mask, int complain_signed) { if (complain_signed && (value + 0x8000 > 0xffff)) { pr_err("TOC16 relocation overflows (%ld)\n", value); return -1; } if ((~mask & 0xffff) & value) { pr_err("bad TOC16 relocation (%ld) (0x%lx)\n", value, (~mask & 0xffff) & value); return -1; } *location = (*location & ~mask) | (value & mask); return 0; } #endif static bool is_header_supported(Elf_Ehdr *hdr) { if (!arch_is_machine_supported(hdr->e_machine)) return false; if ((hdr->e_type != ET_REL #ifdef NO_RELOCS && hdr->e_type != ET_EXEC #endif ) || hdr->e_version != EV_CURRENT) return false; return true; } static const char *get_strings_section(Elf_Ehdr *hdr, uintptr_t mem, size_t size) { size_t sec_table_size = ((size_t)hdr->e_shentsize) * hdr->e_shnum; uintptr_t sec_table = mem + hdr->e_shoff; Elf_Shdr *secstrings_hdr; uintptr_t addr; if (__ptr_struct_oob(sec_table, sec_table_size, mem, size)) { pr_err("Section table [%#zx, %#zx) is out of [%#zx, %#zx)\n", sec_table, sec_table + sec_table_size, mem, mem + size); return NULL; } /* * strings section header's offset in section headers table is * (size of section header * index of string section header) */ addr = sec_table + ((size_t)hdr->e_shentsize) * hdr->e_shstrndx; if (__ptr_struct_oob(addr, sizeof(Elf_Shdr), sec_table, sec_table_size)) { pr_err("String section header @%#zx is out of [%#zx, %#zx)\n", addr, sec_table, sec_table + sec_table_size); return NULL; } secstrings_hdr = (void *)addr; addr = mem + secstrings_hdr->sh_offset; if (__ptr_struct_oob(addr, secstrings_hdr->sh_size, mem, size)) { pr_err("String section @%#zx size %#lx is out of [%#zx, %#zx)\n", addr, (unsigned long)secstrings_hdr->sh_size, mem, mem + size); return NULL; } return (void *)addr; } /* * This name @__handle_elf get renamed into * @handle_elf_ppc64 or say @handle_elf_x86_64 * depending on the architecture it's compiled * under. 
*/ int __handle_elf(void *mem, size_t size) { const char *symstrings = NULL; Elf_Shdr *symtab_hdr = NULL; Elf_Sym *symbols = NULL; Elf_Ehdr *hdr = mem; Elf_Shdr *strtab_hdr = NULL; Elf_Shdr **sec_hdrs = NULL; const char *secstrings; size_t i, k, nr_gotpcrel = 0; #ifdef ELF_PPC64 int64_t toc_offset = 0; #endif int ret = -EINVAL; unsigned long data_off = 0; pr_debug("Header\n"); pr_debug("------------\n"); pr_debug("\ttype 0x%x machine 0x%x version 0x%x\n", (unsigned)hdr->e_type, (unsigned)hdr->e_machine, (unsigned)hdr->e_version); if (!is_header_supported(hdr)) { pr_err("Unsupported header detected\n"); goto err; } sec_hdrs = malloc(sizeof(*sec_hdrs) * hdr->e_shnum); if (!sec_hdrs) { pr_err("No memory for section headers\n"); ret = -ENOMEM; goto err; } secstrings = get_strings_section(hdr, (uintptr_t)mem, size); if (!secstrings) goto err; pr_debug("Sections\n"); pr_debug("------------\n"); for (i = 0; i < hdr->e_shnum; i++) { Elf_Shdr *sh = mem + hdr->e_shoff + hdr->e_shentsize * i; ptr_func_exit(sh); if (sh->sh_type == SHT_SYMTAB) symtab_hdr = sh; ptr_func_exit(&secstrings[sh->sh_name]); pr_debug("\t index %-2zd type 0x%-2x name %s\n", i, (unsigned)sh->sh_type, &secstrings[sh->sh_name]); sec_hdrs[i] = sh; #ifdef ELF_PPC64 if (!strcmp(&secstrings[sh->sh_name], ".toc")) { toc_offset = sh->sh_addr + 0x8000; pr_debug("\t\tTOC offset 0x%lx\n", toc_offset); } #endif } /* Calculate section addresses with proper alignment. * Note: some but not all linkers precalculate this information. 
*/ for (i = 0, k = 0; i < hdr->e_shnum; i++) { Elf_Shdr *sh = sec_hdrs[i]; if (!(sh->sh_flags & SHF_ALLOC)) continue; if (sh->sh_addralign > 0 && k % sh->sh_addralign != 0) { k += sh->sh_addralign - k % sh->sh_addralign; } if (sh->sh_addr && sh->sh_addr != k) pr_info("Overriding unexpected precalculated address of section (section %s addr 0x%lx expected 0x%lx)\n", &secstrings[sh->sh_name], (unsigned long)sh->sh_addr, (unsigned long)k); sh->sh_addr = k; k += sh->sh_size; } if (!symtab_hdr) { pr_err("No symbol table present\n"); goto err; } if (!symtab_hdr->sh_link || symtab_hdr->sh_link >= hdr->e_shnum) { pr_err("Corrupted symtab header\n"); goto err; } pr_debug("Symbols\n"); pr_debug("------------\n"); strtab_hdr = sec_hdrs[symtab_hdr->sh_link]; ptr_func_exit(strtab_hdr); symbols = mem + symtab_hdr->sh_offset; ptr_func_exit(symbols); symstrings = mem + strtab_hdr->sh_offset; ptr_func_exit(symstrings); if (sizeof(*symbols) != symtab_hdr->sh_entsize) { pr_err("Symbol table align differ\n"); goto err; } pr_out("/* Autogenerated from %s */\n", opts.input_filename); pr_out("#include \n"); for (i = 0; i < symtab_hdr->sh_size / symtab_hdr->sh_entsize; i++) { Elf_Sym *sym = &symbols[i]; const char *name; Elf_Shdr *sh_src; ptr_func_exit(sym); name = &symstrings[sym->st_name]; ptr_func_exit(name); if (!*name) continue; pr_debug("\ttype 0x%-2x bind 0x%-2x shndx 0x%-4x value 0x%-2lx name %s\n", (unsigned)ELF_ST_TYPE(sym->st_info), (unsigned)ELF_ST_BIND(sym->st_info), (unsigned)sym->st_shndx, (unsigned long)sym->st_value, name); #ifdef ELF_PPC64 if (!sym->st_value && !strncmp(name, ".TOC.", 6)) { if (!toc_offset) { pr_err("No TOC pointer\n"); goto err; } sym->st_value = toc_offset; continue; } #endif if (strncmp(name, "__export", 8)) continue; if ((sym->st_shndx && sym->st_shndx < hdr->e_shnum) || sym->st_shndx == SHN_ABS) { if (sym->st_shndx == SHN_ABS) { sh_src = NULL; } else { sh_src = sec_hdrs[sym->st_shndx]; ptr_func_exit(sh_src); } pr_out("#define %s_sym%s 0x%lx\n", 
opts.prefix, name, (unsigned long)(sym->st_value + (sh_src ? sh_src->sh_addr : 0))); } } pr_out("static __maybe_unused compel_reloc_t %s_relocs[] = {\n", opts.prefix); #ifndef NO_RELOCS pr_debug("Relocations\n"); pr_debug("------------\n"); for (i = 0; i < hdr->e_shnum; i++) { Elf_Shdr *sh = sec_hdrs[i]; Elf_Shdr *sh_rel; if (sh->sh_type != SHT_REL && sh->sh_type != SHT_RELA) continue; sh_rel = sec_hdrs[sh->sh_info]; ptr_func_exit(sh_rel); pr_debug("\tsection %2zd type 0x%-2x link 0x%-2x info 0x%-2x name %s\n", i, (unsigned)sh->sh_type, (unsigned)sh->sh_link, (unsigned)sh->sh_info, &secstrings[sh->sh_name]); for (k = 0; k < sh->sh_size / sh->sh_entsize; k++) { int64_t __maybe_unused addend64, __maybe_unused value64; int32_t __maybe_unused addend32, __maybe_unused value32; unsigned long place; const char *name; void *where; Elf_Sym *sym; union { Elf_Rel rel; Elf_Rela rela; } *r = mem + sh->sh_offset + sh->sh_entsize * k; ptr_func_exit(r); sym = &symbols[ELF_R_SYM(r->rel.r_info)]; ptr_func_exit(sym); name = &symstrings[sym->st_name]; ptr_func_exit(name); where = mem + sh_rel->sh_offset + r->rel.r_offset; ptr_func_exit(where); pr_debug("\t\tr_offset 0x%-4lx r_info 0x%-4lx / sym 0x%-2lx type 0x%-2lx symsecoff 0x%-4lx\n", (unsigned long)r->rel.r_offset, (unsigned long)r->rel.r_info, (unsigned long)ELF_R_SYM(r->rel.r_info), (unsigned long)ELF_R_TYPE(r->rel.r_info), (unsigned long)sh_rel->sh_addr); if (sym->st_shndx == SHN_UNDEF) { #ifdef ELF_PPC64 /* On PowerPC, TOC symbols appear to be * undefined but should be processed as well. * Their type is STT_NOTYPE, so report any * other one. */ if (ELF32_ST_TYPE(sym->st_info) != STT_NOTYPE || strncmp(name, ".TOC.", 6)) { pr_err("Unexpected undefined symbol:%s\n", name); goto err; } #else pr_err("Unexpected undefined symbol: `%s'. 
External symbol in PIE?\n", name); goto err; #endif } else if (sym->st_shndx == SHN_COMMON) { /* * To support COMMON symbols, we could * allocate these variables somewhere, * perhaps somewhere near the GOT table. * For now, we punt. */ pr_err("Unsupported COMMON symbol: `%s'. Try initializing the variable\n", name); goto err; } if (sh->sh_type == SHT_REL) { addend32 = *(int32_t *)where; addend64 = *(int64_t *)where; } else { addend32 = (int32_t)r->rela.r_addend; addend64 = (int64_t)r->rela.r_addend; } place = sh_rel->sh_addr + r->rel.r_offset; pr_debug("\t\t\tvalue 0x%-8lx addend32 %-4d addend64 %-8ld place %-8lx symname %s\n", (unsigned long)sym->st_value, addend32, (long)addend64, (long)place, name); if (sym->st_shndx == SHN_ABS) { value32 = (int32_t)sym->st_value; value64 = (int64_t)sym->st_value; } else { Elf_Shdr *sh_src; if ((unsigned)sym->st_shndx > (unsigned)hdr->e_shnum) { pr_err("Unexpected symbol section index %u/%u\n", (unsigned)sym->st_shndx, hdr->e_shnum); goto err; } sh_src = sec_hdrs[sym->st_shndx]; ptr_func_exit(sh_src); value32 = (int32_t)sh_src->sh_addr + (int32_t)sym->st_value; value64 = (int64_t)sh_src->sh_addr + (int64_t)sym->st_value; } #ifdef ELF_PPC64 /* * Snippet from the OpenPOWER ABI for Linux Supplement: * * The OpenPOWER ABI uses the three most-significant bits in the symbol * st_other field specifies the number of instructions between a function's * global entry point and local entry point. The global entry point is used * when it is necessary to set up the TOC pointer (r2) for the function. The * local entry point is used when r2 is known to already be valid for the * function. A value of zero in these bits asserts that the function does * not use r2. * * The st_other values have the following meanings: * 0 and 1, the local and global entry points are the same. * 2, the local entry point is at 1 instruction past the global entry point. * 3, the local entry point is at 2 instructions past the global entry point. 
* 4, the local entry point is at 4 instructions past the global entry point. * 5, the local entry point is at 8 instructions past the global entry point. * 6, the local entry point is at 16 instructions past the global entry point. * 7, reserved. * * Here we are only handle the case '3' which is the most commonly seen. */ #define LOCAL_OFFSET(s) ((s->st_other >> 5) & 0x7) if (LOCAL_OFFSET(sym)) { if (LOCAL_OFFSET(sym) != 3) { pr_err("Unexpected local offset value %d\n", LOCAL_OFFSET(sym)); goto err; } pr_debug("\t\t\tUsing local offset\n"); value64 += 8; value32 += 8; } #endif switch (ELF_R_TYPE(r->rel.r_info)) { #ifdef CONFIG_MIPS case R_MIPS_PC16: /* s+a-p relative */ *((int32_t *)where) = *((int32_t *)where) | ((value32 + addend32 - place) >> 2); break; case R_MIPS_26: /* local : (((A << 2) | (P & 0xf0000000) + S) >> 2 * external : (sign–extend(A < 2) + S) >> 2 */ if (((unsigned)ELF_ST_BIND(sym->st_info) == 0x1) || ((unsigned)ELF_ST_BIND(sym->st_info) == 0x2)) { /* bind type local is 0x0 ,global is 0x1,WEAK is 0x2 */ addend32 = value32; } pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_MIPS_26, " ".addend = %-8d, .value = 0x%-16x, }, /* R_MIPS_26 */\n", (unsigned int)place, addend32, value32); break; case R_MIPS_32: /* S+A */ break; case R_MIPS_64: pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_MIPS_64, " ".addend = %-8ld, .value = 0x%-16lx, }, /* R_MIPS_64 */\n", (unsigned int)place, (long)addend64, (long)value64); break; case R_MIPS_HIGHEST: pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_MIPS_HIGHEST, " ".addend = %-8d, .value = 0x%-16x, }, /* R_MIPS_HIGHEST */\n", (unsigned int)place, addend32, value32); break; case R_MIPS_HIGHER: pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_MIPS_HIGHER, " ".addend = %-8d, .value = 0x%-16x, }, /* R_MIPS_HIGHER */\n", (unsigned int)place, addend32, value32); break; case R_MIPS_HI16: pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_MIPS_HI16, " ".addend = %-8d, .value = 0x%-16x, }, /* R_MIPS_HI16 */\n", (unsigned 
int)place, addend32, value32); break; case R_MIPS_LO16: if ((unsigned)ELF_ST_BIND(sym->st_info) == 0x1) { /* bind type local is 0x0 ,global is 0x1 */ addend32 = value32; } pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_MIPS_LO16, " ".addend = %-8d, .value = 0x%-16x, }, /* R_MIPS_LO16 */\n", (unsigned int)place, addend32, value32); break; #endif #ifdef ELF_PPC64 case R_PPC64_REL24: /* Update PC relative offset, linker has not done this yet */ pr_debug("\t\t\tR_PPC64_REL24 at 0x%-4lx val 0x%lx\n", place, value64); /* Convert value to relative */ value64 -= place; if (value64 + 0x2000000 > 0x3ffffff || (value64 & 3) != 0) { pr_err("REL24 %li out of range!\n", (long int)value64); goto err; } /* Only replace bits 2 through 26 */ *(uint32_t *)where = (*(uint32_t *)where & ~0x03fffffc) | (value64 & 0x03fffffc); break; case R_PPC64_ADDR32: case R_PPC64_REL32: pr_debug("\t\t\tR_PPC64_ADDR32 at 0x%-4lx val 0x%x\n", place, (unsigned int)(value32 + addend32)); pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_INT, " " .addend = %-8d, .value = 0x%-16x, " "}, /* R_PPC64_ADDR32 */\n", (unsigned int)place, addend32, value32); break; case R_PPC64_ADDR64: case R_PPC64_REL64: pr_debug("\t\t\tR_PPC64_ADDR64 at 0x%-4lx val 0x%lx\n", place, value64 + addend64); pr_out("\t{ .offset = 0x%-8x, .type = COMPEL_TYPE_LONG," " .addend = %-8ld, .value = 0x%-16lx, " "}, /* R_PPC64_ADDR64 */\n", (unsigned int)place, (long)addend64, (long)value64); break; case R_PPC64_TOC16_HA: pr_debug("\t\t\tR_PPC64_TOC16_HA at 0x%-4lx val 0x%lx\n", place, value64 + addend64 - toc_offset + 0x8000); if (do_relative_toc((value64 + addend64 - toc_offset + 0x8000) >> 16, where, 0xffff, 1)) goto err; break; case R_PPC64_TOC16_LO: pr_debug("\t\t\tR_PPC64_TOC16_LO at 0x%-4lx val 0x%lx\n", place, value64 + addend64 - toc_offset); if (do_relative_toc(value64 + addend64 - toc_offset, where, 0xffff, 1)) goto err; break; case R_PPC64_TOC16_LO_DS: pr_debug("\t\t\tR_PPC64_TOC16_LO_DS at 0x%-4lx val 0x%lx\n", place, value64 + 
addend64 - toc_offset); if (do_relative_toc(value64 + addend64 - toc_offset, where, 0xfffc, 0)) goto err; break; case R_PPC64_REL16_HA: value64 += addend64 - place; pr_debug("\t\t\tR_PPC64_REL16_HA at 0x%-4lx val 0x%lx\n", place, value64); /* check that we are dealing with the addis 2,12 instruction */ if (((*(uint32_t *)where) & 0xffff0000) != 0x3c4c0000) { pr_err("Unexpected instruction for R_PPC64_REL16_HA\n"); goto err; } *(uint16_t *)where = ((value64 + 0x8000) >> 16) & 0xffff; break; case R_PPC64_REL16_LO: value64 += addend64 - place; pr_debug("\t\t\tR_PPC64_REL16_LO at 0x%-4lx val 0x%lx\n", place, value64); /* check that we are dealing with the addi 2,2 instruction */ if (((*(uint32_t *)where) & 0xffff0000) != 0x38420000) { pr_err("Unexpected instruction for R_PPC64_REL16_LO\n"); goto err; } *(uint16_t *)where = value64 & 0xffff; break; #endif /* ELF_PPC64 */ #ifdef ELF_X86_64 case R_X86_64_32: /* Symbol + Addend (4 bytes) */ case R_X86_64_32S: /* Symbol + Addend (4 bytes) */ pr_debug("\t\t\t\tR_X86_64_32 at 0x%-4lx val 0x%x\n", place, value32); pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_INT, " ".addend = %-8d, .value = 0x%-16x, }, /* R_X86_64_32 */\n", (unsigned int)place, addend32, value32); break; case R_X86_64_64: /* Symbol + Addend (8 bytes) */ pr_debug("\t\t\t\tR_X86_64_64 at 0x%-4lx val 0x%lx\n", place, (long)value64); pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_LONG, " ".addend = %-8ld, .value = 0x%-16lx, }, /* R_X86_64_64 */\n", (unsigned int)place, (long)addend64, (long)value64); break; case R_X86_64_PC32: /* Symbol + Addend - Place (4 bytes) */ pr_debug("\t\t\t\tR_X86_64_PC32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (int32_t)place); /* * R_X86_64_PC32 are relative, patch them inplace. 
*/ *((int32_t *)where) = value32 + addend32 - place; break; case R_X86_64_PLT32: /* ProcLinkage + Addend - Place (4 bytes) */ pr_debug("\t\t\t\tR_X86_64_PLT32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (int32_t)place); /* * R_X86_64_PLT32 are relative, patch them inplace. */ *((int32_t *)where) = value32 + addend32 - place; break; case R_X86_64_GOTPCRELX: case R_X86_64_REX_GOTPCRELX: case R_X86_64_GOTPCREL: /* SymbolOffsetInGot + GOT + Addend - Place (4 bytes) */ pr_debug("\t\t\t\tR_X86_64_GOTPCREL at 0x%-4lx val 0x%x\n", place, value32); pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_LONG | COMPEL_TYPE_GOTPCREL, " ".addend = %-8d, .value = 0x%-16x, }, /* R_X86_64_GOTPCREL */\n", (unsigned int)place, addend32, value32); nr_gotpcrel++; break; #endif #ifdef ELF_X86_32 case R_386_32: /* Symbol + Addend */ pr_debug("\t\t\t\tR_386_32 at 0x%-4lx val 0x%x\n", place, value32 + addend32); pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_INT, " ".addend = %-4d, .value = 0x%x, },\n", (unsigned int)place, addend32, value32); break; case R_386_PC32: /* Symbol + Addend - Place */ pr_debug("\t\t\t\tR_386_PC32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (int32_t)place); /* * R_386_PC32 are relative, patch them inplace. */ *((int32_t *)where) = value32 + addend32 - place; break; #endif #ifdef ELF_S390 /* * See also arch/s390/kernel/module.c/apply_rela(): * A PLT reads the GOT (global offset table). We can handle it like * R_390_PC32DBL because we have linked statically. 
*/ case R_390_PLT32DBL: /* PC relative on a PLT (predure link table) */ pr_debug("\t\t\t\tR_390_PLT32DBL at 0x%-4lx val 0x%x\n", place, value32 + addend32); *((int32_t *)where) = (value64 + addend64 - place) >> 1; break; case R_390_PC32DBL: /* PC relative on a symbol */ pr_debug("\t\t\t\tR_390_PC32DBL at 0x%-4lx val 0x%x\n", place, value32 + addend32); *((int32_t *)where) = (value64 + addend64 - place) >> 1; break; case R_390_64: /* 64 bit absolute address */ pr_debug("\t\t\t\tR_390_64 at 0x%-4lx val 0x%lx\n", place, (long)value64); pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_LONG, " ".addend = %-8ld, .value = 0x%-16lx, }, /* R_390_64 */\n", (unsigned int)place, (long)addend64, (long)value64); break; case R_390_PC64: /* 64 bit relative address */ *((int64_t *)where) = value64 + addend64 - place; pr_debug("\t\t\t\tR_390_PC64 at 0x%-4lx val 0x%lx\n", place, (long)value64); break; #endif default: pr_err("Unsupported relocation of type %lu\n", (unsigned long)ELF_R_TYPE(r->rel.r_info)); goto err; } } } #endif /* !NO_RELOCS */ pr_out("};\n"); pr_out("static __maybe_unused const char %s_blob[] = {\n\t", opts.prefix); for (i = 0, k = 0; i < hdr->e_shnum; i++) { Elf_Shdr *sh = sec_hdrs[i]; unsigned char *shdata; size_t j; if (!(sh->sh_flags & SHF_ALLOC) || !sh->sh_size) continue; shdata = mem + sh->sh_offset; pr_debug("Copying section '%s'\n" "\tstart:0x%lx (gap:0x%lx) size:0x%lx\n", &secstrings[sh->sh_name], (unsigned long)sh->sh_addr, (unsigned long)(sh->sh_addr - k), (unsigned long)sh->sh_size); /* write 0 in the gap between the 2 sections */ for (; k < sh->sh_addr; k++) { if (k && (k % 8) == 0) pr_out("\n\t"); pr_out("0x00,"); } for (j = 0; j < sh->sh_size; j++, k++) { if (k && (k % 8) == 0) pr_out("\n\t"); pr_out("0x%02x,", shdata[j]); } if (!strcmp(&secstrings[sh->sh_name], ".data")) data_off = sh->sh_addr; } pr_out("};\n"); pr_out("\n"); pr_out("static void __maybe_unused %s_setup_c_header_desc(struct parasite_blob_desc *pbd, bool native)\n", opts.prefix); 
pr_out("{\n" " pbd->parasite_type = COMPEL_BLOB_CHEADER;\n"); pr_out("\tpbd->hdr.mem = %s_blob;\n", opts.prefix); pr_out("\tpbd->hdr.bsize = sizeof(%s_blob);\n", opts.prefix); pr_out("\tif (native)\n"); pr_out("\t\tpbd->hdr.parasite_ip_off = " "%s_sym__export_parasite_head_start;\n", opts.prefix); pr_out("#ifdef CONFIG_COMPAT\n"); pr_out("\telse\n"); pr_out("\t\tpbd->hdr.parasite_ip_off = " "%s_sym__export_parasite_head_start_compat;\n", opts.prefix); pr_out("#endif /* CONFIG_COMPAT */\n"); pr_out("\tpbd->hdr.cmd_off = %s_sym__export_parasite_service_cmd;\n", opts.prefix); pr_out("\tpbd->hdr.args_ptr_off = %s_sym__export_parasite_service_args_ptr;\n", opts.prefix); pr_out("\tpbd->hdr.got_off = round_up(pbd->hdr.bsize, sizeof(long));\n"); pr_out("\tpbd->hdr.args_off = pbd->hdr.got_off + %zd*sizeof(long);\n", nr_gotpcrel); pr_out("\tpbd->hdr.data_off = %#lx;\n", data_off); pr_out("\tpbd->hdr.relocs = %s_relocs;\n", opts.prefix); pr_out("\tpbd->hdr.nr_relocs = " "sizeof(%s_relocs) / sizeof(%s_relocs[0]);\n", opts.prefix, opts.prefix); pr_out("}\n"); pr_out("\n"); pr_out("static void __maybe_unused %s_setup_c_header(struct parasite_ctl *ctl)\n", opts.prefix); pr_out("{\n"); pr_out("\t%s_setup_c_header_desc(compel_parasite_blob_desc(ctl), compel_mode_native(ctl));\n", opts.prefix); pr_out("}\n"); ret = 0; err: free(sec_hdrs); return ret; } crac-criu-1.5.0/compel/src/lib/infect-rpc.c000066400000000000000000000037111471504326700204120ustar00rootroot00000000000000#include "log.h" #include "common/bug.h" #include "common/xmalloc.h" #include "common/lock.h" #include "infect.h" #include "infect-priv.h" #include "infect-rpc.h" #include "rpc-pie-priv.h" static int __parasite_send_cmd(int sockfd, struct ctl_msg *m) { int ret; BUILD_BUG_ON(PARASITE_USER_CMDS < __PARASITE_END_CMDS); ret = send(sockfd, m, sizeof(*m), 0); if (ret == -1) { pr_perror("Failed to send command %d to daemon", m->cmd); return -1; } else if (ret != sizeof(*m)) { pr_err("Message to daemon is trimmed 
(%d/%d)\n", (int)sizeof(*m), ret); return -1; } pr_debug("Sent msg to daemon %d %d %d\n", m->cmd, m->ack, m->err); return 0; } int parasite_wait_ack(int sockfd, unsigned int cmd, struct ctl_msg *m) { int ret; pr_debug("Wait for ack %d on daemon socket\n", cmd); while (1) { memzero(m, sizeof(*m)); ret = recv(sockfd, m, sizeof(*m), MSG_WAITALL); if (ret == -1) { pr_perror("Failed to read ack"); return -1; } else if (ret != sizeof(*m)) { pr_err("Message reply from daemon is trimmed (%d/%d)\n", (int)sizeof(*m), ret); return -1; } pr_debug("Fetched ack: %d %d %d\n", m->cmd, m->ack, m->err); if (m->cmd != cmd || m->ack != cmd) { pr_err("Communication error, this is not " "the ack we expected\n"); return -1; } return 0; } return -1; } int compel_rpc_sync(unsigned int cmd, struct parasite_ctl *ctl) { struct ctl_msg m; if (parasite_wait_ack(ctl->tsock, cmd, &m)) return -1; if (m.err != 0) { pr_err("Command %d for daemon failed with %d\n", cmd, m.err); return -1; } return 0; } int compel_rpc_call(unsigned int cmd, struct parasite_ctl *ctl) { struct ctl_msg m; m = ctl_msg_cmd(cmd); return __parasite_send_cmd(ctl->tsock, &m); } int compel_rpc_call_sync(unsigned int cmd, struct parasite_ctl *ctl) { int ret; ret = compel_rpc_call(cmd, ctl); if (!ret) ret = compel_rpc_sync(cmd, ctl); return ret; } int compel_rpc_sock(struct parasite_ctl *ctl) { return ctl->tsock; } crac-criu-1.5.0/compel/src/lib/infect-util.c000066400000000000000000000011251471504326700206000ustar00rootroot00000000000000#include "log.h" #include "common/bug.h" #include "common/lock.h" #include "uapi/compel/plugins/std/fds.h" #include "infect-rpc.h" #include "infect-util.h" uint64_t compel_run_id; int compel_util_send_fd(struct parasite_ctl *ctl, int fd) { int sk; sk = compel_rpc_sock(ctl); if (send_fd(sk, NULL, 0, fd) < 0) { pr_perror("Can't send file descriptor"); return -1; } return 0; } int compel_util_recv_fd(struct parasite_ctl *ctl, int *pfd) { int sk; sk = compel_rpc_sock(ctl); if ((*pfd = recv_fd(sk)) < 
0) { pr_perror("Can't send file descriptor"); return -1; } return 0; } crac-criu-1.5.0/compel/src/lib/infect.c000066400000000000000000001244011471504326700176300ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "log.h" #include "common/bug.h" #include "common/xmalloc.h" #include "common/lock.h" #include "common/page.h" #include #include #include "uapi/compel/plugins/std/syscall.h" #include "asm/infect-types.h" #include "asm/sigframe.h" #include "infect.h" #include "ptrace.h" #include "infect-rpc.h" #include "infect-priv.h" #include "infect-util.h" #include "rpc-pie-priv.h" #include "infect-util.h" #define __sys(foo) foo #define __sys_err(ret) (-errno) #include "common/scm.h" #include "common/scm-code.c" #ifndef UNIX_PATH_MAX #define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - (size_t)((struct sockaddr_un *)0)->sun_path) #endif #define PARASITE_STACK_SIZE (16 << 10) #ifndef SECCOMP_MODE_DISABLED #define SECCOMP_MODE_DISABLED 0 #endif static int prepare_thread(int pid, struct thread_ctx *ctx); static inline void close_safe(int *pfd) { if (*pfd > -1) { close(*pfd); *pfd = -1; } } static int parse_pid_status(int pid, struct seize_task_status *ss, void *data) { char aux[128]; FILE *f; sprintf(aux, "/proc/%d/status", pid); f = fopen(aux, "r"); if (!f) return -1; ss->ppid = -1; /* Not needed at this point */ ss->seccomp_mode = SECCOMP_MODE_DISABLED; while (fgets(aux, sizeof(aux), f)) { if (!strncmp(aux, "State:", 6)) { ss->state = aux[7]; continue; } #ifdef ENABLE_SECCOMP if (!strncmp(aux, "Seccomp:", 8)) { if (sscanf(aux + 9, "%d", &ss->seccomp_mode) != 1) goto err_parse; continue; } #endif //ENABLE_SECCOMP if (!strncmp(aux, "ShdPnd:", 7)) { if (sscanf(aux + 7, "%llx", &ss->shdpnd) != 1) goto err_parse; continue; } if (!strncmp(aux, "SigPnd:", 7)) { if (sscanf(aux + 7, "%llx", &ss->sigpnd) != 1) goto err_parse; continue; } if (!strncmp(aux, "SigBlk:", 7)) { if (sscanf(aux + 7, "%llx", 
&ss->sigblk) != 1) goto err_parse; continue; } } fclose(f); return 0; err_parse: fclose(f); return -1; } int compel_stop_task(int pid) { int ret; struct seize_task_status ss = {}; ret = compel_interrupt_task(pid); if (ret == 0) ret = compel_wait_task(pid, -1, parse_pid_status, NULL, &ss, NULL); return ret; } int compel_interrupt_task(int pid) { int ret; ret = ptrace(PTRACE_SEIZE, pid, NULL, 0); if (ret) { /* * ptrace API doesn't allow to distinguish * attaching to zombie from other errors. * All errors will be handled in compel_wait_task(). */ pr_warn("Unable to interrupt task: %d (%s)\n", pid, strerror(errno)); return ret; } /* * If we SEIZE-d the task stop it before going * and reading its stat from proc. Otherwise task * may die _while_ we're doing it and we'll have * inconsistent seize/state pair. * * If task dies after we seize it but before we * do this interrupt, we'll notice it via proc. */ ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL); if (ret < 0) { pr_warn("SEIZE %d: can't interrupt task: %s\n", pid, strerror(errno)); if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) pr_perror("Unable to detach from %d", pid); } return ret; } static int skip_sigstop(int pid, int nr_signals) { int i, status, ret; /* * 1) SIGSTOP is queued, but isn't handled yet: * SGISTOP can't be blocked, so we need to wait when the kernel * handles this signal. * * Otherwise the process will be stopped immediately after * starting it. * * 2) A seized task was stopped: * PTRACE_SEIZE doesn't affect signal or group stop state. * Currently ptrace reported that task is in stopped state. * We need to start task again, and it will be trapped * immediately, because we sent PTRACE_INTERRUPT to it. 
*/ for (i = 0; i < nr_signals; i++) { ret = ptrace(PTRACE_CONT, pid, 0, 0); if (ret) { pr_perror("Unable to start process"); return -1; } ret = wait4(pid, &status, __WALL, NULL); if (ret < 0) { pr_perror("SEIZE %d: can't wait task", pid); return -1; } if (!WIFSTOPPED(status)) { pr_err("SEIZE %d: task not stopped after seize\n", pid); return -1; } } return 0; } #define SIG_MASK(sig) (1ULL << ((sig)-1)) #define SIG_IN_MASK(sig, mask) ((sig) > 0 && (sig) <= SIGMAX && (SIG_MASK(sig) & (mask))) #define SUPPORTED_STOP_MASK ((1ULL << (SIGSTOP - 1)) | (1ULL << (SIGTSTP - 1))) static inline int sig_stop(int sig) { return SIG_IN_MASK(sig, SUPPORTED_STOP_MASK); } int compel_parse_stop_signo(int pid) { siginfo_t si; if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &si) < 0) { pr_perror("SEIZE %d: can't parse stopped siginfo", pid); return -1; } return si.si_signo; } /* * This routine seizes task putting it into a special * state where we can manipulate the task via ptrace * interface, and finally we can detach ptrace out of * of it so the task would not know if it was saddled * up with someone else. */ int compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_task_status *, void *), void (*free_status)(int pid, struct seize_task_status *, void *), struct seize_task_status *ss, void *data) { siginfo_t si; int status, nr_stopsig; int ret = 0, ret2, wait_errno = 0; /* * It's ugly, but the ptrace API doesn't allow to distinguish * attaching to zombie from other errors. Thus we have to parse * the target's /proc/pid/stat. Sad, but parse whatever else * we might need at that early point. */ try_again: ret = wait4(pid, &status, __WALL, NULL); if (ret < 0) { /* * wait4() can expectedly fail only in a first time * if a task is zombie. If we are here from try_again, * this means that we are tracing this task. * * So here we can be only once in this function. 
*/ wait_errno = errno; } ret2 = get_status(pid, ss, data); if (ret2) goto err; if (ret < 0 || WIFEXITED(status) || WIFSIGNALED(status)) { if (ss->state != 'Z') { if (pid == getpid()) pr_err("The criu itself is within dumped tree.\n"); else pr_err("Unseizable non-zombie %d found, state %c, err %d/%d\n", pid, ss->state, ret, wait_errno); return -1; } if (ret < 0) return COMPEL_TASK_ZOMBIE; else return COMPEL_TASK_DEAD; } if ((ppid != -1) && (ss->ppid != ppid)) { pr_err("Task pid reused while suspending (%d: %d -> %d)\n", pid, ppid, ss->ppid); goto err; } if (!WIFSTOPPED(status)) { pr_err("SEIZE %d: task not stopped after seize\n", pid); goto err; } ret = ptrace(PTRACE_GETSIGINFO, pid, NULL, &si); if (ret < 0) { pr_perror("SEIZE %d: can't read signfo", pid); goto err; } if (PTRACE_SI_EVENT(si.si_code) != PTRACE_EVENT_STOP) { /* * Kernel notifies us about the task being seized received some * event other than the STOP, i.e. -- a signal. Let the task * handle one and repeat. */ if (ptrace(PTRACE_CONT, pid, NULL, (void *)(unsigned long)si.si_signo)) { pr_perror("Can't continue signal handling, aborting"); goto err; } if (free_status) free_status(pid, ss, data); goto try_again; } if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACESYSGOOD)) { pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); return -1; } if (ss->seccomp_mode != SECCOMP_MODE_DISABLED && ptrace_suspend_seccomp(pid) < 0) goto err; /* * FIXME(issues/1429): parasite code contains instructions that trigger * SIGTRAP to stop at certain points. In such cases, the kernel sends a * force SIGTRAP that can't be ignored and if it is blocked, the kernel * resets its signal handler to a default one and unblocks it. It means * that if we want to save the origin signal handler, we need to run a * parasite code with the unblocked SIGTRAP. 
*/ if ((ss->sigpnd | ss->shdpnd) & (1 << (SIGTRAP - 1))) { pr_err("Can't dump the %d thread with a pending SIGTRAP.\n", pid); goto err; } nr_stopsig = 0; if (SIG_IN_MASK(SIGSTOP, ss->sigpnd)) nr_stopsig++; if (SIG_IN_MASK(SIGSTOP, ss->shdpnd)) nr_stopsig++; if (SIG_IN_MASK(SIGTSTP, ss->sigpnd) && !SIG_IN_MASK(SIGTSTP, ss->sigblk)) nr_stopsig++; if (SIG_IN_MASK(SIGTSTP, ss->shdpnd) && !SIG_IN_MASK(SIGTSTP, ss->sigblk)) nr_stopsig++; if (sig_stop(si.si_signo)) nr_stopsig++; if (nr_stopsig) { if (skip_sigstop(pid, nr_stopsig)) { /* * Make sure that the task is stopped by a supported stop signal and * send it again to restore task state before criu intervention. */ if (sig_stop(si.si_signo)) kill(pid, si.si_signo); else kill(pid, SIGSTOP); goto err; } return COMPEL_TASK_STOPPED; } if (si.si_signo == SIGTRAP) return COMPEL_TASK_ALIVE; else { pr_err("SEIZE %d: unsupported stop signal %d\n", pid, si.si_signo); goto err; } err: if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) pr_perror("Unable to detach from %d", pid); return -1; } int compel_resume_task(pid_t pid, int orig_st, int st) { return compel_resume_task_sig(pid, orig_st, st, SIGSTOP); } int compel_resume_task_sig(pid_t pid, int orig_st, int st, int stop_signo) { int ret = 0; pr_debug("\tUnseizing %d into %d\n", pid, st); if (st == COMPEL_TASK_DEAD) { kill(pid, SIGKILL); return 0; } else if (st == COMPEL_TASK_STOPPED) { /* * Task might have had STOP in queue. We detected such * guy as COMPEL_TASK_STOPPED, but cleared signal to run * the parasite code. Thus after detach the task will become * running. That said -- STOP everyone regardless of * the initial state. */ kill(pid, SIGSTOP); } else if (st == COMPEL_TASK_ALIVE) { /* * Same as in the comment above -- there might be a * task with STOP in queue that would get lost after * detach, so stop it again. */ if (orig_st == COMPEL_TASK_STOPPED) { /* * Check that stop_signo contain supported stop signal. * If it isn't, then send SIGSTOP. 
It makes sense in the case * when we get COMPEL_TASK_STOPPED from old image, * where stop_signo was not yet supported. */ if (sig_stop(stop_signo)) kill(pid, stop_signo); else kill(pid, SIGSTOP); } } else { pr_err("Unknown final state %d\n", st); ret = -1; } if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) { pr_perror("Unable to detach from %d", pid); return -1; } return ret; } static int gen_parasite_saddr(struct sockaddr_un *saddr, int key) { int sun_len; saddr->sun_family = AF_UNIX; snprintf(saddr->sun_path, UNIX_PATH_MAX, "X/crtools-pr-%d-%" PRIx64, key, compel_run_id); sun_len = SUN_LEN(saddr); *saddr->sun_path = '\0'; return sun_len; } static int prepare_tsock(struct parasite_ctl *ctl, pid_t pid, struct parasite_init_args *args) { int ssock = -1; socklen_t sk_len; struct sockaddr_un addr; pr_info("Putting tsock into pid %d\n", pid); args->h_addr_len = gen_parasite_saddr(&args->h_addr, getpid()); ssock = ctl->ictx.sock; sk_len = sizeof(addr); if (ssock == -1) { pr_err("No socket in ictx\n"); goto err; } if (getsockname(ssock, (struct sockaddr *)&addr, &sk_len) < 0) { pr_perror("Unable to get name for a socket"); return -1; } if (sk_len == sizeof(addr.sun_family)) { if (bind(ssock, (struct sockaddr *)&args->h_addr, args->h_addr_len) < 0) { pr_perror("Can't bind socket"); goto err; } if (listen(ssock, 1)) { pr_perror("Can't listen on transport socket"); goto err; } } /* Check a case when parasite can't initialize a command socket */ if (ctl->ictx.flags & INFECT_FAIL_CONNECT) args->h_addr_len = gen_parasite_saddr(&args->h_addr, getpid() + 1); /* * Set to -1 to prevent any accidental misuse. The * only valid user of it is accept_tsock(). 
*/ ctl->tsock = -ssock; return 0; err: close_safe(&ssock); return -1; } static int setup_child_handler(struct parasite_ctl *ctl) { struct sigaction sa = { .sa_sigaction = ctl->ictx.child_handler, .sa_flags = SA_SIGINFO | SA_RESTART, }; sigemptyset(&sa.sa_mask); sigaddset(&sa.sa_mask, SIGCHLD); if (sigaction(SIGCHLD, &sa, NULL)) { pr_perror("Unable to setup SIGCHLD handler"); return -1; } return 0; } static int restore_child_handler(struct parasite_ctl *ctl) { if (sigaction(SIGCHLD, &ctl->ictx.orig_handler, NULL)) { pr_perror("Unable to setup SIGCHLD handler"); return -1; } return 0; } static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, user_regs_struct_t *regs, struct thread_ctx *octx) { k_rtsigset_t block; ksigfillset(&block); /* * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler * will be reset to the default one. */ ksigdelset(&block, SIGTRAP); if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { pr_perror("Can't block signals for %d", pid); goto err_sig; } parasite_setup_regs(ip, stack, regs); if (ptrace_set_regs(pid, regs)) { pr_perror("Can't set registers for %d", pid); goto err_regs; } if (ptrace(cmd, pid, NULL, NULL)) { pr_perror("Can't run parasite at %d", pid); goto err_cont; } return 0; err_cont: if (ptrace_set_regs(pid, &octx->regs)) pr_perror("Can't restore regs for %d", pid); err_regs: if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &octx->sigmask)) pr_perror("Can't restore sigmask for %d", pid); err_sig: return -1; } static int restore_thread_ctx(int pid, struct thread_ctx *ctx, bool restore_ext_regs) { int ret = 0; if (ptrace_set_regs(pid, &ctx->regs)) { pr_perror("Can't restore registers (pid: %d)", pid); ret = -1; } if (restore_ext_regs && compel_set_task_ext_regs(pid, &ctx->ext_regs)) ret = -1; if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &ctx->sigmask)) { pr_perror("Can't block signals"); ret = -1; } return ret; } /* we run at @regs->ip */ static int 
parasite_trap(struct parasite_ctl *ctl, pid_t pid, user_regs_struct_t *regs, struct thread_ctx *octx, bool may_use_extended_regs) { siginfo_t siginfo; int status; int ret = -1; /* * Most ideas are taken from Tejun Heo's parasite thread * https://code.google.com/p/ptrace-parasite/ */ if (wait4(pid, &status, __WALL, NULL) != pid) { pr_perror("Waited pid mismatch (pid: %d)", pid); goto err; } if (!WIFSTOPPED(status)) { pr_err("Task is still running (pid: %d, status: 0x%x)\n", pid, status); goto err; } if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo)) { pr_perror("Can't get siginfo (pid: %d)", pid); goto err; } if (ptrace_get_regs(pid, regs)) { pr_perror("Can't obtain registers (pid: %d)", pid); goto err; } if (WSTOPSIG(status) != SIGTRAP || siginfo.si_code != ARCH_SI_TRAP) { pr_debug("** delivering signal %d si_code=%d\n", siginfo.si_signo, siginfo.si_code); pr_err("Unexpected %d task interruption, aborting\n", pid); goto err; } /* * We've reached this point if int3 is triggered inside our * parasite code. So we're done. */ ret = 0; err: if (restore_thread_ctx(pid, octx, may_use_extended_regs)) ret = -1; return ret; } int compel_execute_syscall(struct parasite_ctl *ctl, user_regs_struct_t *regs, const char *code_syscall) { pid_t pid = ctl->rpid; int err; uint8_t code_orig[BUILTIN_SYSCALL_SIZE]; /* * Inject syscall instruction and remember original code, * we will need it to restore original program content. 
*/ memcpy(code_orig, code_syscall, sizeof(code_orig)); if (ptrace_swap_area(pid, (void *)ctl->ictx.syscall_ip, (void *)code_orig, sizeof(code_orig))) { pr_err("Can't inject syscall blob (pid: %d)\n", pid); return -1; } err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig); if (!err) err = parasite_trap(ctl, pid, regs, &ctl->orig, false); if (ptrace_poke_area(pid, (void *)code_orig, (void *)ctl->ictx.syscall_ip, sizeof(code_orig))) { pr_err("Can't restore syscall blob (pid: %d)\n", ctl->rpid); err = -1; } return err; } int compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t *ret_regs) { user_regs_struct_t regs = ctl->orig.regs; int ret; ret = parasite_run(ctl->rpid, PTRACE_CONT, ip, 0, ®s, &ctl->orig); if (!ret) ret = parasite_trap(ctl, ctl->rpid, ret_regs ? ret_regs : ®s, &ctl->orig, false); return ret; } static int accept_tsock(struct parasite_ctl *ctl) { int sock; int ask = -ctl->tsock; /* this '-' is explained above */ sock = accept(ask, NULL, 0); if (sock < 0) { pr_perror("Can't accept connection to the transport socket"); close(ask); return -1; } ctl->tsock = sock; return 0; } static int parasite_init_daemon(struct parasite_ctl *ctl) { struct parasite_init_args *args; pid_t pid = ctl->rpid; user_regs_struct_t regs; struct ctl_msg m = {}; *ctl->cmd = PARASITE_CMD_INIT_DAEMON; args = compel_parasite_args(ctl, struct parasite_init_args); args->sigframe = (uintptr_t)ctl->rsigframe; args->log_level = compel_log_get_loglevel(); #ifdef ARCH_HAS_LONG_PAGES args->page_size = PAGE_SIZE; #endif futex_set(&args->daemon_connected, 0); if (prepare_tsock(ctl, pid, args)) goto err; /* after this we can catch parasite errors in chld handler */ if (setup_child_handler(ctl)) goto err; regs = ctl->orig.regs; if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, ®s, &ctl->orig)) goto err; futex_wait_while_eq(&args->daemon_connected, 0); if (futex_get(&args->daemon_connected) != 1) { errno = 
-(int)futex_get(&args->daemon_connected); pr_perror("Unable to connect a transport socket"); goto err; } if (accept_tsock(ctl) < 0) goto err; if (compel_util_send_fd(ctl, ctl->ictx.log_fd)) goto err; pr_info("Wait for parasite being daemonized...\n"); if (parasite_wait_ack(ctl->tsock, PARASITE_CMD_INIT_DAEMON, &m)) { pr_err("Can't switch parasite %d to daemon mode %d\n", pid, m.err); goto err; } ctl->sigreturn_addr = (void *)(uintptr_t)args->sigreturn_addr; ctl->daemonized = true; pr_info("Parasite %d has been switched to daemon mode\n", pid); return 0; err: return -1; } static int parasite_start_daemon(struct parasite_ctl *ctl) { pid_t pid = ctl->rpid; struct infect_ctx *ictx = &ctl->ictx; /* * Get task registers before going daemon, since the * compel_get_task_regs() needs to call ptrace on _stopped_ task, * while in daemon it is not such. */ if (compel_get_task_regs(pid, &ctl->orig.regs, NULL, ictx->save_regs, ictx->regs_arg, ictx->flags)) { pr_err("Can't obtain regs for thread %d\n", pid); return -1; } if (__compel_arch_fetch_thread_area(pid, &ctl->orig)) { pr_err("Can't get thread area of %d\n", pid); return -1; } if (ictx->make_sigframe(ictx->regs_arg, ctl->sigframe, ctl->rsigframe, &ctl->orig.sigmask)) return -1; if (parasite_init_daemon(ctl)) return -1; return 0; } static int parasite_mmap_exchange(struct parasite_ctl *ctl, unsigned long size, int remote_prot) { int fd; ctl->remote_map = remote_mmap(ctl, NULL, size, remote_prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0); if (!ctl->remote_map) { pr_err("Can't allocate memory for parasite blob (pid: %d)\n", ctl->rpid); return -1; } ctl->map_length = round_up(size, page_size()); fd = ctl->ictx.open_proc(ctl->rpid, O_RDWR, "map_files/%lx-%lx", (long)ctl->remote_map, (long)ctl->remote_map + ctl->map_length); if (fd < 0) return -1; ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FILE, fd, 0); close(fd); if (ctl->local_map == MAP_FAILED) { ctl->local_map = NULL; pr_perror("Can't map remote 
parasite map"); return -1; } return 0; } static void parasite_memfd_close(struct parasite_ctl *ctl, int fd) { bool compat = !compel_mode_native(ctl); long ret; int err; err = compel_syscall(ctl, __NR(close, compat), &ret, fd, 0, 0, 0, 0, 0); if (err || ret) pr_err("Can't close memfd\n"); } static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size, int remote_prot) { void *where = (void *)ctl->ictx.syscall_ip + BUILTIN_SYSCALL_SIZE; bool compat_task = !compel_mode_native(ctl); uint8_t orig_code[MEMFD_FNAME_SZ] = MEMFD_FNAME; pid_t pid = ctl->rpid; long sret = -ENOSYS; int ret, fd, lfd; if (ctl->ictx.flags & INFECT_NO_MEMFD) return 1; BUILD_BUG_ON(sizeof(orig_code) < sizeof(long)); if (ptrace_swap_area(pid, where, (void *)orig_code, sizeof(orig_code))) { pr_err("Can't inject memfd args (pid: %d)\n", pid); return -1; } ret = compel_syscall(ctl, __NR(memfd_create, compat_task), &sret, (unsigned long)where, 0, 0, 0, 0, 0); if (ptrace_poke_area(pid, orig_code, where, sizeof(orig_code))) { fd = (int)sret; if (fd >= 0) parasite_memfd_close(ctl, fd); pr_err("Can't restore memfd args (pid: %d)\n", pid); return -1; } if (ret < 0) return ret; fd = (int)sret; if (fd == -ENOSYS) return 1; if (fd < 0) { errno = -fd; pr_perror("Can't create memfd in victim"); return fd; } ctl->map_length = round_up(size, page_size()); lfd = ctl->ictx.open_proc(ctl->rpid, O_RDWR, "fd/%d", fd); if (lfd < 0) goto err_cure; if (ftruncate(lfd, ctl->map_length) < 0) { pr_perror("Fail to truncate memfd for parasite"); goto err_cure; } ctl->remote_map = remote_mmap(ctl, NULL, size, remote_prot, MAP_FILE | MAP_SHARED, fd, 0); if (!ctl->remote_map) { pr_err("Can't rmap memfd for parasite blob\n"); goto err_curef; } ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FILE, lfd, 0); if (ctl->local_map == MAP_FAILED) { ctl->local_map = NULL; pr_perror("Can't lmap memfd for parasite blob"); goto err_curef; } parasite_memfd_close(ctl, fd); close(lfd); pr_info("Set up 
parasite blob using memfd\n"); return 0; err_curef: close(lfd); err_cure: parasite_memfd_close(ctl, fd); return -1; } void compel_relocs_apply(void *mem, void *vbase, struct parasite_blob_desc *pbd) { compel_reloc_t *elf_relocs = pbd->hdr.relocs; size_t nr_relocs = pbd->hdr.nr_relocs; size_t i, j; void **got = mem + pbd->hdr.got_off; /* * parasite_service() reads the value of __export_parasite_service_args_ptr. * The reason it is set here is that semantically, we are doing a symbol * resolution on parasite_service_args, and it turns out to be relocatable. */ *(void **)(mem + pbd->hdr.args_ptr_off) = vbase + pbd->hdr.args_off; #ifdef CONFIG_MIPS compel_relocs_apply_mips(mem, vbase, pbd); #else for (i = 0, j = 0; i < nr_relocs; i++) { if (elf_relocs[i].type & COMPEL_TYPE_LONG) { long *where = mem + elf_relocs[i].offset; if (elf_relocs[i].type & COMPEL_TYPE_GOTPCREL) { int *value = (int *)where; int rel; got[j] = vbase + elf_relocs[i].value; rel = (unsigned)((void *)&got[j] - (void *)mem) - elf_relocs[i].offset + elf_relocs[i].addend; *value = rel; j++; } else *where = elf_relocs[i].value + elf_relocs[i].addend + (unsigned long)vbase; } else if (elf_relocs[i].type & COMPEL_TYPE_INT) { int *where = (mem + elf_relocs[i].offset); *where = elf_relocs[i].value + elf_relocs[i].addend + (unsigned long)vbase; } else BUG(); } #endif } long remote_mprotect(struct parasite_ctl *ctl, void *addr, size_t len, int prot) { long ret; int err; bool compat_task = !user_regs_native(&ctl->orig.regs); err = compel_syscall(ctl, __NR(mprotect, compat_task), &ret, (unsigned long)addr, len, prot, 0, 0, 0); if (err < 0) { pr_err("compel_syscall for mprotect failed\n"); return -1; } return ret; } static int compel_map_exchange(struct parasite_ctl *ctl, unsigned long size) { int ret, remote_prot; if (ctl->pblob.hdr.data_off) remote_prot = PROT_READ | PROT_EXEC; else remote_prot = PROT_READ | PROT_WRITE | PROT_EXEC; ret = parasite_memfd_exchange(ctl, size, remote_prot); if (ret == 1) { 
pr_info("MemFD parasite doesn't work, goto legacy mmap\n"); ret = parasite_mmap_exchange(ctl, size, remote_prot); if (ret) return ret; } if (!ctl->pblob.hdr.data_off) return 0; ret = remote_mprotect(ctl, ctl->remote_map + ctl->pblob.hdr.data_off, size - ctl->pblob.hdr.data_off, PROT_READ | PROT_WRITE); if (ret) pr_err("remote_mprotect failed\n"); return ret; } int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) { int ret; unsigned long p, map_exchange_size, parasite_size = 0; if (ctl->pblob.parasite_type != COMPEL_BLOB_CHEADER) goto err; if (ctl->ictx.log_fd < 0) goto err; if (!arch_can_dump_task(ctl)) goto err; /* * Inject a parasite engine. Ie allocate memory inside alien * space and copy engine code there. Then re-map the engine * locally, so we will get an easy way to access engine memory * without using ptrace at all. */ /* * The parasite memory layout is the following: * Low address start first. * The number in parenthesis denotes the size of the section. * The arrow on the right shows the different variables that * corresponds to a given offset. 
* +------------------------------------------------------+ <--- 0 * | Parasite blob (sizeof(parasite_blob)) | * +------------------------------------------------------+ <--- hdr.bsize * align 8 * +------------------------------------------------------+ <--- hdr.got_off * | GOT Table (nr_gotpcrel * sizeof(long)) | * +------------------------------------------------------+ <--- hdr.args_off * | Args area (args_size) | * +------------------------------------------------------+ * align 64 * +------------------------------------------------------+ <--- ctl->rsigframe * | sigframe (RESTORE_STACK_SIGFRAME) | ctl->sigframe * +------------------------------------------------------+ * | main stack (PARASITE_STACK_SIZE) | * +------------------------------------------------------+ <--- ctl->rstack * | compel_run_in_thread stack (PARASITE_STACK_SIZE) | * +------------------------------------------------------+ <--- ctl->r_thread_stack * map_exchange_size */ parasite_size = ctl->pblob.hdr.args_off; ctl->args_size = args_size; parasite_size += ctl->args_size; /* RESTORE_STACK_SIGFRAME needs a 64 bytes alignment */ parasite_size = round_up(parasite_size, 64); map_exchange_size = parasite_size; map_exchange_size += RESTORE_STACK_SIGFRAME + PARASITE_STACK_SIZE; if (nr_threads > 1) map_exchange_size += PARASITE_STACK_SIZE; ret = compel_map_exchange(ctl, map_exchange_size); if (ret) goto err; pr_info("Putting parasite blob into %p->%p\n", ctl->local_map, ctl->remote_map); ctl->parasite_ip = (unsigned long)(ctl->remote_map + ctl->pblob.hdr.parasite_ip_off); ctl->cmd = ctl->local_map + ctl->pblob.hdr.cmd_off; ctl->args = ctl->local_map + ctl->pblob.hdr.args_off; /* * args must be 4 bytes aligned as we use futexes() on them. It is * already the case, as args follows the GOT table, which is 8 bytes * aligned. 
*/ if ((unsigned long)ctl->args & (4 - 1)) { pr_err("BUG: args are not 4 bytes aligned: %p\n", ctl->args); goto err; } memcpy(ctl->local_map, ctl->pblob.hdr.mem, ctl->pblob.hdr.bsize); compel_relocs_apply(ctl->local_map, ctl->remote_map, &ctl->pblob); p = parasite_size; ctl->rsigframe = ctl->remote_map + p; ctl->sigframe = ctl->local_map + p; p += RESTORE_STACK_SIGFRAME; p += PARASITE_STACK_SIZE; ctl->rstack = ctl->remote_map + p; /* * x86-64 ABI requires a 16 bytes aligned stack. * It is already the case as RESTORE_STACK_SIGFRAME is a multiple of * 64, and PARASITE_STACK_SIZE is 0x4000. */ if ((unsigned long)ctl->rstack & (16 - 1)) { pr_err("BUG: stack is not 16 bytes aligned: %p\n", ctl->rstack); goto err; } if (nr_threads > 1) { p += PARASITE_STACK_SIZE; ctl->r_thread_stack = ctl->remote_map + p; } ret = arch_fetch_sas(ctl, ctl->rsigframe); if (ret) { pr_err("Can't fetch sigaltstack for task %d (ret %d)\n", ctl->rpid, ret); goto err; } return 0; err: return -1; } int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) { if (compel_infect_no_daemon(ctl, nr_threads, args_size)) return -1; if (parasite_start_daemon(ctl)) return -1; return 0; } struct parasite_thread_ctl *compel_prepare_thread(struct parasite_ctl *ctl, int pid) { struct parasite_thread_ctl *tctl; tctl = xmalloc(sizeof(*tctl)); if (tctl) { if (prepare_thread(pid, &tctl->th)) { xfree(tctl); tctl = NULL; } else { tctl->tid = pid; tctl->ctl = ctl; } } return tctl; } static int prepare_thread(int pid, struct thread_ctx *ctx) { if (ptrace(PTRACE_GETSIGMASK, pid, sizeof(k_rtsigset_t), &ctx->sigmask)) { pr_perror("can't get signal blocking mask for %d", pid); return -1; } if (ptrace_get_regs(pid, &ctx->regs)) { pr_perror("Can't obtain registers (pid: %d)", pid); return -1; } return 0; } void compel_release_thread(struct parasite_thread_ctl *tctl) { /* * No stuff to cure in thread here, all routines leave the * guy intact (for now) */ xfree(tctl); } struct parasite_ctl 
*compel_prepare_noctx(int pid) { struct parasite_ctl *ctl = NULL; /* * Control block early setup. */ ctl = xzalloc(sizeof(*ctl)); if (!ctl) { pr_err("Parasite control block allocation failed (pid: %d)\n", pid); goto err; } ctl->tsock = -1; ctl->ictx.log_fd = -1; if (prepare_thread(pid, &ctl->orig)) goto err; ctl->rpid = pid; BUILD_BUG_ON(PARASITE_START_AREA_MIN < BUILTIN_SYSCALL_SIZE + MEMFD_FNAME_SZ); return ctl; err: xfree(ctl); return NULL; } /* * Find first executable VMA that would fit the initial * syscall injection. */ static unsigned long find_executable_area(int pid) { char aux[128]; FILE *f; unsigned long ret = (unsigned long)MAP_FAILED; sprintf(aux, "/proc/%d/maps", pid); f = fopen(aux, "r"); if (!f) goto out; while (fgets(aux, sizeof(aux), f)) { unsigned long start, end; char *f; start = strtoul(aux, &f, 16); end = strtoul(f + 1, &f, 16); /* f now points at " rwx" (yes, with space) part */ if (f[3] == 'x') { BUG_ON(end - start < PARASITE_START_AREA_MIN); ret = start; break; } } fclose(f); out: return ret; } /* * This routine is to create PF_UNIX/SOCK_SEQPACKET socket * in the target net namespace */ static int make_sock_for(int pid) { int ret, mfd, fd, sk = -1; char p[32]; pr_debug("Preparing seqsk for %d\n", pid); sprintf(p, "/proc/%d/ns/net", pid); fd = open(p, O_RDONLY); if (fd < 0) { pr_perror("Can't open %p", p); goto out; } mfd = open("/proc/self/ns/net", O_RDONLY); if (mfd < 0) { pr_perror("Can't open self netns"); goto out_c; } if (setns(fd, CLONE_NEWNET)) { pr_perror("Can't setup target netns"); goto out_cm; } sk = socket(PF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK, 0); if (sk < 0) pr_perror("Can't create seqsk"); ret = setns(mfd, CLONE_NEWNET); if (ret) { pr_perror("Can't restore former netns"); if (sk >= 0) close(sk); sk = -1; } out_cm: close(mfd); out_c: close(fd); out: return sk; } static int simple_open_proc(int pid, int mode, const char *fmt, ...) 
{ int l; char path[128]; va_list args; l = sprintf(path, "/proc/%d/", pid); va_start(args, fmt); vsnprintf(path + l, sizeof(path) - l, fmt, args); va_end(args); return open(path, mode); } static void handle_sigchld(int signal, siginfo_t *siginfo, void *data) { int pid, status; pid = waitpid(-1, &status, WNOHANG); if (pid <= 0) return; pr_err("si_code=%d si_pid=%d si_status=%d\n", siginfo->si_code, siginfo->si_pid, siginfo->si_status); if (WIFEXITED(status)) pr_err("%d exited with %d unexpectedly\n", pid, WEXITSTATUS(status)); else if (WIFSIGNALED(status)) pr_err("%d was killed by %d unexpectedly: %s\n", pid, WTERMSIG(status), strsignal(WTERMSIG(status))); else if (WIFSTOPPED(status)) pr_err("%d was stopped by %d unexpectedly\n", pid, WSTOPSIG(status)); /* FIXME Should we exit? */ /* exit(1); */ } struct plain_regs_struct { user_regs_struct_t regs; user_fpregs_struct_t fpregs; }; static int save_regs_plain(void *to, user_regs_struct_t *r, user_fpregs_struct_t *f) { struct plain_regs_struct *prs = to; prs->regs = *r; prs->fpregs = *f; return 0; } static int make_sigframe_plain(void *from, struct rt_sigframe *f, struct rt_sigframe *rtf, k_rtsigset_t *b) { struct plain_regs_struct *prs = from; /* * Make sure it's zeroified. */ memset(f, 0, sizeof(*f)); if (sigreturn_prep_regs_plain(f, &prs->regs, &prs->fpregs)) return -1; if (b) rt_sigframe_copy_sigset(f, b); if (RT_SIGFRAME_HAS_FPU(f)) { if (sigreturn_prep_fpu_frame_plain(f, rtf)) return -1; } /* * FIXME What about sas? 
* setup_sas(sigframe, core->thread_core->sas); */ return 0; } struct parasite_ctl *compel_prepare(int pid) { struct parasite_ctl *ctl; struct infect_ctx *ictx; ctl = compel_prepare_noctx(pid); if (ctl == NULL) goto out; ictx = &ctl->ictx; ictx->task_size = compel_task_size(); ictx->open_proc = simple_open_proc; ictx->syscall_ip = find_executable_area(pid); ictx->child_handler = handle_sigchld; sigaction(SIGCHLD, NULL, &ictx->orig_handler); ictx->save_regs = save_regs_plain; ictx->make_sigframe = make_sigframe_plain; ictx->regs_arg = xmalloc(sizeof(struct plain_regs_struct)); if (ictx->regs_arg == NULL) goto err; if (ictx->syscall_ip == (unsigned long)MAP_FAILED) goto err; ictx->sock = make_sock_for(pid); if (ictx->sock < 0) goto err; out: return ctl; err: xfree(ictx->regs_arg); xfree(ctl); ctl = NULL; goto out; } static bool task_in_parasite(struct parasite_ctl *ctl, user_regs_struct_t *regs) { void *addr = (void *)REG_IP(*regs); return addr >= ctl->remote_map && addr < ctl->remote_map + ctl->map_length; } static int parasite_fini_seized(struct parasite_ctl *ctl) { pid_t pid = ctl->rpid; user_regs_struct_t regs; int status, ret = 0; /* stop getting chld from parasite -- we're about to step-by-step it */ if (restore_child_handler(ctl)) return -1; /* Start to trace syscalls for each thread */ if (ptrace(PTRACE_INTERRUPT, pid, NULL, NULL)) { pr_perror("Unable to interrupt the process"); return -1; } pr_debug("Waiting for %d to trap\n", pid); if (wait4(pid, &status, __WALL, NULL) != pid) { pr_perror("Waited pid mismatch (pid: %d)", pid); return -1; } pr_debug("Daemon %d exited trapping\n", pid); if (!WIFSTOPPED(status)) { pr_err("Task is still running (pid: %d, status: 0x%x)\n", pid, status); return -1; } ret = ptrace_get_regs(pid, ®s); if (ret) { pr_perror("Unable to get registers"); return -1; } if (!task_in_parasite(ctl, ®s)) { pr_err("The task is not in parasite code\n"); return -1; } ret = compel_rpc_call(PARASITE_CMD_FINI, ctl); close_safe(&ctl->tsock); if (ret) 
return -1; /* Go to sigreturn as closer as we can */ ret = compel_stop_pie(pid, ctl->sigreturn_addr, ctl->ictx.flags & INFECT_NO_BREAKPOINTS); if (ret < 0) return ret; if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1))) return -1; /* * All signals are unblocked now. The kernel notifies about leaving * syscall before starting to deliver signals. All parasite code are * executed with blocked signals, so we can sefly unmap a parasite blob. */ return 0; } int compel_start_daemon(struct parasite_ctl *ctl) { return parasite_start_daemon(ctl); } int compel_stop_daemon(struct parasite_ctl *ctl) { if (ctl->daemonized) { /* * Looks like a previous attempt failed, we should do * nothing in this case. parasite will try to cure itself. */ if (ctl->tsock < 0) return -1; if (parasite_fini_seized(ctl)) { close_safe(&ctl->tsock); return -1; } } ctl->daemonized = false; return 0; } int compel_cure_remote(struct parasite_ctl *ctl) { long ret; int err; if (compel_stop_daemon(ctl)) return -1; if (!ctl->remote_map) return 0; err = compel_syscall(ctl, __NR(munmap, !compel_mode_native(ctl)), &ret, (unsigned long)ctl->remote_map, ctl->map_length, 0, 0, 0, 0); if (err) return err; if (ret) { pr_err("munmap for remote map %p, %lu returned %lu\n", ctl->remote_map, ctl->map_length, ret); return -1; } return 0; } int compel_cure_local(struct parasite_ctl *ctl) { int ret = 0; if (ctl->local_map) { if (munmap(ctl->local_map, ctl->map_length)) { pr_err("munmap failed (pid: %d)\n", ctl->rpid); ret = -1; } } free(ctl); return ret; } int compel_cure(struct parasite_ctl *ctl) { int ret; ret = compel_cure_remote(ctl); if (!ret) ret = compel_cure_local(ctl); return ret; } void *compel_parasite_args_p(struct parasite_ctl *ctl) { return ctl->args; } void *compel_parasite_args_s(struct parasite_ctl *ctl, unsigned long args_size) { BUG_ON(args_size > ctl->args_size); return compel_parasite_args_p(ctl); } int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd) { 
int pid = tctl->tid; struct parasite_ctl *ctl = tctl->ctl; struct thread_ctx *octx = &tctl->th; void *stack = ctl->r_thread_stack; user_regs_struct_t regs = octx->regs; int ret; *ctl->cmd = cmd; ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, ®s, octx); if (ret == 0) ret = parasite_trap(ctl, pid, ®s, octx, true); if (ret == 0) ret = (int)REG_RES(regs); if (ret) pr_err("Parasite exited with %d\n", ret); return ret; } /* * compel_unmap() is used for unmapping parasite and restorer blobs. * A blob can contain code for unmapping itself, so the process is * trapped on the exit from the munmap syscall. */ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) { user_regs_struct_t regs = ctl->orig.regs; pid_t pid = ctl->rpid; int ret = -1; ret = parasite_run(pid, PTRACE_SYSCALL, addr, ctl->rstack, ®s, &ctl->orig); if (ret) goto err; ret = compel_stop_on_syscall(1, __NR(munmap, 0), __NR(munmap, 1)); /* * Don't touch extended registers here: they were restored * with rt_sigreturn from sigframe. */ if (restore_thread_ctx(pid, &ctl->orig, false)) ret = -1; err: return ret; } int compel_stop_pie(pid_t pid, void *addr, bool no_bp) { int ret; if (no_bp) { pr_debug("Force no-breakpoints restore of %d\n", pid); ret = 0; } else ret = ptrace_set_breakpoint(pid, addr); if (ret < 0) return ret; if (ret > 0) { /* * PIE will stop on a breakpoint, next * stop after that will be syscall enter. */ return 0; } /* * No breakpoints available -- start tracing it * in a per-syscall manner. 
*/ ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL); if (ret) { pr_perror("Unable to restart the %d process", pid); return -1; } return 0; } static bool task_is_trapped(int status, pid_t pid) { if (WIFSTOPPED(status) && (WSTOPSIG(status) & ~PTRACE_SYSCALL_TRAP) == SIGTRAP) return true; pr_err("Task %d is in unexpected state: %x\n", pid, status); if (WIFEXITED(status)) pr_err("Task exited with %d\n", WEXITSTATUS(status)); if (WIFSIGNALED(status)) pr_err("Task signaled with %d: %s\n", WTERMSIG(status), strsignal(WTERMSIG(status))); if (WIFSTOPPED(status)) pr_err("Task stopped with %d: %s\n", WSTOPSIG(status), strsignal(WSTOPSIG(status))); if (WIFCONTINUED(status)) pr_err("Task continued\n"); return false; } static inline int is_required_syscall(user_regs_struct_t *regs, pid_t pid, const int sys_nr, const int sys_nr_compat) { const char *mode = user_regs_native(regs) ? "native" : "compat"; int req_sysnr = user_regs_native(regs) ? sys_nr : sys_nr_compat; pr_debug("%d (%s) is going to execute the syscall %lu, required is %d\n", pid, mode, REG_SYSCALL_NR(*regs), req_sysnr); return (REG_SYSCALL_NR(*regs) == req_sysnr); } /* * Trap tasks on the exit from the specified syscall * * tasks - number of processes, which should be trapped * sys_nr - the required syscall number * sys_nr_compat - the required compatible syscall number */ int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat) { enum trace_flags trace = tasks > 1 ? TRACE_ALL : TRACE_ENTER; user_regs_struct_t regs; int status, ret; pid_t pid; /* Stop all threads on the enter point in sys_rt_sigreturn */ while (tasks) { pid = wait4(-1, &status, __WALL, NULL); if (pid == -1) { pr_perror("wait4 failed"); return -1; } if (!task_is_trapped(status, pid)) return -1; pr_debug("%d was trapped\n", pid); if ((WSTOPSIG(status) & PTRACE_SYSCALL_TRAP) == 0) { /* * On some platforms such as ARM64, it is impossible to * pass through a breakpoint, so let's clear it right * after it has been triggered. 
*/ if (ptrace_flush_breakpoints(pid)) { pr_err("Unable to clear breakpoints\n"); return -1; } goto goon; } if (trace == TRACE_EXIT) { trace = TRACE_ENTER; pr_debug("`- Expecting exit\n"); goto goon; } if (trace == TRACE_ENTER) trace = TRACE_EXIT; ret = ptrace_get_regs(pid, ®s); if (ret) { pr_perror("ptrace"); return -1; } if (is_required_syscall(®s, pid, sys_nr, sys_nr_compat)) { /* * The process is going to execute the required syscall, * the next stop will be on the exit from this syscall */ ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL); if (ret) { pr_perror("ptrace"); return -1; } pid = wait4(pid, &status, __WALL, NULL); if (pid == -1) { pr_perror("wait4 failed"); return -1; } if (!task_is_trapped(status, pid)) return -1; pr_debug("%d was stopped\n", pid); tasks--; continue; } goon: ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL); if (ret) { pr_perror("ptrace"); return -1; } } return 0; } int compel_mode_native(struct parasite_ctl *ctl) { return user_regs_native(&ctl->orig.regs); } static inline k_rtsigset_t *thread_ctx_sigmask(struct thread_ctx *tctx) { return &tctx->sigmask; } k_rtsigset_t *compel_thread_sigmask(struct parasite_thread_ctl *tctl) { return thread_ctx_sigmask(&tctl->th); } k_rtsigset_t *compel_task_sigmask(struct parasite_ctl *ctl) { return thread_ctx_sigmask(&ctl->orig); } int compel_get_thread_regs(struct parasite_thread_ctl *tctl, save_regs_t save, void *arg) { return compel_get_task_regs(tctl->tid, &tctl->th.regs, &tctl->th.ext_regs, save, arg, tctl->ctl->ictx.flags); } struct infect_ctx *compel_infect_ctx(struct parasite_ctl *ctl) { return &ctl->ictx; } struct parasite_blob_desc *compel_parasite_blob_desc(struct parasite_ctl *ctl) { return &ctl->pblob; } uint64_t compel_get_leader_sp(struct parasite_ctl *ctl) { return REG_SP(ctl->orig.regs); } uint64_t compel_get_thread_sp(struct parasite_thread_ctl *tctl) { return REG_SP(tctl->th.regs); } uint64_t compel_get_leader_ip(struct parasite_ctl *ctl) { return REG_IP(ctl->orig.regs); } uint64_t 
compel_get_thread_ip(struct parasite_thread_ctl *tctl) { return REG_IP(tctl->th.regs); } void compel_set_leader_ip(struct parasite_ctl *ctl, uint64_t v) { SET_REG_IP(ctl->orig.regs, v); } void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v) { SET_REG_IP(tctl->th.regs, v); } void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack) { if (rstack) *rstack = ctl->rstack; if (r_thread_stack) *r_thread_stack = ctl->r_thread_stack; } crac-criu-1.5.0/compel/src/lib/log-host.c000077700000000000000000000000001471504326700210322log.custar00rootroot00000000000000crac-criu-1.5.0/compel/src/lib/log.c000066400000000000000000000012541471504326700171410ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "log.h" static unsigned int current_loglevel = COMPEL_DEFAULT_LOGLEVEL; static compel_log_fn logfn; void compel_log_init(compel_log_fn log_fn, unsigned int level) { logfn = log_fn; current_loglevel = level; } unsigned int compel_log_get_loglevel(void) { return current_loglevel; } void compel_print_on_level(unsigned int loglevel, const char *format, ...) 
{ va_list params; compel_log_fn fn = logfn; if (fn != NULL && !pr_quelled(loglevel)) { va_start(params, format); fn(loglevel, format, params); va_end(params); } } crac-criu-1.5.0/compel/src/lib/ptrace.c000066400000000000000000000050321471504326700176340ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/compiler.h" #include "uapi/compel/asm/infect-types.h" #include "ptrace.h" #include "log.h" int ptrace_suspend_seccomp(pid_t pid) { if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD) < 0) { pr_perror("suspending seccomp failed"); return -1; } return 0; } int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes) { unsigned long w; int old_errno = errno; if (bytes & (sizeof(long) - 1)) { pr_err("Peek request with non-word size %ld\n", bytes); return -1; } errno = 0; for (w = 0; w < bytes / sizeof(long); w++) { unsigned long *d = dst, *a = addr; d[w] = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL); if (d[w] == -1U && errno) { pr_perror("PEEKDATA failed"); goto err; } } errno = old_errno; return 0; err: return -errno; } int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes) { unsigned long w; if (bytes & (sizeof(long) - 1)) { pr_err("Poke request with non-word size %ld\n", bytes); return -1; } for (w = 0; w < bytes / sizeof(long); w++) { unsigned long *s = src, *a = addr; if (ptrace(PTRACE_POKEDATA, pid, a + w, s[w])) { pr_perror("POKEDATA failed"); goto err; } } return 0; err: return -errno; } /* don't swap big space, it might overflow the stack */ int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes) { void *t = alloca(bytes); int err; err = ptrace_peek_area(pid, t, dst, bytes); if (err) return err; err = ptrace_poke_area(pid, src, dst, bytes); if (err) { int err2; pr_err("Can't poke %d @ %p from %p sized %ld\n", pid, dst, src, bytes); err2 = ptrace_poke_area(pid, t, dst, 
bytes); if (err2) { pr_err("Can't restore the original data with poke\n"); return err2; } return err; } memcpy(src, t, bytes); return 0; } int __attribute__((weak)) ptrace_get_regs(int pid, user_regs_struct_t *regs) { struct iovec iov; iov.iov_base = regs; iov.iov_len = sizeof(user_regs_struct_t); return ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); } int __attribute__((weak)) ptrace_set_regs(int pid, user_regs_struct_t *regs) { struct iovec iov; iov.iov_base = regs; iov.iov_len = sizeof(user_regs_struct_t); return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); } crac-criu-1.5.0/compel/src/main-host.c000077700000000000000000000000001471504326700205722main.custar00rootroot00000000000000crac-criu-1.5.0/compel/src/main.c000066400000000000000000000215311471504326700165360ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "version.h" #include "piegen.h" #include "log.h" #define CFLAGS_DEFAULT_SET \ "-Wstrict-prototypes " \ "-ffreestanding " \ "-fno-stack-protector -nostdlib -fomit-frame-pointer " #define COMPEL_CFLAGS_PIE CFLAGS_DEFAULT_SET "-fpie" #define COMPEL_CFLAGS_NOPIC CFLAGS_DEFAULT_SET "-fno-pic" #ifdef NO_RELOCS #define COMPEL_LDFLAGS_COMMON "-z noexecstack -T " #else #define COMPEL_LDFLAGS_COMMON "-r -z noexecstack -T " #endif typedef struct { const char *arch; // dir name under arch/ const char *cflags; const char *cflags_compat; } flags_t; static const flags_t flags = { #if defined CONFIG_X86_64 .arch = "x86", .cflags = COMPEL_CFLAGS_PIE, .cflags_compat = COMPEL_CFLAGS_NOPIC, #elif defined CONFIG_AARCH64 .arch = "aarch64", .cflags = COMPEL_CFLAGS_PIE, #elif defined(CONFIG_ARMV6) || defined(CONFIG_ARMV7) .arch = "arm", .cflags = COMPEL_CFLAGS_PIE, #elif defined CONFIG_PPC64 .arch = "ppc64", .cflags = COMPEL_CFLAGS_PIE, #elif defined CONFIG_S390 .arch = "s390", .cflags = COMPEL_CFLAGS_PIE, #elif defined CONFIG_MIPS .arch = "mips", .cflags = COMPEL_CFLAGS_PIE, 
#elif defined CONFIG_LOONGARCH64 .arch = "loongarch64", .cflags = COMPEL_CFLAGS_PIE, #else #error "CONFIG_ not defined, or unsupported ARCH" #endif }; piegen_opt_t opts = {}; const char *uninst_root; static int piegen(void) { struct stat st; void *mem; int fd, ret = -1; fd = open(opts.input_filename, O_RDONLY); if (fd < 0) { pr_perror("Can't open file %s", opts.input_filename); return -1; } if (fstat(fd, &st)) { pr_perror("Can't stat file %s", opts.input_filename); goto err; } opts.fout = fopen(opts.output_filename, "w"); if (opts.fout == NULL) { pr_perror("Can't open %s", opts.output_filename); goto err; } mem = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE, fd, 0); if (mem == MAP_FAILED) { pr_perror("Can't mmap file %s", opts.input_filename); goto err; } if (handle_binary(mem, st.st_size)) { unlink(opts.output_filename); goto err; } ret = 0; err: close(fd); if (opts.fout) fclose(opts.fout); if (!ret) pr_info("%s generated successfully.\n", opts.output_filename); return ret; } static void cli_log(unsigned int lvl, const char *fmt, va_list parms) { FILE *f = stdout; if (pr_quelled(lvl)) return; if ((lvl == COMPEL_LOG_ERROR) || (lvl == COMPEL_LOG_WARN)) f = stderr; vfprintf(f, fmt, parms); } static int usage(int rc) { FILE *out = (rc == 0) ? 
stdout : stderr; fprintf(out, "Usage:\n" " compel [--compat] includes | cflags | ldflags\n" " compel plugins [PLUGIN_NAME ...]\n" " compel [--compat] [--static] libs\n" " compel -f FILE -o FILE [-p NAME] [-l N] hgen\n" " -f, --file FILE input (parasite object) file name\n" " -o, --output FILE output (header) file name\n" " -p, --prefix NAME prefix for var names\n" " -l, --log-level NUM log level (default: %d)\n" " compel -h|--help\n" " compel -V|--version\n", COMPEL_DEFAULT_LOGLEVEL); return rc; } static void print_includes(void) { int i; /* list of standard include dirs (built into C preprocessor) */ const char *standard_includes[] = { "/usr/include", "/usr/local/include", }; /* I am not installed, called via a wrapper */ if (uninst_root) { printf("-I %s/include/uapi\n", uninst_root); return; } /* I am installed * Make sure to not print banalities */ for (i = 0; i < ARRAY_SIZE(standard_includes); i++) if (strcmp(INCLUDEDIR, standard_includes[i]) == 0) return; /* Finally, print our non-standard include path */ printf("%s\n", "-I " INCLUDEDIR); } static void print_cflags(bool compat) { printf("%s\n", compat ? flags.cflags_compat : flags.cflags); print_includes(); } static void print_ldflags(bool compat) { const char *compat_str = (compat) ? 
"-compat" : ""; printf("%s", COMPEL_LDFLAGS_COMMON); if (uninst_root) { printf("%s/arch/%s/scripts/compel-pack%s.lds.S\n", uninst_root, flags.arch, compat_str); } else { printf("%s/compel/scripts/compel-pack%s.lds.S\n", LIBEXECDIR, compat_str); } } static void print_plugin(const char *name) { const char suffix[] = ".lib.a"; if (uninst_root) printf("%s/plugins/%s%s\n", uninst_root, name, suffix); else printf("%s/compel/%s%s\n", LIBEXECDIR, name, suffix); } static void print_plugins(char *const list[]) { char *builtin_list[] = { "std", NULL }; char **p = builtin_list; while (*p != NULL) print_plugin(*p++); while (*list != NULL) print_plugin(*list++); } static int print_libs(bool is_static) { if (uninst_root) { if (!is_static) { fprintf(stderr, "Compel is not installed, can " "only link with static libraries " "(use --static)\n"); return 1; } printf("%s/%s\n", uninst_root, STATIC_LIB); } else { printf("%s/%s\n", LIBDIR, (is_static) ? STATIC_LIB : DYN_LIB); } return 0; } /* Extracts the file name (removing directory path and suffix, * and checks the result for being a valid C identifier * (replacing - with _ along the way). * * If everything went fine, return the resulting string, * otherwise NULL. * * Example: get_prefix("./some/path/to/file.c") ==> "file" */ static char *gen_prefix(const char *path) { const char *p1 = NULL, *p2 = NULL; size_t len; int i; char *p, *ret; len = strlen(path); if (len == 0) return NULL; // Find the last slash (p1) // and the first dot after it (p2) for (i = len - 1; i >= 0; i--) { if (!p1 && path[i] == '.') { p2 = path + i - 1; } else if (!p1 && path[i] == '/') { p1 = path + i + 1; break; } } if (!p1) // no slash in path p1 = path; if (!p2) // no dot (after slash) p2 = path + len; len = p2 - p1 + 1; if (len < 1) return NULL; ret = strndup(p1, len); // Now, check if we got a valid C identifier. We don't need to care // about C reserved keywords, as this is only used as a prefix. 
for (p = ret; *p != '\0'; p++) { if (isalpha(*p)) continue; // digit is fine, except the first character if (isdigit(*p) && p > ret) continue; // only allowed special character is _ if (*p == '_') continue; // as a courtesy, replace - with _ if (*p == '-') { *p = '_'; continue; } // invalid character! free(ret); return NULL; } return ret; } int main(int argc, char *argv[]) { int log_level = COMPEL_DEFAULT_LOGLEVEL; bool compat = false; bool is_static = false; int opt, idx; char *action; static const char short_opts[] = "csf:o:p:hVl:"; static struct option long_opts[] = { { "compat", no_argument, 0, 'c' }, { "static", no_argument, 0, 's' }, { "file", required_argument, 0, 'f' }, { "output", required_argument, 0, 'o' }, { "prefix", required_argument, 0, 'p' }, { "help", no_argument, 0, 'h' }, { "version", no_argument, 0, 'V' }, { "log-level", required_argument, 0, 'l' }, {}, }; uninst_root = getenv("COMPEL_UNINSTALLED_ROOTDIR"); while (1) { idx = -1; opt = getopt_long(argc, argv, short_opts, long_opts, &idx); if (opt == -1) break; switch (opt) { case 'c': compat = true; break; case 's': is_static = true; break; case 'f': opts.input_filename = optarg; break; case 'o': opts.output_filename = optarg; break; case 'p': opts.prefix = optarg; break; case 'l': log_level = atoi(optarg); break; case 'h': return usage(0); case 'V': printf("Version: %d.%d.%d\n", COMPEL_SO_VERSION_MAJOR, COMPEL_SO_VERSION_MINOR, COMPEL_SO_VERSION_SUBLEVEL); exit(0); default: // '?' 
// error message already printed by getopt_long() return usage(1); } } if (optind >= argc) { fprintf(stderr, "Error: action argument required\n"); return usage(1); } action = argv[optind++]; if (!strcmp(action, "includes")) { print_includes(); return 0; } if (!strcmp(action, "cflags")) { print_cflags(compat); return 0; } if (!strcmp(action, "ldflags")) { print_ldflags(compat); return 0; } if (!strcmp(action, "plugins")) { print_plugins(argv + optind); return 0; } if (!strcmp(action, "libs")) { return print_libs(is_static); } if (!strcmp(action, "hgen")) { if (!opts.input_filename) { fprintf(stderr, "Error: option --file required\n"); return usage(1); } if (!opts.output_filename) { fprintf(stderr, "Error: option --output required\n"); return usage(1); } if (!opts.prefix) { // prefix not provided, let's autogenerate opts.prefix = gen_prefix(opts.input_filename); if (!opts.prefix) opts.prefix = gen_prefix(opts.output_filename); if (!opts.prefix) { fprintf(stderr, "Error: can't autogenerate " "prefix (supply --prefix)"); return 2; } } compel_log_init(&cli_log, log_level); return piegen(); } fprintf(stderr, "Error: unknown action '%s'\n", action); return usage(1); } crac-criu-1.5.0/compel/test/000077500000000000000000000000001471504326700156345ustar00rootroot00000000000000crac-criu-1.5.0/compel/test/Makefile000066400000000000000000000004751471504326700173020ustar00rootroot00000000000000all: fdspy infect rsys stack fdspy: $(Q) $(MAKE) -C fdspy $(Q) $(MAKE) -C fdspy run .PHONY: fdspy infect: $(Q) $(MAKE) -C infect $(Q) $(MAKE) -C infect run .PHONY: infect rsys: $(Q) $(MAKE) -C rsys $(Q) $(MAKE) -C rsys run .PHONY: rsys stack: $(Q) $(MAKE) -C stack $(Q) $(MAKE) -C stack run .PHONY: stack crac-criu-1.5.0/compel/test/fdspy/000077500000000000000000000000001471504326700167615ustar00rootroot00000000000000crac-criu-1.5.0/compel/test/fdspy/.gitignore000066400000000000000000000000421471504326700207450ustar00rootroot00000000000000parasite.h parasite.po spy victim 
crac-criu-1.5.0/compel/test/fdspy/Makefile000066400000000000000000000011211471504326700204140ustar00rootroot00000000000000CC := gcc CFLAGS ?= -O2 -g -Wall -Werror COMPEL := ../../../compel/compel-host all: victim spy run: ./spy .PHONY: run clean: rm -f victim rm -f spy rm -f parasite.h rm -f parasite.po rm -f parasite.o victim: victim.c $(CC) $(CFLAGS) -o $@ $^ spy: spy.c parasite.h $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) parasite.h: parasite.po $(COMPEL) hgen -o $@ -f $< parasite.po: parasite.o ld $(shell $(COMPEL) ldflags) -o $@ $^ $(shell $(COMPEL) plugins fds) parasite.o: parasite.c $(CC) $(CFLAGS) -c $(shell $(COMPEL) cflags) -o $@ $^ crac-criu-1.5.0/compel/test/fdspy/parasite.c000066400000000000000000000007071471504326700207410ustar00rootroot00000000000000#include #include #include /* * Stubs for std compel plugin. */ int compel_main(void *arg_p, unsigned int arg_s) { return 0; } int parasite_trap_cmd(int cmd, void *args) { return 0; } void parasite_cleanup(void) { } #define PARASITE_CMD_GETFD PARASITE_USER_CMDS int parasite_daemon_cmd(int cmd, void *args) { if (cmd == PARASITE_CMD_GETFD) return (fds_send_fd(2) < 0); return 0; } crac-criu-1.5.0/compel/test/fdspy/spy.c000066400000000000000000000074201471504326700177430ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "parasite.h" #define PARASITE_CMD_GETFD PARASITE_USER_CMDS static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) { printf("\tLC%u: ", lvl); vprintf(fmt, parms); } static int do_infection(int pid, int *stolen_fd) { #define err_and_ret(msg) \ do { \ fprintf(stderr, msg); \ return -1; \ } while (0) int state; struct parasite_ctl *ctl; struct infect_ctx *ictx; compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); printf("Stopping task\n"); state = compel_stop_task(pid); if (state < 0) err_and_ret("Can't stop task"); printf("Preparing parasite ctl\n"); ctl = 
compel_prepare(pid); if (!ctl) err_and_ret("Can't prepare for infection"); printf("Configuring contexts\n"); /* * First -- the infection context. Most of the stuff * is already filled by compel_prepare(), just set the * log descriptor for parasite side, library cannot * live w/o it. */ ictx = compel_infect_ctx(ctl); ictx->log_fd = STDERR_FILENO; parasite_setup_c_header(ctl); printf("Infecting\n"); if (compel_infect(ctl, 1, sizeof(int))) err_and_ret("Can't infect victim"); printf("Stealing fd\n"); if (compel_rpc_call(PARASITE_CMD_GETFD, ctl)) err_and_ret("Can't run cmd"); if (compel_util_recv_fd(ctl, stolen_fd)) err_and_ret("Can't recv fd"); if (compel_rpc_sync(PARASITE_CMD_GETFD, ctl)) err_and_ret("Con't finalize cmd"); printf("Stole %d fd\n", *stolen_fd); /* * Done. Cure and resume the task. */ printf("Curing\n"); if (compel_cure(ctl)) err_and_ret("Can't cure victim"); if (compel_resume_task(pid, state, state)) err_and_ret("Can't unseize task"); printf("Done\n"); return 0; } static int check_pipe_ends(int wfd, int rfd) { struct stat r, w; char aux[4] = "0000"; printf("Check pipe ends are at hands\n"); if (fstat(wfd, &w) < 0) { perror("Can't stat wfd"); return 0; } if (fstat(rfd, &r) < 0) { perror("Can't stat rfd"); return 0; } if (w.st_dev != r.st_dev || w.st_ino != r.st_ino) { perror("Pipe's not the same"); return 0; } printf("Check pipe ends are connected\n"); if (write(wfd, "1", 2) != 2) { fprintf(stderr, "write to pipe failed\n"); return 0; } if (read(rfd, aux, sizeof(aux)) != sizeof(aux)) { fprintf(stderr, "read from pipe failed\n"); return 0; } if (aux[0] != '1' || aux[1] != '\0') { fprintf(stderr, "Pipe connectivity lost\n"); return 0; } return 1; } int main(int argc, char **argv) { int p_in[2], p_out[2], p_err[2], pid, pass = 1, stolen_fd = -1; /* * Prepare IO-s and fork the victim binary */ if (pipe(p_in) || pipe(p_out) || pipe(p_err)) { perror("Can't make pipe"); return -1; } printf("Run the victim\n"); pid = vfork(); if (pid == 0) { close(p_in[1]); 
dup2(p_in[0], 0); close(p_in[0]); close(p_out[0]); dup2(p_out[1], 1); close(p_out[1]); close(p_err[0]); dup2(p_err[1], 2); close(p_err[1]); execl("./victim", "victim", NULL); exit(1); } close(p_in[0]); close(p_out[1]); close(p_err[1]); /* * Now do the infection with parasite.c */ printf("Infecting the victim\n"); if (do_infection(pid, &stolen_fd)) return 1; /* * Stop the victim and check the infection went well */ printf("Closing victim stdin\n"); close(p_in[1]); printf("Waiting for victim to die\n"); wait(NULL); printf("Checking the result\n"); /* * Stolen fd is the stderr of the task * Check these are the ends of the same pipe * and message passing works OK */ pass = check_pipe_ends(stolen_fd, p_err[0]); if (pass) printf("All OK\n"); else printf("Something went WRONG\n"); return 0; } crac-criu-1.5.0/compel/test/fdspy/victim.c000066400000000000000000000002031471504326700204130ustar00rootroot00000000000000#include int main(int argc, char **argv) { int i, aux; do { i = read(0, &aux, 1); } while (i > 0); return 0; } crac-criu-1.5.0/compel/test/infect/000077500000000000000000000000001471504326700171045ustar00rootroot00000000000000crac-criu-1.5.0/compel/test/infect/.gitignore000066400000000000000000000000421471504326700210700ustar00rootroot00000000000000parasite.h parasite.po spy victim crac-criu-1.5.0/compel/test/infect/Makefile000066400000000000000000000011151471504326700205420ustar00rootroot00000000000000CC := gcc CFLAGS ?= -O2 -g -Wall -Werror COMPEL := ../../../compel/compel-host all: victim spy run: ./spy .PHONY: run clean: rm -f victim rm -f spy rm -f parasite.h rm -f parasite.po rm -f parasite.o victim: victim.c $(CC) $(CFLAGS) -o $@ $^ spy: spy.c parasite.h $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) parasite.h: parasite.po $(COMPEL) hgen -o $@ -f $< parasite.po: parasite.o ld $(shell $(COMPEL) ldflags) -o $@ $^ $(shell $(COMPEL) plugins) parasite.o: parasite.c $(CC) $(CFLAGS) -c $(shell $(COMPEL) cflags) -o $@ $^ 
crac-criu-1.5.0/compel/test/infect/parasite.c000066400000000000000000000010701471504326700210560ustar00rootroot00000000000000#include #include #include /* * Stubs for std compel plugin. */ int parasite_trap_cmd(int cmd, void *args) { return 0; } void parasite_cleanup(void) { } #define PARASITE_CMD_INC PARASITE_USER_CMDS #define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 int parasite_daemon_cmd(int cmd, void *args) { int v; switch (cmd) { case PARASITE_CMD_INC: v = (*(int *)args) + 1; break; case PARASITE_CMD_DEC: v = (*(int *)args) - 1; break; default: v = -1; break; } sys_write(1, &v, sizeof(int)); return 0; } crac-criu-1.5.0/compel/test/infect/spy.c000066400000000000000000000073661471504326700200770ustar00rootroot00000000000000#include #include #include #include #include #include #include "parasite.h" #define PARASITE_CMD_INC PARASITE_USER_CMDS #define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) { printf("\tLC%u: ", lvl); vprintf(fmt, parms); } static int do_infection(int pid) { #define err_and_ret(msg) \ do { \ fprintf(stderr, msg); \ return -1; \ } while (0) int state; struct parasite_ctl *ctl; struct infect_ctx *ictx; int *arg; compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); printf("Stopping task\n"); state = compel_stop_task(pid); if (state < 0) err_and_ret("Can't stop task"); printf("Preparing parasite ctl\n"); ctl = compel_prepare(pid); if (!ctl) err_and_ret("Can't prepare for infection"); printf("Configuring contexts\n"); /* * First -- the infection context. Most of the stuff * is already filled by compel_prepare(), just set the * log descriptor for parasite side, library cannot * live w/o it. */ ictx = compel_infect_ctx(ctl); ictx->log_fd = STDERR_FILENO; parasite_setup_c_header(ctl); printf("Infecting\n"); if (compel_infect(ctl, 1, sizeof(int))) err_and_ret("Can't infect victim"); /* * Now get the area with arguments and run two * commands one by one. 
*/ arg = compel_parasite_args(ctl, int); printf("Running cmd 1\n"); *arg = 137; if (compel_rpc_call_sync(PARASITE_CMD_INC, ctl)) err_and_ret("Can't run parasite command 1"); printf("Running cmd 2\n"); *arg = 404; if (compel_rpc_call_sync(PARASITE_CMD_DEC, ctl)) err_and_ret("Can't run parasite command 2"); /* * Done. Cure and resume the task. */ printf("Curing\n"); if (compel_cure(ctl)) err_and_ret("Can't cure victim"); if (compel_resume_task(pid, state, state)) err_and_ret("Can't unseize task"); printf("Done\n"); return 0; } static inline int chk(int fd, int val) { int v = 0; if (read(fd, &v, sizeof(v)) != sizeof(v)) return 1; printf("%d, want %d\n", v, val); return v != val; } int main(int argc, char **argv) { int p_in[2], p_out[2], p_err[2], pid, i, err = 0; /* * Prepare IO-s and fork the victim binary */ if (pipe(p_in) || pipe(p_out) || pipe(p_err)) { perror("Can't make pipe"); return -1; } pid = vfork(); if (pid == 0) { close(p_in[1]); dup2(p_in[0], 0); close(p_in[0]); close(p_out[0]); dup2(p_out[1], 1); close(p_out[1]); close(p_err[0]); dup2(p_err[1], 2); close(p_err[1]); execl("./victim", "victim", NULL); exit(1); } close(p_in[0]); close(p_out[1]); close(p_err[1]); /* * Tell the little guy some numbers */ i = 1; if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) return 1; i = 42; if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) return 1; printf("Checking the victim alive\n"); err = chk(p_out[0], 1); if (err) return 1; err = chk(p_out[0], 42); if (err) return 1; /* * Now do the infection with parasite.c */ printf("Infecting the victim\n"); if (do_infection(pid)) return 1; /* * Tell the victim some more stuff to check it's alive */ i = 1234; if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) return 1; i = 4096; if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) return 1; /* * Stop the victim and check the infection went well */ printf("Closing victim stdin\n"); close(p_in[1]); printf("Waiting for victim to die\n"); wait(NULL); printf("Checking the result\n"); /* These 
two came from parasite */ err = chk(p_out[0], 138); err |= chk(p_out[0], 403); /* These two came from post-infect */ err |= chk(p_out[0], 1234); err |= chk(p_out[0], 4096); if (!err) printf("All OK\n"); else printf("Something went WRONG\n"); return 0; } crac-criu-1.5.0/compel/test/infect/victim.c000066400000000000000000000003121471504326700205370ustar00rootroot00000000000000#include int main(int argc, char **argv) { int i; while (1) { if (read(0, &i, sizeof(i)) != sizeof(i)) break; if (write(1, &i, sizeof(i)) != sizeof(i)) break; } return 0; } crac-criu-1.5.0/compel/test/rsys/000077500000000000000000000000001471504326700166345ustar00rootroot00000000000000crac-criu-1.5.0/compel/test/rsys/.gitignore000066400000000000000000000000131471504326700206160ustar00rootroot00000000000000spy victim crac-criu-1.5.0/compel/test/rsys/Makefile000066400000000000000000000004531471504326700202760ustar00rootroot00000000000000CC := gcc CFLAGS ?= -O2 -g -Wall -Werror COMPEL := ../../../compel/compel-host all: victim spy run: ./spy .PHONY: run clean: rm -f victim rm -f spy victim: victim.c $(CC) $(CFLAGS) -o $@ $^ spy: spy.c $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $^ $(shell $(COMPEL) --static libs) crac-criu-1.5.0/compel/test/rsys/spy.c000066400000000000000000000057761471504326700176320ustar00rootroot00000000000000#include #include #include #include #include #include #include static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) { printf("\tLC%u: ", lvl); vprintf(fmt, parms); } static int do_rsetsid(int pid) { #define err_and_ret(msg) \ do { \ fprintf(stderr, msg); \ return -1; \ } while (0) int state; long ret; struct parasite_ctl *ctl; compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); printf("Stopping task\n"); state = compel_stop_task(pid); if (state < 0) err_and_ret("Can't stop task"); printf("Preparing parasite ctl\n"); ctl = compel_prepare(pid); if (!ctl) err_and_ret("Can't prepare for infection"); ret = -1000; if (compel_syscall(ctl, __NR_getpid, &ret, 0, 
0, 0, 0, 0, 0) < 0) err_and_ret("Can't run rgetpid"); printf("Remote getpid returned %ld\n", ret); if (ret != pid) err_and_ret("Pid mismatch!"); ret = -1000; if (compel_syscall(ctl, __NR_setsid, &ret, 0, 0, 0, 0, 0, 0) < 0) err_and_ret("Can't run rsetsid"); printf("Remote setsid returned %ld\n", ret); /* * Done. Cure and resume the task. */ printf("Curing\n"); if (compel_cure(ctl)) err_and_ret("Can't cure victim"); if (compel_resume_task(pid, state, state)) err_and_ret("Can't unseize task"); printf("Done\n"); return 0; } static inline int chk(int fd, int val) { int v = 0; if (read(fd, &v, sizeof(v)) != sizeof(v)) { fprintf(stderr, "read failed\n"); } printf("%d, want %d\n", v, val); return v == val; } int main(int argc, char **argv) { int p_in[2], p_out[2], p_err[2], pid, i, pass = 1, sid; /* * Prepare IO-s and fork the victim binary */ if (pipe(p_in) || pipe(p_out) || pipe(p_err)) { perror("Can't make pipe"); return -1; } pid = vfork(); if (pid == 0) { close(p_in[1]); dup2(p_in[0], 0); close(p_in[0]); close(p_out[0]); dup2(p_out[1], 1); close(p_out[1]); close(p_err[0]); dup2(p_err[1], 2); close(p_err[1]); execl("./victim", "victim", NULL); exit(1); } close(p_in[0]); close(p_out[1]); close(p_err[1]); sid = getsid(0); /* * Kick the victim once */ i = 0; if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) { fprintf(stderr, "write to pipe failed\n"); return -1; } printf("Checking the victim session to be %d\n", sid); pass = chk(p_out[0], sid); if (!pass) return 1; /* * Now do the infection with parasite.c */ printf("Setsid() the victim\n"); if (do_rsetsid(pid)) return 1; /* * Kick the victim again so it tells new session */ if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) { fprintf(stderr, "write to pipe failed\n"); return -1; } /* * Stop the victim and check the intrusion went well */ printf("Closing victim stdin\n"); close(p_in[1]); printf("Waiting for victim to die\n"); wait(NULL); printf("Checking the new session to be %d\n", pid); pass = chk(p_out[0], pid); if (pass) 
printf("All OK\n"); else printf("Something went WRONG\n"); return 0; } crac-criu-1.5.0/compel/test/rsys/victim.c000066400000000000000000000003331471504326700202720ustar00rootroot00000000000000#include int main(int argc, char **argv) { int i; while (1) { if (read(0, &i, sizeof(i)) != sizeof(i)) break; i = getsid(0); if (write(1, &i, sizeof(i)) != sizeof(i)) break; } return 0; } crac-criu-1.5.0/compel/test/stack/000077500000000000000000000000001471504326700167415ustar00rootroot00000000000000crac-criu-1.5.0/compel/test/stack/.gitignore000066400000000000000000000000421471504326700207250ustar00rootroot00000000000000parasite.h parasite.po spy victim crac-criu-1.5.0/compel/test/stack/Makefile000066400000000000000000000011151471504326700203770ustar00rootroot00000000000000CC := gcc CFLAGS ?= -O2 -g -Wall -Werror COMPEL := ../../../compel/compel-host all: victim spy run: ./spy .PHONY: run clean: rm -f victim rm -f spy rm -f parasite.h rm -f parasite.po rm -f parasite.o victim: victim.c $(CC) $(CFLAGS) -o $@ $^ spy: spy.c parasite.h $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) parasite.h: parasite.po $(COMPEL) hgen -o $@ -f $< parasite.po: parasite.o ld $(shell $(COMPEL) ldflags) -o $@ $^ $(shell $(COMPEL) plugins) parasite.o: parasite.c $(CC) $(CFLAGS) -c $(shell $(COMPEL) cflags) -o $@ $^ crac-criu-1.5.0/compel/test/stack/parasite.c000066400000000000000000000010701471504326700207130ustar00rootroot00000000000000#include #include #include /* * Stubs for std compel plugin. 
*/ int parasite_trap_cmd(int cmd, void *args) { return 0; } void parasite_cleanup(void) { } #define PARASITE_CMD_INC PARASITE_USER_CMDS #define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 int parasite_daemon_cmd(int cmd, void *args) { int v; switch (cmd) { case PARASITE_CMD_INC: v = (*(int *)args) + 1; break; case PARASITE_CMD_DEC: v = (*(int *)args) - 1; break; default: v = -1; break; } sys_write(1, &v, sizeof(int)); return 0; } crac-criu-1.5.0/compel/test/stack/spy.c000066400000000000000000000210311471504326700177150ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "parasite.h" #define PARASITE_CMD_INC PARASITE_USER_CMDS #define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 #define err_and_ret(msg) \ do { \ fprintf(stderr, msg); \ return -1; \ } while (0) void *saved_data = NULL; #define SAVED_DATA_MAX page_size() void cleanup_saved_data(void) { free(saved_data); } static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) { printf("\tLC%u: ", lvl); vprintf(fmt, parms); } static void *get_parasite_rstack_start(struct parasite_ctl *ctl) { void *rstack, *r_thread_stack, *rstack_start; compel_get_stack(ctl, &rstack, &r_thread_stack); rstack_start = rstack; if (r_thread_stack != NULL && r_thread_stack < rstack_start) rstack_start = r_thread_stack; return rstack_start; } static int page_writable(struct parasite_ctl *ctl, int pid, void *page) { FILE *maps; size_t maps_line_len = 0; char *maps_line = NULL; char victim_maps_path[6 + 11 + 5 + 1]; int written; int ret = 0; if (((uintptr_t)page & (page_size() - 1)) != 0) { fprintf(stderr, "Page address not aligned\n"); ret = -1; goto done; } written = snprintf(victim_maps_path, sizeof(victim_maps_path), "/proc/%d/maps", pid); if (written < 0 || written >= sizeof(victim_maps_path)) { fprintf(stderr, "Failed to create path string to victim's /proc/%d/maps file\n", pid); ret = -1; goto done; } maps = fopen(victim_maps_path, "r"); if (maps == 
NULL) { perror("Can't open victim's /proc/$pid/maps"); ret = -1; goto done; } while (getline(&maps_line, &maps_line_len, maps) != -1) { unsigned long vmstart, vmend; char r, w; if (sscanf(maps_line, "%lx-%lx %c%c", &vmstart, &vmend, &r, &w) < 4) { fprintf(stderr, "Can't parse victim's /proc/%d/maps; line: %s\n", pid, maps_line); ret = -1; goto free_linebuf; } if (page >= (void *)vmstart && page < (void *)vmend) { if (w == 'w') { if (r != 'r') { fprintf(stderr, "Expecting writable memory to also be readable"); ret = -1; goto free_linebuf; } ret = 1; } break; } } if (errno) { perror("Can't read victim's /proc/$pid/maps"); ret = -1; } free_linebuf: free(maps_line); fclose(maps); done: return ret; } static void *read_proc_mem(int pid, void *offset, size_t len) { char victim_mem_path[6 + 11 + 4 + 1]; int written; int fd; void *data; ssize_t mem_read; written = snprintf(victim_mem_path, sizeof(victim_mem_path), "/proc/%d/mem", pid); if (written < 0 || written >= sizeof(victim_mem_path)) { fprintf(stderr, "Failed to create path string to victim's /proc/%d/mem file\n", pid); return NULL; } fd = open(victim_mem_path, O_RDONLY); if (fd < 0) { perror("Failed to open victim's /proc/$pid/mem file"); return NULL; } data = malloc(len); if (data == NULL) { perror("Can't allocate memory to read victim's /proc/$pid/mem file"); return NULL; } mem_read = pread(fd, data, len, (off_t)offset); if (mem_read == -1) { perror("Failed to read victim's /proc/$pid/mem file"); goto freebuf; } return data; freebuf: free(data); return NULL; } static int save_data_near_stack(struct parasite_ctl *ctl, int pid, void *stack, void **saved_data, size_t *saved_data_size) { size_t page_mask = page_size() - 1; size_t saved_size = 0; size_t stack_size_last_page = (uintptr_t)stack & page_mask; void *next_page = stack; if (stack_size_last_page != 0) { size_t empty_space_last_page = page_size() - stack_size_last_page; saved_size = min(empty_space_last_page, (size_t)SAVED_DATA_MAX); next_page += page_size() - 
stack_size_last_page; } while (saved_size < SAVED_DATA_MAX && next_page != NULL) { switch (page_writable(ctl, pid, next_page)) { case 1: saved_size = min((size_t)(saved_size + page_size()), (size_t)SAVED_DATA_MAX); next_page += page_size(); break; case 0: next_page = NULL; break; default: return -1; } } if (saved_size > 0) { void *sd; sd = read_proc_mem(pid, stack, saved_size); if (sd == NULL) return -1; *saved_data = sd; } else { *saved_data = NULL; } *saved_data_size = saved_size; return 0; } static int check_saved_data(struct parasite_ctl *ctl, int pid, void *stack, void *saved_data, size_t saved_data_size) { if (saved_data != NULL) { void *current_data; current_data = read_proc_mem(pid, stack, saved_data_size); if (current_data == NULL) return -1; if (memcmp(saved_data, current_data, saved_data_size) != 0) return 1; } return 0; } static int do_infection(int pid) { int state; struct parasite_ctl *ctl; struct infect_ctx *ictx; int *arg; void *stack; size_t saved_data_size; int saved_data_check; compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); printf("Stopping task\n"); state = compel_stop_task(pid); if (state < 0) err_and_ret("Can't stop task\n"); printf("Preparing parasite ctl\n"); ctl = compel_prepare(pid); if (!ctl) err_and_ret("Can't prepare for infection\n"); printf("Configuring contexts\n"); /* * First -- the infection context. Most of the stuff * is already filled by compel_prepare(), just set the * log descriptor for parasite side, library cannot * live w/o it. 
*/ ictx = compel_infect_ctx(ctl); ictx->log_fd = STDERR_FILENO; parasite_setup_c_header(ctl); printf("Infecting\n"); if (compel_infect_no_daemon(ctl, 1, sizeof(int))) err_and_ret("Can't infect victim\n"); if (atexit(cleanup_saved_data)) err_and_ret("Can't register cleanup function with atexit\n"); stack = get_parasite_rstack_start(ctl); if (save_data_near_stack(ctl, pid, stack, &saved_data, &saved_data_size)) err_and_ret("Can't save data above stack\n"); if (compel_start_daemon(ctl)) err_and_ret("Can't start daemon in victim\n"); /* * Now get the area with arguments and run two * commands one by one. */ arg = compel_parasite_args(ctl, int); printf("Running cmd 1\n"); *arg = 137; if (compel_rpc_call_sync(PARASITE_CMD_INC, ctl)) err_and_ret("Can't run parasite command 1\n"); printf("Running cmd 2\n"); *arg = 404; if (compel_rpc_call_sync(PARASITE_CMD_DEC, ctl)) err_and_ret("Can't run parasite command 2\n"); saved_data_check = check_saved_data(ctl, pid, stack, saved_data, saved_data_size); if (saved_data_check == -1) err_and_ret("Could not check saved data\n"); if (saved_data_check != 0) err_and_ret("Saved data unexpectedly modified\n"); /* * Done. Cure and resume the task. 
*/ printf("Curing\n"); if (compel_cure(ctl)) err_and_ret("Can't cure victim\n"); if (compel_resume_task(pid, state, state)) err_and_ret("Can't unseize task\n"); printf("Done\n"); return 0; } static inline int chk(int fd, int val) { int v = 0; if (read(fd, &v, sizeof(v)) != sizeof(v)) return 1; printf("%d, want %d\n", v, val); return v != val; } int main(int argc, char **argv) { int p_in[2], p_out[2], p_err[2], pid, i, err = 0; /* * Prepare IO-s and fork the victim binary */ if (pipe(p_in) || pipe(p_out) || pipe(p_err)) { perror("Can't make pipe"); return -1; } pid = vfork(); if (pid == 0) { close(p_in[1]); dup2(p_in[0], 0); close(p_in[0]); close(p_out[0]); dup2(p_out[1], 1); close(p_out[1]); close(p_err[0]); dup2(p_err[1], 2); close(p_err[1]); execl("./victim", "victim", NULL); exit(1); } close(p_in[0]); close(p_out[1]); close(p_err[1]); /* * Tell the little guy some numbers */ i = 1; if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) return 1; i = 42; if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) return 1; printf("Checking the victim alive\n"); err = chk(p_out[0], 1); if (err) return 1; err = chk(p_out[0], 42); if (err) return 1; /* * Now do the infection with parasite.c */ printf("Infecting the victim\n"); if (do_infection(pid)) return 1; /* * Tell the victim some more stuff to check it's alive */ i = 1234; if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) return 1; i = 4096; if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) return 1; /* * Stop the victim and check the infection went well */ printf("Closing victim stdin\n"); close(p_in[1]); printf("Waiting for victim to die\n"); wait(NULL); printf("Checking the result\n"); /* These two came from parasite */ err = chk(p_out[0], 138); err |= chk(p_out[0], 403); /* These two came from post-infect */ err |= chk(p_out[0], 1234); err |= chk(p_out[0], 4096); if (!err) printf("All OK\n"); else printf("Something went WRONG\n"); return 0; } 
crac-criu-1.5.0/compel/test/stack/victim.c000066400000000000000000000003121471504326700203740ustar00rootroot00000000000000#include int main(int argc, char **argv) { int i; while (1) { if (read(0, &i, sizeof(i)) != sizeof(i)) break; if (write(1, &i, sizeof(i)) != sizeof(i)) break; } return 0; } crac-criu-1.5.0/contrib/000077500000000000000000000000001471504326700150365ustar00rootroot00000000000000crac-criu-1.5.0/contrib/debian/000077500000000000000000000000001471504326700162605ustar00rootroot00000000000000crac-criu-1.5.0/contrib/debian/dev-packages.lst000066400000000000000000000005131471504326700213350ustar00rootroot00000000000000# Required packages for development in Debian build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev # Extra packages, required for testing and building other tools pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev python3-yaml libnl-route-3-dev crac-criu-1.5.0/contrib/docker_cr.sh000077500000000000000000000273771471504326700173500ustar00rootroot00000000000000#!/bin/bash # # A convenience shell script to call criu for checkpointing and restoring # a Docker container. # # This script saves the user from having to remember all the command # line options, some of which are very long. Note that once Docker # has native support for checkpoint and restore, there will no longer # be a need for this particular shell script. # set -o errexit set -o nounset set -o pipefail # # These can be set in the environment to override their defaults. # Note that while the default value of CRIU_IMG_DIR in this script # is a directory in DOCKER_HOME, it doesn't have to be tied to # DOCKER_HOME. For example, it can be /var/spool/criu_img. # : ${DOCKER_HOME=/var/lib/docker} : ${DOCKER_BINARY=docker} : ${CRIU_IMG_DIR=${DOCKER_HOME}/criu_img} : ${CRIU_BINARY=criu} : ${DOCKERINIT_BINARY=} # # Patterns for different filesystem types in dump.log. 
# readonly AUFS_PATTERN='/sys/fs/aufs/si_' readonly OVERLAYFS_PATTERN='type.*source.*options.*lowerdir=.*upperdir=.*workdir=' readonly UNIONFS_PATTERN='type.*source.*options.*dirs=' # # These globals will be set by init_container_vars() # declare CID declare CONTAINER_IMG_DIR declare CONTAINER_DUMP_LOG declare -A BIND_MOUNT BIND_MOUNT[/etc/resolv.conf]=.ResolvConfPath BIND_MOUNT[/etc/hosts]=.HostsPath BIND_MOUNT[/etc/hostname]=.HostnamePath MOUNT_MAP_ARGS=() # # The default mode is non-verbose, printing only a short message # saying if the command succeeded or failed. For the verbose mode, # we could have used set -o xtrace but this option would have # generated excessive output suitable for debugging, not normal # usage. So we set ${ECHO} to echo in the verbose mode to print # selected messages. # VERBOSE="" ECHO=":" CMD="" PGNAME=$(basename "$0") usage() { local rv=0 if [[ -n "${1-}" ]]; then rv=1 echo -e "${PGNAME}: $1\n" >&2 fi cat <] -c, --checkpoint checkpoint container -h, --help print help message -r, --restore restore container -v, --verbose enable verbose mode Environment: DOCKER_HOME (default ${DOCKER_HOME}) CRIU_IMG_DIR (default ${CRIU_IMG_DIR}) DOCKER_BINARY (default ${DOCKER_BINARY}) DOCKERINIT_BINARY (default \${DOCKER_HOME}/init/dockerinit--dev) CRIU_BINARY (default ${CRIU_BINARY}) EOF exit ${rv} } # # If the user has not specified a bind mount file for the container's # /.dockerinit, try to determine it from the Docker version. # find_dockerinit() { local v if [[ -z "${DOCKERINIT_BINARY}" ]]; then v=$("${DOCKER_BINARY}" --version | sed -e 's/.*version \(.*\),.*/\1/') DOCKERINIT_BINARY="${DOCKER_HOME}/init/dockerinit-${v}" elif [[ "${DOCKERINIT_BINARY}" != /* ]]; then DOCKERINIT_BINARY="${DOCKER_HOME}/init/${DOCKERINIT_BINARY}" fi if [[ ! 
-x "${DOCKERINIT_BINARY}" ]]; then echo "${DOCKERINIT_BINARY} does not exist" exit 1 fi BIND_MOUNT[/.dockerinit]="${DOCKERINIT_BINARY}" } parse_args() { local args local flags args=$(getopt --options 'chrv' \ --longoptions 'checkpoint help restore verbose' -- "$@") [[ $? == 0 ]] || usage eval set -- "${args}" while :; do arg="${1}" shift case "${arg}" in -c|--checkpoint) CMD="dump" ;; -h|--help) usage ;; -r|--restore) CMD="restore" ;; -v|--verbose) VERBOSE="-v"; ECHO="echo" ;; --) break ;; *) usage "internal error parsing arguments!" ;; esac done [[ "${CMD}" == "" ]] && usage "need either -c or -r" [[ $# -gt 1 ]] && usage "$# too many arguments" # if no container id in args, prompt the user if [[ $# -eq 1 ]]; then CID="$1" else if [[ "${CMD}" == "dump" ]]; then flags="" else # we need -a only for restore flags="-a" fi "${DOCKER_BINARY}" ps ${flags} read -rp $'\nContainer ID: ' CID fi } execute() { # since commands are pretty long and can wrap around # several lines, print a blank line to make it visually # easier to see ${ECHO} -e "\n$*" "$@" } init_container_vars() { local d CID=$(get_container_conf .Id) d=$("${DOCKER_BINARY}" info 2> /dev/null | awk '/Storage Driver:/ { print $3 }') if [[ "${d}" == "vfs" ]]; then CONTAINER_ROOT_DIR="${DOCKER_HOME}/${d}/dir/${CID}" elif [[ "${d}" == "aufs" || "${d}" == "unionfs" ]]; then CONTAINER_ROOT_DIR="${DOCKER_HOME}/${d}/mnt/${CID}" elif [[ "${d}" == "overlay" ]]; then CONTAINER_ROOT_DIR="${DOCKER_HOME}/${d}/${CID}/merged" else echo "${d}: unknown filesystem type" return 1 fi CONTAINER_IMG_DIR="${CRIU_IMG_DIR}/${CID}" CONTAINER_DUMP_LOG="${CONTAINER_IMG_DIR}/dump.log" } get_container_conf() { local val val=$("${DOCKER_BINARY}" inspect --format "{{$1}}" "${CID}") [[ "${val}" == "" ]] && exit 1 echo "${val//}" } setup_mount_map() { local key if [[ "$1" == "dump" ]]; then for key in "${!BIND_MOUNT[@]}"; do MOUNT_MAP_ARGS+=(--ext-mount-map "${key}:${key}") done else for key in "${!BIND_MOUNT[@]}"; do if [[ "${key}" == 
"/.dockerinit" ]]; then MOUNT_MAP_ARGS+=("--ext-mount-map" "${key}:${BIND_MOUNT[$key]}") else MOUNT_MAP_ARGS+=("--ext-mount-map" "${key}:$(get_container_conf "${BIND_MOUNT[$key]}")") fi done fi } fs_mounted() { if grep -wq "$1" /proc/self/mountinfo; then ${ECHO} "container root directory already mounted" return 0 fi ${ECHO} "container root directory not mounted" return 1 } # # Pretty print the mount command in verbose mode by putting each branch # pathname on a single line for easier visual inspection. # pp_mount() { ${ECHO} -e "\nmount -t $1 -o" ${ECHO} "${2}" | tr ':,' '\n' ${ECHO} "${3}" ${ECHO} "${4}" } # # Reconstruct the AUFS filesystem from information in CRIU's dump log. # The dump log has a series of branch entries for each process in the # entire process tree in the following form: # # (00.014075) /sys/fs/aufs/si_f598876b0855b883/br0 : /var/lib/docker/aufs/diff/ # # Note that this script assumes that all processes in the process # tree have the same AUFS filesystem. This assumption is fairly # safe for typical Docker containers. 
# setup_aufs() { local -r tmpf="${CONTAINER_IMG_DIR}/aufs.br" local br local branches # nothing to do if filesystem already mounted fs_mounted "${CONTAINER_ROOT_DIR}" && return # create a temporary file with branches listed in # ascending order (line 1 is branch 0) awk '/aufs.si_/ { print $2, $4 }' "${CONTAINER_DUMP_LOG}" | \ sort | uniq | awk '{ print $2 }' > "${tmpf}" # construct the mount option string from branches branches="" while read br; do branches+="${branches:+:}${br}" done < "${tmpf}" # mount the container's filesystem pp_mount "aufs" "${branches}" "none" "${CONTAINER_ROOT_DIR}" mount -t aufs -o br="${branches}" none "${CONTAINER_ROOT_DIR}" rm -f "${tmpf}" } setup_overlayfs() { local lowerdir local upperdir local workdir local ovlydirs local -r f="${CONTAINER_DUMP_LOG}" # nothing to do if filesystem already mounted fs_mounted "${CONTAINER_ROOT_DIR}" && return lowerdir=$(grep "${OVERLAYFS_PATTERN}" "${f}" | sed -n -e 's/.*lowerdir=\([^,]*\).*/\1/p') upperdir=$(grep "${OVERLAYFS_PATTERN}" "${f}" | sed -n -e 's/.*upperdir=\([^,]*\).*/\1/p') workdir=$(grep "${OVERLAYFS_PATTERN}" "${f}" | sed -n -e 's/.*workdir=\([^,]*\).*/\1/p') ovlydirs="lowerdir=${lowerdir},upperdir=${upperdir},workdir=${workdir}" # mount the container's filesystem pp_mount "overlay" "${ovlydirs}" "overlay" "${CONTAINER_ROOT_DIR}" mount -t overlay -o "${ovlydirs}" overlay "${CONTAINER_ROOT_DIR}" } # # Reconstruct the UnionFS filesystem from information in CRIU's dump log. # The dump log has the mountinfo root entry for the filesystem. The # options field contains the list of directories that make up the UnionFS. # # Note that this script assumes that all processes in the process # tree have the same UnionFS filesystem. This assumption is fairly # safe for typical Docker containers. # # XXX If /dev/null was manually created by Docker (i.e., it's not in # a branch), create it. 
Although this has worked so far, it needs # a deeper look as I am not sure if /dev/null should be created as # a regular file to be the target of a bind mount or created as a # device file by mknod. # setup_unionfs() { local dirs # nothing to do if filesystem already mounted fs_mounted "${CONTAINER_ROOT_DIR}" && return dirs=$(sed -n -e 's/.*type.*dirs=/dirs=/p' "${CONTAINER_DUMP_LOG}") [[ "${dirs}" = "" ]] && echo "do not have branch information" && exit 1 # mount the container's filesystem pp_mount "unionfs" "${dirs}" "none" "${CONTAINER_ROOT_DIR}" mount -t unionfs -o "${dirs}" none "${CONTAINER_ROOT_DIR}" # see comment at the beginning of the function if [[ ! -e "${CONTAINER_ROOT_DIR}/dev/null" ]]; then execute touch "${CONTAINER_ROOT_DIR}/dev/null" fi } prep_dump() { local pid pid=$(get_container_conf .State.Pid) # docker returns 0 for containers it thinks have exited # (i.e., dumping a restored container again) if [[ ${pid} -eq 0 ]]; then echo -e "\nCheckpointing a restored container?" read -p "Process ID: " pid fi # remove files previously created by criu but not others files (if any) mkdir -p "${CONTAINER_IMG_DIR}" rm -f "${CONTAINER_IMG_DIR}"/*.{img,log,pid} "${CONTAINER_IMG_DIR}"/stats-restore CMD_ARGS=("-t" "${pid}") # we need --root only for aufs to compensate for the # erroneous information in /proc//map_files if [[ "${CONTAINER_ROOT_DIR}" == *aufs* ]]; then CMD_ARGS+=("--root" "${CONTAINER_ROOT_DIR}") fi } # # Set up container's root filesystem if not already set up. # prep_restore() { local -r f="${CONTAINER_DUMP_LOG}" if [[ ! -f "${f}" ]]; then echo "${f} does not exist" return 1 fi if grep -q "${AUFS_PATTERN}" "${f}"; then setup_aufs elif grep -q "${OVERLAYFS_PATTERN}" "${f}"; then setup_overlayfs elif grep -q "${UNIONFS_PATTERN}" "${f}"; then setup_unionfs fi # criu requires this (due to container using pivot_root) if ! 
grep -qw "${CONTAINER_ROOT_DIR}" /proc/self/mountinfo; then execute mount --rbind "${CONTAINER_ROOT_DIR}" "${CONTAINER_ROOT_DIR}" MOUNTED=1 else MOUNTED=0 fi CMD_ARGS=("-d" "--root" "${CONTAINER_ROOT_DIR}" "--pidfile" "${CONTAINER_IMG_DIR}/restore.pid") } # # Since this function produces output string (either in the # verbose mode or from ${CRIU_BINARY}), we set the return value # in parameter 1. # run_criu() { local -a common_args=("-v4" "-D" "${CONTAINER_IMG_DIR}" \ "-o" "${CMD}.log" \ "--manage-cgroups" \ "--evasive-devices") setup_mount_map "${CMD}" common_args+=("${MOUNT_MAP_ARGS[@]}") # we do not want to exit if there's an error execute "${CRIU_BINARY}" "${CMD}" "${common_args[@]}" "${CMD_ARGS[@]}" } wrap_up() { local -r logf="${CONTAINER_IMG_DIR}/${CMD}.log" local -r pidf="${CONTAINER_IMG_DIR}/restore.pid" if [[ $1 -eq 0 ]]; then ${ECHO} -e "\n" echo "${CMD} successful" else ${ECHO} -e "\n" echo "${CMD} failed" fi if [[ "${VERBOSE}" == "-v" && -e "${logf}" ]]; then if ! grep "finished successfully" "${logf}"; then grep Error "${logf}" fi fi if [[ "${CMD}" == "restore" ]]; then if [[ ${MOUNTED} -eq 1 ]]; then execute umount "${CONTAINER_ROOT_DIR}" fi if [[ -e "${pidf}" ]]; then ${ECHO} -e "\n$(ps -f -p "$(cat "${pidf}")" --no-headers)" fi fi } resolve_path() { local p p="${2}" if which realpath > /dev/null; then p=$(realpath "${p}") fi ${ECHO} "${1}: ${p}" } resolve_cmd() { local cpath cpath=$(which "${2}") resolve_path "${1}" "${cpath}" } main() { local rv=0 if [[ $(id -u) -ne 0 ]]; then echo "not running as root" exit 1 fi parse_args "$@" find_dockerinit init_container_vars if [[ "${VERBOSE}" == "-v" ]]; then echo resolve_cmd "docker binary" "${DOCKER_BINARY}" resolve_cmd "dockerinit binary" "${DOCKERINIT_BINARY}" resolve_cmd "criu binary" "${CRIU_BINARY}" resolve_path "image directory" "${CONTAINER_IMG_DIR}" resolve_path "container root directory" "${CONTAINER_ROOT_DIR}" fi if [[ "${CMD}" == "dump" ]]; then prep_dump else prep_restore fi run_criu || rv=$? 
wrap_up ${rv} exit ${rv} } main "$@" crac-criu-1.5.0/coredump/000077500000000000000000000000001471504326700152145ustar00rootroot00000000000000crac-criu-1.5.0/coredump/coredump000077500000000000000000000024651471504326700167670ustar00rootroot00000000000000#!/usr/bin/env python3 import argparse import os import sys import criu_coredump def coredump(opts): generator = criu_coredump.coredump_generator() cores = generator(os.path.realpath(opts['in'])) for pid in cores: if opts['pid'] and pid != opts['pid']: continue with open(os.path.realpath(opts['out']) + "/core." + str(pid), 'wb+') as f: cores[pid].write(f) def main(): desc = 'CRIU core dump' parser = argparse.ArgumentParser(description=desc, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument('-i', '--in', default='.', help='directory where to get images from') parser.add_argument('-p', '--pid', type=int, help='generate coredump for specific pid(all pids py default)') parser.add_argument('-o', '--out', default='.', help='directory to write coredumps to') opts = vars(parser.parse_args()) try: coredump(opts) except SystemExit as error: print('ERROR: %s' % error) print('Exiting') sys.exit(1) if __name__ == '__main__': main() crac-criu-1.5.0/coredump/criu_coredump/000077500000000000000000000000001471504326700200545ustar00rootroot00000000000000crac-criu-1.5.0/coredump/criu_coredump/.gitignore000066400000000000000000000000061471504326700220400ustar00rootroot00000000000000*.pyc crac-criu-1.5.0/coredump/criu_coredump/__init__.py000066400000000000000000000000511471504326700221610ustar00rootroot00000000000000from .coredump import coredump_generator crac-criu-1.5.0/coredump/criu_coredump/coredump.py000066400000000000000000000630541471504326700222540ustar00rootroot00000000000000# Functions and classes for creating core dump from criu images. # Code is inspired by outdated google coredumper(RIP) [1] and # fs/binfmt_elf.h from Linux kernel [2]. 
# # [1] https://code.google.com/p/google-coredumper/ # probably already dead, so consider trying: # https://github.com/efiop/google-coredumper/ # [2] https://www.kernel.org/ # # On my x86_64 systems with fresh kernel ~3.17 core dump looks like: # # 1) Elf file header; # 2) PT_NOTE program header describing notes section; # 3) PT_LOAD program headers for (almost?) each vma; # 4) NT_PRPSINFO note with elf_prpsinfo inside; # 5) An array of notes for each thread of the process: # NT_PRSTATUS note with elf_prstatus inside; # NT_FPREGSET note with elf_fpregset inside; # NT_X86_XSTATE note with x86 extended state using xsave; # NT_SIGINFO note with siginfo_t inside; # 6) NT_AUXV note with auxv; # 7) NT_FILE note with mapped files; # 8) VMAs themselves; # # Or, you can represent it in less details as: # 1) Elf file header; # 2) Program table; # 3) Notes; # 4) VMAs contents; # import io import sys import ctypes from pycriu import images from . import elf # Some memory-related constants PAGESIZE = 4096 status = { "VMA_AREA_NONE": 0 << 0, "VMA_AREA_REGULAR": 1 << 0, "VMA_AREA_STACK": 1 << 1, "VMA_AREA_VSYSCALL": 1 << 2, "VMA_AREA_VDSO": 1 << 3, "VMA_FORCE_READ": 1 << 4, "VMA_AREA_HEAP": 1 << 5, "VMA_FILE_PRIVATE": 1 << 6, "VMA_FILE_SHARED": 1 << 7, "VMA_ANON_SHARED": 1 << 8, "VMA_ANON_PRIVATE": 1 << 9, "VMA_AREA_SYSVIPC": 1 << 10, "VMA_AREA_SOCKET": 1 << 11, "VMA_AREA_VVAR": 1 << 12, "VMA_AREA_AIORING": 1 << 13, "VMA_AREA_MEMFD": 1 << 14, "VMA_AREA_UNSUPP": 1 << 31 } prot = {"PROT_READ": 0x1, "PROT_WRITE": 0x2, "PROT_EXEC": 0x4} class elf_note: nhdr = None # Elf_Nhdr; owner = None # i.e. CORE or LINUX; data = None # Ctypes structure with note data; class coredump: """ A class to keep elf core dump components inside and functions to properly write them to file. 
""" ehdr = None # Elf ehdr; phdrs = [] # Array of Phdrs; notes = [] # Array of elf_notes; vmas = [] # Array of BytesIO with memory content; # FIXME keeping all vmas in memory is a bad idea; def write(self, f): """ Write core dump to file f. """ buf = io.BytesIO() buf.write(self.ehdr) for phdr in self.phdrs: buf.write(phdr) for note in self.notes: buf.write(note.nhdr) buf.write(note.owner) buf.write(b"\0" * (8 - len(note.owner))) buf.write(note.data) offset = ctypes.sizeof(elf.Elf64_Ehdr()) offset += (len(self.vmas) + 1) * ctypes.sizeof(elf.Elf64_Phdr()) filesz = 0 for note in self.notes: filesz += ctypes.sizeof(note.nhdr) + ctypes.sizeof(note.data) + 8 note_align = PAGESIZE - ((offset + filesz) % PAGESIZE) if note_align == PAGESIZE: note_align = 0 if note_align != 0: scratch = (ctypes.c_char * note_align)() ctypes.memset(ctypes.addressof(scratch), 0, ctypes.sizeof(scratch)) buf.write(scratch) for vma in self.vmas: buf.write(vma.data) buf.seek(0) f.write(buf.read()) class coredump_generator: """ Generate core dump from criu images. """ coredumps = {} # coredumps by pid; pstree = {} # process info by pid; cores = {} # cores by pid; mms = {} # mm by pid; reg_files = None # reg-files; pagemaps = {} # pagemap by pid; def _img_open_and_strip(self, name, single=False, pid=None): """ Load criu image and strip it from magic and redundant list. """ path = self._imgs_dir + "/" + name if pid: path += "-" + str(pid) path += ".img" with open(path, 'rb') as f: img = images.load(f) if single: return img["entries"][0] else: return img["entries"] def __call__(self, imgs_dir): """ Parse criu images stored in directory imgs_dir to fill core dumps. 
""" self._imgs_dir = imgs_dir pstree = self._img_open_and_strip("pstree") for p in pstree: pid = p['pid'] self.pstree[pid] = p for tid in p['threads']: self.cores[tid] = self._img_open_and_strip("core", True, tid) self.mms[pid] = self._img_open_and_strip("mm", True, pid) self.pagemaps[pid] = self._img_open_and_strip( "pagemap", False, pid) files = self._img_open_and_strip("files", False) self.reg_files = [x["reg"] for x in files if x["type"] == "REG"] for pid in self.pstree: self.coredumps[pid] = self._gen_coredump(pid) return self.coredumps def write(self, coredumps_dir, pid=None): """ Write core dumpt to cores_dir directory. Specify pid to choose core dump of only one process. """ for p in self.coredumps: if pid and p != pid: continue with open(coredumps_dir + "/" + "core." + str(p), 'wb+') as f: self.coredumps[p].write(f) def _gen_coredump(self, pid): """ Generate core dump for pid. """ cd = coredump() # Generate everything backwards so it is easier to calculate offset. cd.vmas = self._gen_vmas(pid) cd.notes = self._gen_notes(pid) cd.phdrs = self._gen_phdrs(pid, cd.notes, cd.vmas) cd.ehdr = self._gen_ehdr(pid, cd.phdrs) return cd def _gen_ehdr(self, pid, phdrs): """ Generate elf header for process pid with program headers phdrs. """ ehdr = elf.Elf64_Ehdr() ctypes.memset(ctypes.addressof(ehdr), 0, ctypes.sizeof(ehdr)) ehdr.e_ident[elf.EI_MAG0] = elf.ELFMAG0 ehdr.e_ident[elf.EI_MAG1] = elf.ELFMAG1 ehdr.e_ident[elf.EI_MAG2] = elf.ELFMAG2 ehdr.e_ident[elf.EI_MAG3] = elf.ELFMAG3 ehdr.e_ident[elf.EI_CLASS] = elf.ELFCLASS64 ehdr.e_ident[elf.EI_DATA] = elf.ELFDATA2LSB ehdr.e_ident[elf.EI_VERSION] = elf.EV_CURRENT ehdr.e_type = elf.ET_CORE ehdr.e_machine = elf.EM_X86_64 ehdr.e_version = elf.EV_CURRENT ehdr.e_phoff = ctypes.sizeof(elf.Elf64_Ehdr()) ehdr.e_ehsize = ctypes.sizeof(elf.Elf64_Ehdr()) ehdr.e_phentsize = ctypes.sizeof(elf.Elf64_Phdr()) # FIXME Case len(phdrs) > PN_XNUM should be handled properly. # See fs/binfmt_elf.c from linux kernel. 
ehdr.e_phnum = len(phdrs) return ehdr def _gen_phdrs(self, pid, notes, vmas): """ Generate program headers for process pid. """ phdrs = [] offset = ctypes.sizeof(elf.Elf64_Ehdr()) offset += (len(vmas) + 1) * ctypes.sizeof(elf.Elf64_Phdr()) filesz = 0 for note in notes: filesz += ctypes.sizeof(note.nhdr) + ctypes.sizeof(note.data) + 8 # PT_NOTE phdr = elf.Elf64_Phdr() ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr)) phdr.p_type = elf.PT_NOTE phdr.p_offset = offset phdr.p_filesz = filesz phdrs.append(phdr) note_align = PAGESIZE - ((offset + filesz) % PAGESIZE) if note_align == PAGESIZE: note_align = 0 offset += note_align # VMA phdrs for vma in vmas: offset += filesz filesz = vma.filesz phdr = elf.Elf64_Phdr() ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr)) phdr.p_type = elf.PT_LOAD phdr.p_align = PAGESIZE phdr.p_paddr = 0 phdr.p_offset = offset phdr.p_vaddr = vma.start phdr.p_memsz = vma.memsz phdr.p_filesz = vma.filesz phdr.p_flags = vma.flags phdrs.append(phdr) return phdrs def _gen_prpsinfo(self, pid): """ Generate NT_PRPSINFO note for process pid. """ pstree = self.pstree[pid] core = self.cores[pid] prpsinfo = elf.elf_prpsinfo() ctypes.memset(ctypes.addressof(prpsinfo), 0, ctypes.sizeof(prpsinfo)) # FIXME TASK_ALIVE means that it is either running or sleeping, need to # teach criu to distinguish them. TASK_ALIVE = 0x1 # XXX A bit of confusion here, as in ps "dead" and "zombie" # state are two separate states, and we use TASK_DEAD for zombies. TASK_DEAD = 0x2 TASK_STOPPED = 0x3 if core["tc"]["task_state"] == TASK_ALIVE: prpsinfo.pr_state = 0 if core["tc"]["task_state"] == TASK_DEAD: prpsinfo.pr_state = 4 if core["tc"]["task_state"] == TASK_STOPPED: prpsinfo.pr_state = 3 # Don't even ask me why it is so, just borrowed from linux # source and made pr_state match. prpsinfo.pr_sname = b'.' 
if prpsinfo.pr_state > 5 else b"RSDTZW" [ prpsinfo.pr_state] prpsinfo.pr_zomb = 1 if prpsinfo.pr_state == 4 else 0 prpsinfo.pr_nice = core["thread_core"][ "sched_prio"] if "sched_prio" in core["thread_core"] else 0 prpsinfo.pr_flag = core["tc"]["flags"] prpsinfo.pr_uid = core["thread_core"]["creds"]["uid"] prpsinfo.pr_gid = core["thread_core"]["creds"]["gid"] prpsinfo.pr_pid = pid prpsinfo.pr_ppid = pstree["ppid"] prpsinfo.pr_pgrp = pstree["pgid"] prpsinfo.pr_sid = pstree["sid"] # prpsinfo.pr_psargs has a limit of 80 characters which means it will # fail here if the cmdline is longer than 80 prpsinfo.pr_psargs = self._gen_cmdline(pid)[:80] prpsinfo.pr_fname = core["tc"]["comm"].encode() nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_prpsinfo()) nhdr.n_type = elf.NT_PRPSINFO note = elf_note() note.data = prpsinfo note.owner = b"CORE" note.nhdr = nhdr return note def _gen_prstatus(self, pid, tid): """ Generate NT_PRSTATUS note for thread tid of process pid. """ core = self.cores[tid] regs = core["thread_info"]["gpregs"] pstree = self.pstree[pid] prstatus = elf.elf_prstatus() ctypes.memset(ctypes.addressof(prstatus), 0, ctypes.sizeof(prstatus)) # FIXME setting only some of the fields for now. Revisit later. 
prstatus.pr_pid = tid prstatus.pr_ppid = pstree["ppid"] prstatus.pr_pgrp = pstree["pgid"] prstatus.pr_sid = pstree["sid"] prstatus.pr_reg.r15 = regs["r15"] prstatus.pr_reg.r14 = regs["r14"] prstatus.pr_reg.r13 = regs["r13"] prstatus.pr_reg.r12 = regs["r12"] prstatus.pr_reg.rbp = regs["bp"] prstatus.pr_reg.rbx = regs["bx"] prstatus.pr_reg.r11 = regs["r11"] prstatus.pr_reg.r10 = regs["r10"] prstatus.pr_reg.r9 = regs["r9"] prstatus.pr_reg.r8 = regs["r8"] prstatus.pr_reg.rax = regs["ax"] prstatus.pr_reg.rcx = regs["cx"] prstatus.pr_reg.rdx = regs["dx"] prstatus.pr_reg.rsi = regs["si"] prstatus.pr_reg.rdi = regs["di"] prstatus.pr_reg.orig_rax = regs["orig_ax"] prstatus.pr_reg.rip = regs["ip"] prstatus.pr_reg.cs = regs["cs"] prstatus.pr_reg.eflags = regs["flags"] prstatus.pr_reg.rsp = regs["sp"] prstatus.pr_reg.ss = regs["ss"] prstatus.pr_reg.fs_base = regs["fs_base"] prstatus.pr_reg.gs_base = regs["gs_base"] prstatus.pr_reg.ds = regs["ds"] prstatus.pr_reg.es = regs["es"] prstatus.pr_reg.fs = regs["fs"] prstatus.pr_reg.gs = regs["gs"] nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_prstatus()) nhdr.n_type = elf.NT_PRSTATUS note = elf_note() note.data = prstatus note.owner = b"CORE" note.nhdr = nhdr return note def _gen_fpregset(self, pid, tid): """ Generate NT_FPREGSET note for thread tid of process pid. 
""" core = self.cores[tid] regs = core["thread_info"]["fpregs"] fpregset = elf.elf_fpregset_t() ctypes.memset(ctypes.addressof(fpregset), 0, ctypes.sizeof(fpregset)) fpregset.cwd = regs["cwd"] fpregset.swd = regs["swd"] fpregset.ftw = regs["twd"] fpregset.fop = regs["fop"] fpregset.rip = regs["rip"] fpregset.rdp = regs["rdp"] fpregset.mxcsr = regs["mxcsr"] fpregset.mxcr_mask = regs["mxcsr_mask"] fpregset.st_space = (ctypes.c_uint * len(regs["st_space"]))( *regs["st_space"]) fpregset.xmm_space = (ctypes.c_uint * len(regs["xmm_space"]))( *regs["xmm_space"]) nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_fpregset_t()) nhdr.n_type = elf.NT_FPREGSET note = elf_note() note.data = fpregset note.owner = b"CORE" note.nhdr = nhdr return note def _gen_x86_xstate(self, pid, tid): """ Generate NT_X86_XSTATE note for thread tid of process pid. """ core = self.cores[tid] fpregs = core["thread_info"]["fpregs"] data = elf.elf_xsave_struct() ctypes.memset(ctypes.addressof(data), 0, ctypes.sizeof(data)) data.i387.cwd = fpregs["cwd"] data.i387.swd = fpregs["swd"] data.i387.twd = fpregs["twd"] data.i387.fop = fpregs["fop"] data.i387.rip = fpregs["rip"] data.i387.rdp = fpregs["rdp"] data.i387.mxcsr = fpregs["mxcsr"] data.i387.mxcsr_mask = fpregs["mxcsr_mask"] data.i387.st_space = (ctypes.c_uint * len(fpregs["st_space"]))( *fpregs["st_space"]) data.i387.xmm_space = (ctypes.c_uint * len(fpregs["xmm_space"]))( *fpregs["xmm_space"]) if "xsave" in fpregs: data.xsave_hdr.xstate_bv = fpregs["xsave"]["xstate_bv"] data.ymmh.ymmh_space = (ctypes.c_uint * len(fpregs["xsave"]["ymmh_space"]))( *fpregs["xsave"]["ymmh_space"]) nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 6 nhdr.n_descsz = ctypes.sizeof(data) nhdr.n_type = elf.NT_X86_XSTATE note = elf_note() note.data = data note.owner = b"LINUX" note.nhdr = nhdr return note def _gen_siginfo(self, pid, tid): """ Generate NT_SIGINFO note for thread tid of process pid. 
""" siginfo = elf.siginfo_t() # FIXME zeroify everything for now ctypes.memset(ctypes.addressof(siginfo), 0, ctypes.sizeof(siginfo)) nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.siginfo_t()) nhdr.n_type = elf.NT_SIGINFO note = elf_note() note.data = siginfo note.owner = b"CORE" note.nhdr = nhdr return note def _gen_auxv(self, pid): """ Generate NT_AUXV note for thread tid of process pid. """ mm = self.mms[pid] num_auxv = len(mm["mm_saved_auxv"]) // 2 class elf_auxv(ctypes.Structure): _fields_ = [("auxv", elf.Elf64_auxv_t * num_auxv)] auxv = elf_auxv() for i in range(num_auxv): auxv.auxv[i].a_type = mm["mm_saved_auxv"][i] auxv.auxv[i].a_val = mm["mm_saved_auxv"][i + 1] nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf_auxv()) nhdr.n_type = elf.NT_AUXV note = elf_note() note.data = auxv note.owner = b"CORE" note.nhdr = nhdr return note def _gen_files(self, pid): """ Generate NT_FILE note for process pid. """ mm = self.mms[pid] class mmaped_file_info: start = None end = None file_ofs = None name = None infos = [] for vma in mm["vmas"]: if vma["shmid"] == 0: # shmid == 0 means that it is not a file continue shmid = vma["shmid"] off = vma["pgoff"] // PAGESIZE files = self.reg_files fname = next(filter(lambda x: x["id"] == shmid, files))["name"] info = mmaped_file_info() info.start = vma["start"] info.end = vma["end"] info.file_ofs = off info.name = fname infos.append(info) # /* # * Format of NT_FILE note: # * # * long count -- how many files are mapped # * long page_size -- units for file_ofs # * array of [COUNT] elements of # * long start # * long end # * long file_ofs # * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL... 
# */ fields = [] fields.append(("count", ctypes.c_long)) fields.append(("page_size", ctypes.c_long)) for i in range(len(infos)): fields.append(("start" + str(i), ctypes.c_long)) fields.append(("end" + str(i), ctypes.c_long)) fields.append(("file_ofs" + str(i), ctypes.c_long)) for i in range(len(infos)): fields.append( ("name" + str(i), ctypes.c_char * (len(infos[i].name) + 1))) class elf_files(ctypes.Structure): _fields_ = fields data = elf_files() data.count = len(infos) data.page_size = PAGESIZE for i in range(len(infos)): info = infos[i] setattr(data, "start" + str(i), info.start) setattr(data, "end" + str(i), info.end) setattr(data, "file_ofs" + str(i), info.file_ofs) setattr(data, "name" + str(i), info.name.encode()) nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 # strlen + 1 nhdr.n_descsz = ctypes.sizeof(elf_files()) nhdr.n_type = elf.NT_FILE note = elf_note() note.nhdr = nhdr note.owner = b"CORE" note.data = data return note def _gen_thread_notes(self, pid, tid): notes = [] notes.append(self._gen_prstatus(pid, tid)) notes.append(self._gen_fpregset(pid, tid)) notes.append(self._gen_x86_xstate(pid, tid)) notes.append(self._gen_siginfo(pid, tid)) return notes def _gen_notes(self, pid): """ Generate notes for core dump of process pid. """ notes = [] notes.append(self._gen_prpsinfo(pid)) threads = self.pstree[pid]["threads"] # Main thread first notes += self._gen_thread_notes(pid, pid) # Then other threads for tid in threads: if tid == pid: continue notes += self._gen_thread_notes(pid, tid) notes.append(self._gen_auxv(pid)) notes.append(self._gen_files(pid)) return notes def _get_page(self, pid, page_no): """ Try to find memory page page_no in pages.img image for process pid. """ pagemap = self.pagemaps[pid] # First entry is pagemap_head, we will need it later to open # proper pages.img. 
pages_id = pagemap[0]["pages_id"] off = 0 # in pages for m in pagemap[1:]: found = False for i in range(m["nr_pages"]): if m["vaddr"] + i * PAGESIZE == page_no * PAGESIZE: found = True break off += 1 if not found: continue if "in_parent" in m and m["in_parent"]: ppid = self.pstree[pid]["ppid"] return self._get_page(ppid, page_no) else: with open(self._imgs_dir + "/pages-%s.img" % pages_id, 'rb') as f: f.seek(off * PAGESIZE) return f.read(PAGESIZE) return None def _gen_mem_chunk(self, pid, vma, size): """ Obtain vma contents for process pid. """ f = None if size == 0: return b"" if vma["status"] & status["VMA_AREA_VVAR"]: # FIXME this is what gdb does, as vvar vma # is not readable from userspace? return b"\0" * size elif vma["status"] & status["VMA_AREA_VSYSCALL"]: # FIXME need to dump it with criu or read from # current process. return b"\0" * size if vma["status"] & status["VMA_FILE_SHARED"] or \ vma["status"] & status["VMA_FILE_PRIVATE"]: # Open file before iterating vma pages shmid = vma["shmid"] off = vma["pgoff"] files = self.reg_files fname = next(filter(lambda x: x["id"] == shmid, files))["name"] try: f = open(fname, 'rb') except FileNotFoundError: sys.exit('Required file %s not found.' % fname) f.seek(off) start = vma["start"] end = vma["start"] + size # Split requested memory chunk into pages, so it could be # pictured as: # # "----" -- part of page with memory outside of our vma; # "XXXX" -- memory from our vma; # # Start page Pages in the middle End page # [-----XXXXX]...[XXXXXXXXXX][XXXXXXXXXX]...[XXX-------] # # Each page could be found in pages.img or in a standalone # file described by shmid field in vma entry and # corresponding entry in reg-files.img. # For VMA_FILE_PRIVATE vma, unchanged pages are taken from # a file, and changed ones -- from pages.img. # Finally, if no page is found neither in pages.img nor # in file, hole in inserted -- a page filled with zeroes. 
start_page = start // PAGESIZE end_page = end // PAGESIZE buf = b"" for page_no in range(start_page, end_page + 1): page = None # Search for needed page in pages.img and reg-files.img # and choose appropriate. page_mem = self._get_page(pid, page_no) if f is not None: page = f.read(PAGESIZE) if page_mem is not None: # Page from pages.img has higher priority # than one from mapped file on disk. page = page_mem if page is None: # Hole page = PAGESIZE * b"\0" # If it is a start or end page, we need to read # only part of it. if page_no == start_page: n_skip = start - page_no * PAGESIZE if start_page == end_page: n_read = size else: n_read = PAGESIZE - n_skip elif page_no == end_page: n_skip = 0 n_read = end - page_no * PAGESIZE else: n_skip = 0 n_read = PAGESIZE buf += page[n_skip:n_skip + n_read] # Don't forget to close file. if f is not None: f.close() return buf def _gen_cmdline(self, pid): """ Generate full command with arguments. """ mm = self.mms[pid] vma = {} vma["start"] = mm["mm_arg_start"] vma["end"] = mm["mm_arg_end"] # Dummy flags and status. vma["flags"] = 0 vma["status"] = 0 size = vma["end"] - vma["start"] chunk = self._gen_mem_chunk(pid, vma, size) # Replace all '\0's with spaces. return chunk.replace(b'\0', b' ') def _get_vma_dump_size(self, vma): """ Calculate amount of vma to put into core dump. """ if (vma["status"] & status["VMA_AREA_VVAR"] or vma["status"] & status["VMA_AREA_VSYSCALL"] or vma["status"] & status["VMA_AREA_VDSO"]): size = vma["end"] - vma["start"] elif vma["prot"] == 0: size = 0 elif (vma["prot"] & prot["PROT_READ"] and vma["prot"] & prot["PROT_EXEC"]): size = PAGESIZE elif (vma["status"] & status["VMA_ANON_SHARED"] or vma["status"] & status["VMA_FILE_SHARED"] or vma["status"] & status["VMA_ANON_PRIVATE"] or vma["status"] & status["VMA_FILE_PRIVATE"]): size = vma["end"] - vma["start"] else: size = 0 return size def _get_vma_flags(self, vma): """ Convert vma flags int elf flags. 
""" flags = 0 if vma['prot'] & prot["PROT_READ"]: flags = flags | elf.PF_R if vma['prot'] & prot["PROT_WRITE"]: flags = flags | elf.PF_W if vma['prot'] & prot["PROT_EXEC"]: flags = flags | elf.PF_X return flags def _gen_vmas(self, pid): """ Generate vma contents for core dump for process pid. """ mm = self.mms[pid] class vma_class: data = None filesz = None memsz = None flags = None start = None vmas = [] for vma in mm["vmas"]: v = vma_class() v.filesz = self._get_vma_dump_size(vma) v.data = self._gen_mem_chunk(pid, vma, v.filesz) v.memsz = vma["end"] - vma["start"] v.start = vma["start"] v.flags = self._get_vma_flags(vma) vmas.append(v) return vmas crac-criu-1.5.0/coredump/criu_coredump/elf.py000066400000000000000000000613641471504326700212060ustar00rootroot00000000000000# Define structures and constants for generating elf file. import ctypes Elf64_Half = ctypes.c_uint16 # typedef uint16_t Elf64_Half; Elf64_Word = ctypes.c_uint32 # typedef uint32_t Elf64_Word; Elf64_Addr = ctypes.c_uint64 # typedef uint64_t Elf64_Addr; Elf64_Off = ctypes.c_uint64 # typedef uint64_t Elf64_Off; Elf64_Xword = ctypes.c_uint64 # typedef uint64_t Elf64_Xword; # Elf64_Ehdr related constants. # e_ident size. 
EI_NIDENT = 16 # #define EI_NIDENT (16) EI_MAG0 = 0 # #define EI_MAG0 0 /* File identification byte 0 index */ ELFMAG0 = 0x7f # #define ELFMAG0 0x7f /* Magic number byte 0 */ EI_MAG1 = 1 # #define EI_MAG1 1 /* File identification byte 1 index */ ELFMAG1 = ord('E') # #define ELFMAG1 'E' /* Magic number byte 1 */ EI_MAG2 = 2 # #define EI_MAG2 2 /* File identification byte 2 index */ ELFMAG2 = ord('L') # #define ELFMAG2 'L' /* Magic number byte 2 */ EI_MAG3 = 3 # #define EI_MAG3 3 /* File identification byte 3 index */ ELFMAG3 = ord('F') # #define ELFMAG3 'F' /* Magic number byte 3 */ EI_CLASS = 4 # #define EI_CLASS 4 /* File class byte index */ EI_DATA = 5 # #define EI_DATA 5 /* Data encoding byte index */ EI_VERSION = 6 # #define EI_VERSION 6 /* File version byte index */ ELFDATA2LSB = 1 # #define ELFDATA2LSB 1 /* 2's complement, little endian */ ELFCLASS64 = 2 # #define ELFCLASS64 2 /* 64-bit objects */ # Legal values for e_type (object file type). ET_CORE = 4 # #define ET_CORE 4 /* Core file */ # Legal values for e_machine (architecture). EM_X86_64 = 62 # #define EM_X86_64 62 /* AMD x86-64 architecture */ # Legal values for e_version (version). 
EV_CURRENT = 1 # #define EV_CURRENT 1 /* Current version */ class Elf64_Ehdr(ctypes.Structure): # typedef struct _fields_ = [ ("e_ident", ctypes.c_ubyte * EI_NIDENT), # unsigned char e_ident[EI_NIDENT]; ("e_type", Elf64_Half), # Elf64_Half e_type; ("e_machine", Elf64_Half), # Elf64_Half e_machine; ("e_version", Elf64_Word), # Elf64_Word e_version; ("e_entry", Elf64_Addr), # Elf64_Addr e_entry; ("e_phoff", Elf64_Off), # Elf64_Off e_phoff; ("e_shoff", Elf64_Off), # Elf64_Off e_shoff; ("e_flags", Elf64_Word), # Elf64_Word e_flags; ("e_ehsize", Elf64_Half), # Elf64_Half e_ehsize; ("e_phentsize", Elf64_Half), # Elf64_Half e_phentsize; ("e_phnum", Elf64_Half), # Elf64_Half e_phnum; ("e_shentsize", Elf64_Half), # Elf64_Half e_shentsize; ("e_shnum", Elf64_Half), # Elf64_Half e_shnum; ("e_shstrndx", Elf64_Half) # Elf64_Half e_shstrndx; ] # } Elf64_Ehdr; # Elf64_Phdr related constants. # Legal values for p_type (segment type). PT_LOAD = 1 # #define PT_LOAD 1 /* Loadable program segment */ PT_NOTE = 4 # #define PT_NOTE 4 /* Auxiliary information */ # Legal values for p_flags (segment flags). PF_X = 1 # #define PF_X (1 << 0) /* Segment is executable */ PF_W = 1 << 1 # #define PF_W (1 << 1) /* Segment is writable */ PF_R = 1 << 2 # #define PF_R (1 << 2) /* Segment is readable */ class Elf64_Phdr(ctypes.Structure): # typedef struct _fields_ = [ ("p_type", Elf64_Word), # Elf64_Word p_type; ("p_flags", Elf64_Word), # Elf64_Word p_flags; ("p_offset", Elf64_Off), # Elf64_Off p_offset; ("p_vaddr", Elf64_Addr), # Elf64_Addr p_vaddr; ("p_paddr", Elf64_Addr), # Elf64_Addr p_paddr; ("p_filesz", Elf64_Xword), # Elf64_Xword p_filesz; ("p_memsz", Elf64_Xword), # Elf64_Xword p_memsz; ("p_align", Elf64_Xword), # Elf64_Xword p_align; ] # } Elf64_Phdr; # Elf64_auxv_t related constants. 
class _Elf64_auxv_t_U(ctypes.Union): _fields_ = [("a_val", ctypes.c_uint64)] class Elf64_auxv_t(ctypes.Structure): # typedef struct _fields_ = [ ("a_type", ctypes.c_uint64), # uint64_t a_type; /* Entry type */ ("a_un", _Elf64_auxv_t_U) # union # uint64_t a_val; /* Integer value */ # /* We use to have pointer elements added here. We cannot do that, # though, since it does not work when using 32-bit definitions # on 64-bit platforms and vice versa. */ # } a_un; ] # } Elf64_auxv_t; # Elf64_Nhdr related constants. NT_PRSTATUS = 1 # #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ NT_FPREGSET = 2 # #define NT_FPREGSET 2 /* Contains copy of fpregset struct */ NT_PRPSINFO = 3 # #define NT_PRPSINFO 3 /* Contains copy of prpsinfo struct */ NT_AUXV = 6 # #define NT_AUXV 6 /* Contains copy of auxv array */ NT_SIGINFO = 0x53494749 # #define NT_SIGINFO 0x53494749 /* Contains copy of siginfo_t, size might increase */ NT_FILE = 0x46494c45 # #define NT_FILE 0x46494c45 /* Contains information about mapped files */ NT_X86_XSTATE = 0x202 # #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ class Elf64_Nhdr(ctypes.Structure): # typedef struct _fields_ = [ ( "n_namesz", Elf64_Word ), # Elf64_Word n_namesz; /* Length of the note's name. */ ( "n_descsz", Elf64_Word ), # Elf64_Word n_descsz; /* Length of the note's descriptor. */ ("n_type", Elf64_Word ), # Elf64_Word n_type; /* Type of the note. */ ] # } Elf64_Nhdr; # Elf64_Shdr related constants. 
class Elf64_Shdr(ctypes.Structure): _fields_ = [ ( # Section name (string tbl index) "sh_name", Elf64_Word ), ( # Section type "sh_type", Elf64_Word ), ( # Section flags "sh_flags", Elf64_Xword ), ( # Section virtual addr at execution "sh_addr", Elf64_Addr ), ( # Section file offset "sh_offset", Elf64_Off ), ( # Section size in bytes "sh_size", Elf64_Xword ), ( # Link to another section "sh_link", Elf64_Word ), ( # Additional section information "sh_info", Elf64_Word ), ( # Section alignment "sh_addralign", Elf64_Xword ), ( # Entry size if section holds table "sh_entsize", Elf64_Xword ) ] # elf_prstatus related constants. # Signal info. class elf_siginfo(ctypes.Structure): # struct elf_siginfo _fields_ = [ ( # Signal number "si_signo", ctypes.c_int ), ( # Extra code "si_code", ctypes.c_int ), ( # Errno "si_errno", ctypes.c_int ) ] # A time value that is accurate to the nearest # microsecond but also has a range of years. class timeval(ctypes.Structure): # struct timeval _fields_ = [ ( # __time_t tv_sec; /* Seconds. */ "tv_sec", ctypes.c_long ), ( # __suseconds_t tv_usec; /* Microseconds. 
*/ "tv_usec", ctypes.c_long ) ] class user_regs_struct(ctypes.Structure): # struct user_regs_struct _fields_ = [ ("r15", ctypes.c_ulonglong), # __extension__ unsigned long long int r15; ("r14", ctypes.c_ulonglong), # __extension__ unsigned long long int r14; ("r13", ctypes.c_ulonglong), # __extension__ unsigned long long int r13; ("r12", ctypes.c_ulonglong), # __extension__ unsigned long long int r12; ("rbp", ctypes.c_ulonglong), # __extension__ unsigned long long int rbp; ("rbx", ctypes.c_ulonglong), # __extension__ unsigned long long int rbx; ("r11", ctypes.c_ulonglong), # __extension__ unsigned long long int r11; ("r10", ctypes.c_ulonglong), # __extension__ unsigned long long int r10; ("r9", ctypes.c_ulonglong), # __extension__ unsigned long long int r9; ("r8", ctypes.c_ulonglong), # __extension__ unsigned long long int r8; ("rax", ctypes.c_ulonglong), # __extension__ unsigned long long int rax; ("rcx", ctypes.c_ulonglong), # __extension__ unsigned long long int rcx; ("rdx", ctypes.c_ulonglong), # __extension__ unsigned long long int rdx; ("rsi", ctypes.c_ulonglong), # __extension__ unsigned long long int rsi; ("rdi", ctypes.c_ulonglong), # __extension__ unsigned long long int rdi; ("orig_rax", ctypes.c_ulonglong ), # __extension__ unsigned long long int orig_rax; ("rip", ctypes.c_ulonglong), # __extension__ unsigned long long int rip; ("cs", ctypes.c_ulonglong), # __extension__ unsigned long long int cs; ("eflags", ctypes.c_ulonglong), # __extension__ unsigned long long int eflags; ("rsp", ctypes.c_ulonglong), # __extension__ unsigned long long int rsp; ("ss", ctypes.c_ulonglong), # __extension__ unsigned long long int ss; ("fs_base", ctypes.c_ulonglong ), # __extension__ unsigned long long int fs_base; ("gs_base", ctypes.c_ulonglong ), # __extension__ unsigned long long int gs_base; ("ds", ctypes.c_ulonglong), # __extension__ unsigned long long int ds; ("es", ctypes.c_ulonglong), # __extension__ unsigned long long int es; ("fs", ctypes.c_ulonglong), # 
__extension__ unsigned long long int fs; ("gs", ctypes.c_ulonglong ) # __extension__ unsigned long long int gs; ] # elf_greg_t = ctypes.c_ulonglong # ELF_NGREG = ctypes.sizeof(user_regs_struct)/ctypes.sizeof(elf_greg_t) # elf_gregset_t = elf_greg_t*ELF_NGREG elf_gregset_t = user_regs_struct class elf_prstatus(ctypes.Structure): # struct elf_prstatus _fields_ = [ ( # Info associated with signal # struct elf_siginfo pr_info; "pr_info", elf_siginfo ), ( # Current signal # short int pr_cursig; "pr_cursig", ctypes.c_short ), ( # Set of pending signals # unsigned long int pr_sigpend; "pr_sigpend", ctypes.c_ulong ), ( # Set of held signals # unsigned long int pr_sighold; "pr_sighold", ctypes.c_ulong ), ( # Process ID # __pid_t pr_pid; "pr_pid", ctypes.c_int ), ( # Parent process ID # __pid_t pr_ppid; "pr_ppid", ctypes.c_int ), ( # Parent group ID # __pid_t pr_pgrp; "pr_pgrp", ctypes.c_int ), ( # Parent session ID # __pid_t pr_sid; "pr_sid", ctypes.c_int ), ( # User time # struct timeval pr_utime; "pr_utime", timeval ), ( # System time # struct timeval pr_stime; "pr_stime", timeval ), ( # Cumulative user time # struct timeval pr_cutime; "pr_cutime", timeval ), ( # Cumulative system time # struct timeval pr_cstime; "pr_cstime", timeval ), ( # GP registers # elf_gregset_t pr_reg; "pr_reg", elf_gregset_t ), ( # True if math copro being used # int pr_fpvalid; "pr_fpvalid", ctypes.c_int ) ] # elf_prpsinfo related constants. 
# Number of chars for args # #define ELF_PRARGSZ (80) ELF_PRARGSZ = 80 class elf_prpsinfo(ctypes.Structure): # struct elf_prpsinfo _fields_ = [ ( # Numeric process state # char pr_state; "pr_state", ctypes.c_byte ), ( # Char for pr_state # char pr_sname; "pr_sname", ctypes.c_char ), ( # Zombie # char pr_zomb; "pr_zomb", ctypes.c_byte ), ( # Nice value # char pr_nice; "pr_nice", ctypes.c_byte ), ( # Flags # unsigned long int pr_flag; "pr_flag", ctypes.c_ulong ), ( # User ID # unsigned int pr_uid; "pr_uid", ctypes.c_uint ), ( # Group ID # unsigned int pr_gid; "pr_gid", ctypes.c_uint ), ("pr_pid", ctypes.c_int), ("pr_ppid", ctypes.c_int), ("pr_pgrp", ctypes.c_int), ("pr_sid", ctypes.c_int), # /* Lots missing */ ( # Filename of executable # char pr_fname[16]; "pr_fname", ctypes.c_char * 16 ), ( # Initial part of arg list # char pr_psargs[ELF_PRARGSZ]; "pr_psargs", ctypes.c_char * ELF_PRARGSZ ) ] class user_fpregs_struct(ctypes.Structure): # struct user_fpregs_struct _fields_ = [ # unsigned short int cwd; ("cwd", ctypes.c_ushort), # unsigned short int swd; ("swd", ctypes.c_ushort), # unsigned short int ftw; ("ftw", ctypes.c_ushort), # unsigned short int fop; ("fop", ctypes.c_ushort), # __extension__ unsigned long long int rip; ("rip", ctypes.c_ulonglong), # __extension__ unsigned long long int rdp; ("rdp", ctypes.c_ulonglong), # unsigned int mxcsr; ("mxcsr", ctypes.c_uint), # unsigned int mxcr_mask; ("mxcr_mask", ctypes.c_uint), # unsigned int st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ ("st_space", ctypes.c_uint * 32), # unsigned int xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ ("xmm_space", ctypes.c_uint * 64), # unsigned int padding[24]; ("padding", ctypes.c_uint * 24), ] elf_fpregset_t = user_fpregs_struct # siginfo_t related constants. _SI_MAX_SIZE = 128 _SI_PAD_SIZE = (_SI_MAX_SIZE // ctypes.sizeof(ctypes.c_int)) - 4 # /* kill(). 
*/ class _siginfo_t_U_kill(ctypes.Structure): # struct _fields_ = [ ( # Sending process ID # __pid_t si_pid; "si_pid", ctypes.c_int ), ( # Real user ID of sending process # __uid_t si_uid; "si_uid", ctypes.c_uint ) ] # } _kill; # Type for data associated with a signal. class sigval_t(ctypes.Union): # typedef union sigval _fields_ = [ ("sival_int", ctypes.c_int), # int sival_int; ("sical_ptr", ctypes.c_void_p), # void *sival_ptr; ] # } sigval_t; # /* POSIX.1b timers. */ class _siginfo_t_U_timer(ctypes.Structure): # struct _fields_ = [ ( # Timer ID # int si_tid; "si_tid", ctypes.c_int ), ( # Overrun count # int si_overrun; "si_overrun", ctypes.c_int ), ( # Signal value # sigval_t si_sigval; "si_sigval", sigval_t ) ] # } _timer; # /* POSIX.1b signals. */ class _siginfo_t_U_rt(ctypes.Structure): # struct _fields_ = [ ( # Sending process ID # __pid_t si_pid; "si_pid", ctypes.c_int ), ( # Real user ID of sending process # __uid_t si_uid; "si_uid", ctypes.c_uint ), ( # Signal value # sigval_t si_sigval; "si_sigval", sigval_t ) ] # } _rt; # /* SIGCHLD. */ class _siginfo_t_U_sigchld(ctypes.Structure): # struct _fields_ = [ ( # Which child # __pid_t si_pid; "si_pid", ctypes.c_int ), ( # Real user ID of sending process # __uid_t si_uid; "si_uid", ctypes.c_uint ), ( # Exit value or signal # int si_status; "si_status", ctypes.c_int ), ( # __sigchld_clock_t si_utime; "si_utime", ctypes.c_long ), ( # __sigchld_clock_t si_stime; "si_stime", ctypes.c_long ) ] # } _sigchld; # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ class _siginfo_t_U_sigfault(ctypes.Structure): # struct _fields_ = [ ( # Faulting insn/memory ref # void *si_addr; "si_addr", ctypes.c_void_p ), ( # Valid LSB of the reported address # short int si_addr_lsb; "si_addr_lsb", ctypes.c_short ) ] # } _sigfault; # /* SIGPOLL. 
*/ class _siginfo_t_U_sigpoll(ctypes.Structure): # struct _fields_ = [ ( # Band event for SIGPOLL # long int si_band; "si_band", ctypes.c_long ), ( # int si_fd; "si_fd", ctypes.c_int ) ] # } _sigpoll; # /* SIGSYS. */ class _siginfo_t_U_sigsys(ctypes.Structure): # struct _fields_ = [ ("_call_addr", ctypes.c_void_p ), # void *_call_addr; /* Calling user insn. */ ( "_syscall", ctypes.c_int ), # int _syscall; /* Triggering system call number. */ ("_arch", ctypes.c_uint ) # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ ] # } _sigsys; class _siginfo_t_U(ctypes.Union): # union _fields_ = [ ("_pad", ctypes.c_int * _SI_PAD_SIZE), # int _pad[__SI_PAD_SIZE]; # /* kill(). */ ("_kill", _siginfo_t_U_kill), # struct # __pid_t si_pid; /* Sending process ID. */ # __uid_t si_uid; /* Real user ID of sending process. */ # } _kill; # /* POSIX.1b timers. */ ("_timer", _siginfo_t_U_timer), # struct # int si_tid; /* Timer ID. */ # int si_overrun; /* Overrun count. */ # sigval_t si_sigval; /* Signal value. */ # } _timer; # /* POSIX.1b signals. */ ("_rt", _siginfo_t_U_rt), # struct # __pid_t si_pid; /* Sending process ID. */ # __uid_t si_uid; /* Real user ID of sending process. */ # sigval_t si_sigval; /* Signal value. */ # } _rt; # /* SIGCHLD. */ ("_sigchld", _siginfo_t_U_sigchld), # struct # __pid_t si_pid; /* Which child. */ # __uid_t si_uid; /* Real user ID of sending process. */ # int si_status; /* Exit value or signal. */ # __sigchld_clock_t si_utime; # __sigchld_clock_t si_stime; # } _sigchld; # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ ("_sigfault", _siginfo_t_U_sigfault), # struct # void *si_addr; /* Faulting insn/memory ref. */ # short int si_addr_lsb; /* Valid LSB of the reported address. */ # } _sigfault; # /* SIGPOLL. */ ("_sigpoll", _siginfo_t_U_sigpoll), # struct # long int si_band; /* Band event for SIGPOLL. */ # int si_fd; # } _sigpoll; # /* SIGSYS. */ ("_sigsys", _siginfo_t_U_sigpoll) # struct # void *_call_addr; /* Calling user insn. 
*/ # int _syscall; /* Triggering system call number. */ # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ # } _sigsys; ] # } _sifields; class siginfo_t(ctypes.Structure): # typedef struct _fields_ = [ ( # Signal number # int si_signo; "si_signo", ctypes.c_int ), ( # If non-zero, an errno value associated with # int si_errno; "si_errno", ctypes.c_int ), ( # Signal code - this signal, as defined in # int si_code; "si_code", ctypes.c_int ), ( # Union "_sifields", _siginfo_t_U ) # int _pad[__SI_PAD_SIZE]; # # /* kill(). */ # struct # __pid_t si_pid; /* Sending process ID. */ # __uid_t si_uid; /* Real user ID of sending process. */ # } _kill; # # /* POSIX.1b timers. */ # struct # int si_tid; /* Timer ID. */ # int si_overrun; /* Overrun count. */ # sigval_t si_sigval; /* Signal value. */ # } _timer; # # /* POSIX.1b signals. */ # struct # __pid_t si_pid; /* Sending process ID. */ # __uid_t si_uid; /* Real user ID of sending process. */ # sigval_t si_sigval; /* Signal value. */ # } _rt; # # /* SIGCHLD. */ # struct # __pid_t si_pid; /* Which child. */ # __uid_t si_uid; /* Real user ID of sending process. */ # int si_status; /* Exit value or signal. */ # __sigchld_clock_t si_utime; # __sigchld_clock_t si_stime; # } _sigchld; # # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ # struct # void *si_addr; /* Faulting insn/memory ref. */ # short int si_addr_lsb; /* Valid LSB of the reported address. */ # } _sigfault; # # /* SIGPOLL. */ # struct # long int si_band; /* Band event for SIGPOLL. */ # int si_fd; # } _sigpoll; # # /* SIGSYS. */ # struct # void *_call_addr; /* Calling user insn. */ # int _syscall; /* Triggering system call number. */ # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ # } _sigsys; # } _sifields; ] # } siginfo_t __SI_ALIGNMENT; # xsave related. 
class ymmh_struct(ctypes.Structure): # struct ymmh_struct { _fields_ = [ # u32 ymmh_space[64]; ("ymmh_space", 64 * ctypes.c_uint) ] # } __packed; class xsave_hdr_struct(ctypes.Structure): # struct xsave_hdr_struct { _fields_ = [ # u64 xstate_bv; ("xstate_bv", ctypes.c_ulonglong), # u64 reserved1[2]; ("reserved1", ctypes.c_ulonglong * 2), # u64 reserved2[5]; ("reserved2", ctypes.c_ulonglong * 5) ] # } __packed; class i387_fxsave_struct(ctypes.Structure): # struct i387_fxsave_struct { _fields_ = [ ( # Control Word # u16 cwd; "cwd", ctypes.c_ushort ), ( # Status Word # u16 swd; "swd", ctypes.c_ushort ), ( # Tag Word # u16 twd; "twd", ctypes.c_ushort ), ( # Last Instruction Opcode # u16 fop; "fop", ctypes.c_ushort ), # union { # struct { ( # Instruction Pointer # u64 rip; "rip", ctypes.c_ulonglong ), ( # Data Pointer # u64 rdp; "rdp", ctypes.c_ulonglong ), # struct { # u32 fip; /* FPU IP Offset */ # u32 fcs; /* FPU IP Selector */ # u32 foo; /* FPU Operand Offset */ # u32 fos; /* FPU Operand Selector */ ( # MXCSR Register State # u32 mxcsr; "mxcsr", ctypes.c_uint ), ( # MXCSR Mask # u32 mxcsr_mask; "mxcsr_mask", ctypes.c_uint ), # 8*16 bytes for each FP-reg = 128 bytes ( # u32 st_space[32]; "st_space", ctypes.c_uint * 32 ), # 16*16 bytes for each XMM-reg = 256 bytes ( # u32 xmm_space[64]; "xmm_space", ctypes.c_uint * 64 ), ( # u32 padding[12]; "padding", ctypes.c_uint * 12 ), # union { ( # u32 padding1[12]; "padding1", ctypes.c_uint * 12 ) # u32 sw_reserved[12]; ] # } __aligned(16); class elf_xsave_struct(ctypes.Structure): # struct xsave_struct { _fields_ = [ # struct i387_fxsave_struct i387; ("i387", i387_fxsave_struct), # struct xsave_hdr_struct xsave_hdr; ("xsave_hdr", xsave_hdr_struct), # struct ymmh_struct ymmh; ("ymmh", ymmh_struct) ] # } __aligned(FP_MIN_ALIGN_BYTES) __packed; 
crac-criu-1.5.0/coredump/pycriu000077700000000000000000000000001471504326700206642../lib/pycriuustar00rootroot00000000000000crac-criu-1.5.0/crit/000077500000000000000000000000001471504326700143375ustar00rootroot00000000000000crac-criu-1.5.0/crit/.gitignore000066400000000000000000000000471471504326700163300ustar00rootroot00000000000000crit.egg-info/ build/ dist/ version.py crac-criu-1.5.0/crit/Makefile000066400000000000000000000026631471504326700160060ustar00rootroot00000000000000PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') PIP_BREAK_SYSTEM_PACKAGES := 0 VERSION_FILE := $(if $(obj),$(addprefix $(obj)/,crit/version.py),crit/version.py) all-y += ${VERSION_FILE} cleanup-y += ${VERSION_FILE} ${VERSION_FILE}: $(Q) echo "__version__ = '${CRIU_VERSION}'" > $@ install: ${VERSION_FILE} ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) $(E) " SKIP INSTALL crit: Externally managed python environment (See PEP 668 for more information)" $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make install" else $(E) " INSTALL " crit $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit endif else $(E) " INSTALL " crit $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit endif .PHONY: install uninstall: ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) $(E) " SKIP UNINSTALL crit: Externally managed python environment (See PEP 668 for more information)" $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make uninstall" else $(E) " UNINSTALL" crit $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit endif else $(E) " UNINSTALL" crit $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit endif .PHONY: uninstall 
crac-criu-1.5.0/crit/crit/000077500000000000000000000000001471504326700153005ustar00rootroot00000000000000crac-criu-1.5.0/crit/crit/__init__.py000066400000000000000000000000411471504326700174040ustar00rootroot00000000000000from .version import __version__ crac-criu-1.5.0/crit/crit/__main__.py000077500000000000000000000270371471504326700174060ustar00rootroot00000000000000#!/usr/bin/env python3 import argparse import sys import json import os import pycriu from . import __version__ def inf(opts): if opts['in']: return open(opts['in'], 'rb') else: if sys.stdin.isatty(): # If we are reading from a terminal (not a pipe) we want text input and not binary return sys.stdin return sys.stdin.buffer def outf(opts, decode): # Decode means from protobuf to JSON. # Use text when writing to JSON else use binaray mode if opts['out']: mode = 'wb+' if decode: mode = 'w+' return open(opts['out'], mode) else: if decode: return sys.stdout return sys.stdout.buffer def dinf(opts, name): return open(os.path.join(opts['dir'], name), mode='rb') def decode(opts): indent = None try: img = pycriu.images.load(inf(opts), opts['pretty'], opts['nopl']) except pycriu.images.MagicException as exc: print("Unknown magic %#x.\n" "Maybe you are feeding me an image with " "raw data(i.e. pages.img)?" % exc.magic, file=sys.stderr) sys.exit(1) if opts['pretty']: indent = 4 f = outf(opts, True) json.dump(img, f, indent=indent) if f == sys.stdout: f.write("\n") def encode(opts): try: img = json.load(inf(opts)) except UnicodeDecodeError: print("Cannot read JSON.\n" "Maybe you are feeding me an image with protobuf data? 
" "Encode expects JSON input.", file=sys.stderr) sys.exit(1) pycriu.images.dump(img, outf(opts, False)) def info(opts): infs = pycriu.images.info(inf(opts)) json.dump(infs, sys.stdout, indent=4) print() def get_task_id(p, val): return p[val] if val in p else p['ns_' + val][0] # # Explorers # class ps_item: def __init__(self, p, core): self.pid = get_task_id(p, 'pid') self.ppid = p['ppid'] self.p = p self.core = core self.kids = [] def show_ps(p, opts, depth=0): print("%7d%7d%7d %s%s" % (p.pid, get_task_id(p.p, 'pgid'), get_task_id(p.p, 'sid'), ' ' * (4 * depth), p.core['tc']['comm'])) for kid in p.kids: show_ps(kid, opts, depth + 1) def explore_ps(opts): pss = {} ps_img = pycriu.images.load(dinf(opts, 'pstree.img')) for p in ps_img['entries']: core = pycriu.images.load( dinf(opts, 'core-%d.img' % get_task_id(p, 'pid'))) ps = ps_item(p, core['entries'][0]) pss[ps.pid] = ps # Build tree psr = None for pid in pss: p = pss[pid] if p.ppid == 0: psr = p continue pp = pss[p.ppid] pp.kids.append(p) print("%7s%7s%7s %s" % ('PID', 'PGID', 'SID', 'COMM')) show_ps(psr, opts) files_img = None def ftype_find_in_files(opts, ft, fid): global files_img if files_img is None: try: files_img = pycriu.images.load(dinf(opts, "files.img"))['entries'] except Exception: files_img = [] if len(files_img) == 0: return None for f in files_img: if f['id'] == fid: return f return None def ftype_find_in_image(opts, ft, fid, img): f = ftype_find_in_files(opts, ft, fid) if f: if ft['field'] in f: return f[ft['field']] else: return None if ft['img'] is None: ft['img'] = pycriu.images.load(dinf(opts, img))['entries'] for f in ft['img']: if f['id'] == fid: return f return None def ftype_reg(opts, ft, fid): rf = ftype_find_in_image(opts, ft, fid, 'reg-files.img') return rf and rf['name'] or 'unknown path' def ftype_pipe(opts, ft, fid): p = ftype_find_in_image(opts, ft, fid, 'pipes.img') return p and 'pipe[%d]' % p['pipe_id'] or 'pipe[?]' def ftype_unix(opts, ft, fid): ux = ftype_find_in_image(opts, ft, 
fid, 'unixsk.img') if not ux: return 'unix[?]' n = ux['name'] and ' %s' % ux['name'] or '' return 'unix[%d (%d)%s]' % (ux['ino'], ux['peer'], n) file_types = { 'REG': { 'get': ftype_reg, 'img': None, 'field': 'reg' }, 'PIPE': { 'get': ftype_pipe, 'img': None, 'field': 'pipe' }, 'UNIXSK': { 'get': ftype_unix, 'img': None, 'field': 'usk' }, } def ftype_gen(opts, ft, fid): return '%s.%d' % (ft['typ'], fid) files_cache = {} def get_file_str(opts, fd): key = (fd['type'], fd['id']) f = files_cache.get(key, None) if not f: ft = file_types.get(fd['type'], {'get': ftype_gen, 'typ': fd['type']}) f = ft['get'](opts, ft, fd['id']) files_cache[key] = f return f def explore_fds(opts): ps_img = pycriu.images.load(dinf(opts, 'pstree.img')) for p in ps_img['entries']: pid = get_task_id(p, 'pid') idi = pycriu.images.load(dinf(opts, 'ids-%s.img' % pid)) fdt = idi['entries'][0]['files_id'] fdi = pycriu.images.load(dinf(opts, 'fdinfo-%d.img' % fdt)) print("%d" % pid) for fd in fdi['entries']: print("\t%7d: %s" % (fd['fd'], get_file_str(opts, fd))) fdi = pycriu.images.load(dinf(opts, 'fs-%d.img' % pid))['entries'][0] print("\t%7s: %s" % ('cwd', get_file_str(opts, { 'type': 'REG', 'id': fdi['cwd_id'] }))) print("\t%7s: %s" % ('root', get_file_str(opts, { 'type': 'REG', 'id': fdi['root_id'] }))) class vma_id: def __init__(self): self.__ids = {} self.__last = 1 def get(self, iid): ret = self.__ids.get(iid, None) if not ret: ret = self.__last self.__last += 1 self.__ids[iid] = ret return ret def explore_mems(opts): ps_img = pycriu.images.load(dinf(opts, 'pstree.img')) vids = vma_id() for p in ps_img['entries']: pid = get_task_id(p, 'pid') mmi = pycriu.images.load(dinf(opts, 'mm-%d.img' % pid))['entries'][0] print("%d" % pid) print("\t%-36s %s" % ('exe', get_file_str(opts, { 'type': 'REG', 'id': mmi['exe_file_id'] }))) for vma in mmi['vmas']: st = vma['status'] if st & (1 << 10): fn = ' ' + 'ips[%lx]' % vids.get(vma['shmid']) elif st & (1 << 8): fn = ' ' + 'shmem[%lx]' % 
vids.get(vma['shmid']) elif st & (1 << 11): fn = ' ' + 'packet[%lx]' % vids.get(vma['shmid']) elif st & ((1 << 6) | (1 << 7)): fn = ' ' + get_file_str(opts, { 'type': 'REG', 'id': vma['shmid'] }) if vma['pgoff']: fn += ' + %#lx' % vma['pgoff'] if st & (1 << 7): fn += ' (s)' elif st & (1 << 1): fn = ' [stack]' elif st & (1 << 2): fn = ' [vsyscall]' elif st & (1 << 3): fn = ' [vdso]' elif vma['flags'] & 0x0100: # growsdown fn = ' [stack?]' else: fn = '' if not st & (1 << 0): fn += ' *' prot = vma['prot'] & 0x1 and 'r' or '-' prot += vma['prot'] & 0x2 and 'w' or '-' prot += vma['prot'] & 0x4 and 'x' or '-' astr = '%08lx-%08lx' % (vma['start'], vma['end']) print("\t%-36s%s%s" % (astr, prot, fn)) def explore_rss(opts): ps_img = pycriu.images.load(dinf(opts, 'pstree.img')) for p in ps_img['entries']: pid = get_task_id(p, 'pid') vmas = pycriu.images.load(dinf(opts, 'mm-%d.img' % pid))['entries'][0]['vmas'] pms = pycriu.images.load(dinf(opts, 'pagemap-%d.img' % pid))['entries'] print("%d" % pid) vmi = 0 pvmi = -1 for pm in pms[1:]: pstr = '\t%lx / %-8d' % (pm['vaddr'], pm['nr_pages']) while vmas[vmi]['end'] <= pm['vaddr']: vmi += 1 pme = pm['vaddr'] + (pm['nr_pages'] << 12) vstr = '' while vmas[vmi]['start'] < pme: vma = vmas[vmi] if vmi == pvmi: vstr += ' ~' else: vstr += ' %08lx / %-8d' % ( vma['start'], (vma['end'] - vma['start']) >> 12) if vma['status'] & ((1 << 6) | (1 << 7)): vstr += ' ' + get_file_str(opts, { 'type': 'REG', 'id': vma['shmid'] }) pvmi = vmi vstr += '\n\t%23s' % '' vmi += 1 vmi -= 1 print('%-24s%s' % (pstr, vstr)) explorers = { 'ps': explore_ps, 'fds': explore_fds, 'mems': explore_mems, 'rss': explore_rss } def explore(opts): explorers[opts['what']](opts) def main(): desc = 'CRiu Image Tool' parser = argparse.ArgumentParser( description=desc, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument('--version', action='version', version=__version__) subparsers = parser.add_subparsers( help='Use crit CMD --help for command-specific help') # 
Decode decode_parser = subparsers.add_parser( 'decode', help='convert criu image from binary type to json') decode_parser.add_argument( '--pretty', help='Multiline with indents and some numerical fields in field-specific format', action='store_true') decode_parser.add_argument( '-i', '--in', help='criu image in binary format to be decoded (stdin by default)') decode_parser.add_argument( '-o', '--out', help='where to put criu image in json format (stdout by default)') decode_parser.set_defaults(func=decode, nopl=False) # Encode encode_parser = subparsers.add_parser( 'encode', help='convert criu image from json type to binary') encode_parser.add_argument( '-i', '--in', help='criu image in json format to be encoded (stdin by default)') encode_parser.add_argument( '-o', '--out', help='where to put criu image in binary format (stdout by default)') encode_parser.set_defaults(func=encode) # Info info_parser = subparsers.add_parser('info', help='show info about image') info_parser.add_argument("in") info_parser.set_defaults(func=info) # Explore x_parser = subparsers.add_parser('x', help='explore image dir') x_parser.add_argument('dir') x_parser.add_argument('what', choices=['ps', 'fds', 'mems', 'rss']) x_parser.set_defaults(func=explore) # Show show_parser = subparsers.add_parser( 'show', help="convert criu image from binary to human-readable json") show_parser.add_argument("in") show_parser.add_argument('--nopl', help='do not show entry payload (if exists)', action='store_true') show_parser.set_defaults(func=decode, pretty=True, out=None) opts = vars(parser.parse_args()) if not opts: sys.stderr.write(parser.format_usage()) sys.stderr.write("crit: error: too few arguments\n") sys.exit(1) opts["func"](opts) if __name__ == '__main__': main() crac-criu-1.5.0/crit/pyproject.toml000066400000000000000000000006571471504326700172630ustar00rootroot00000000000000[build-system] requires = ["setuptools"] build-backend = "setuptools.build_meta" [project] name = "crit" description = 
"CRiu Image Tool" authors = [ {name = "CRIU team", email = "criu@openvz.org"}, ] license = {text = "GPLv2"} dynamic = ["version"] requires-python = ">=3.6" [project.scripts] crit = "crit.__main__:main" [tool.setuptools] packages = ["crit"] [tool.setuptools.dynamic] version = {attr = "crit.__version__"} crac-criu-1.5.0/crit/setup.cfg000066400000000000000000000011121471504326700161530ustar00rootroot00000000000000# Configuring setuptools using pyproject.toml files was introduced in setuptools 61.0.0 # https://setuptools.pypa.io/en/latest/history.html#v61-0-0 # For older versions of setuptools, we need to use the setup.cfg file # https://setuptools.pypa.io/en/latest/userguide/declarative_config.html#declarative-config [metadata] name = crit description = CRiu Image Tool author = CRIU team author_email = criu@openvz.org license = GPLv2 version = attr: crit.__version__ [options] packages = crit python_requires = >=3.6 [options.entry_points] console_scripts = crit = crit.__main__:main crac-criu-1.5.0/crit/setup.py000066400000000000000000000001341471504326700160470ustar00rootroot00000000000000#!/usr/bin/env python3 import setuptools if __name__ == '__main__': setuptools.setup() crac-criu-1.5.0/criu/000077500000000000000000000000001471504326700143405ustar00rootroot00000000000000crac-criu-1.5.0/criu/Makefile000066400000000000000000000106671471504326700160120ustar00rootroot00000000000000# here is a workaround for a bug in libnl-3: # 6a8d90f5fec4 "attr: Allow attribute type 0" WRAPFLAGS += -Wl,--wrap=nla_parse,--wrap=nlmsg_parse ARCH_DIR := criu/arch/$(ARCH) PIE_DIR := criu/pie export ARCH_DIR PIE_DIR ifeq ($(filter clean mrproper,$(MAKECMDGOALS)),) CFLAGS += $(shell $(COMPEL_BIN) includes) COMPEL_LIBS := $(shell $(COMPEL_BIN) --static libs) CFLAGS_PIE += $(shell $(COMPEL_BIN) cflags) endif # # Configuration file paths CONFIG-DEFINES += -DSYSCONFDIR='"/etc"' CONFIG-DEFINES += -DGLOBAL_CONFIG_DIR='"/etc/criu/"' CONFIG-DEFINES += -DDEFAULT_CONFIG_FILENAME='"default.conf"' 
CONFIG-DEFINES += -DUSER_CONFIG_DIR='".criu/"' # # General flags. CFLAGS += -fno-strict-aliasing CFLAGS += -iquote criu/include CFLAGS += -iquote include CFLAGS += -iquote images CFLAGS += -iquote $(ARCH_DIR)/include CFLAGS += -iquote . CFLAGS += $(shell $(PKG_CONFIG) --cflags --static libnl-3.0) CFLAGS += $(CONFIG-DEFINES) ifeq ($(GMON),1) CFLAGS += -pg GMONLDOPT := -pg endif # msg-* printing include $(__nmk_dir)msg.mk # # Needed libraries checks include criu/Makefile.packages # # Architecture dependent part. ARCH-LIB := $(ARCH_DIR)/crtools.built-in.o $(ARCH-LIB): .FORCE $(Q) $(MAKE) $(build)=$(ARCH_DIR) all # # PIE library code. criu/pie/pie.lib.a: $(ARCH-LIB) .FORCE $(Q) $(MAKE) $(call build-as,Makefile.library,criu/pie) all # # PIE code blobs themseves. pie: criu/pie/pie.lib.a $(Q) $(MAKE) $(build)=criu/pie all .PHONY: pie criu/pie/Makefile: ; criu/pie/Makefile.library: ; criu/pie/%: pie ; # # CRIU executable PROGRAM-BUILTINS += criu/pie/pie.lib.a PROGRAM-BUILTINS += images/built-in.o PROGRAM-BUILTINS += $(obj)/built-in.o PROGRAM-BUILTINS += $(ARCH-LIB) PROGRAM-BUILTINS += soccr/libsoccr.a PROGRAM-BUILTINS += $(COMPEL_LIBS) $(obj)/built-in.o: pie $(Q) $(MAKE) $(call build-as,Makefile.crtools,criu) all $(obj)/Makefile: ; $(obj)/Makefile.crtools: ; $(obj)/Makefile.packages: ; $(obj)/%: pie $(Q) $(MAKE) $(call build-as,Makefile.crtools,criu) $@ $(obj)/criu: $(PROGRAM-BUILTINS) $(call msg-link, $@) $(Q) $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LIBS) $(WRAPFLAGS) $(GMONLDOPT) -rdynamic -o $@ UNIT-BUILTINS += $(obj)/util.o UNIT-BUILTINS += $(obj)/config.o UNIT-BUILTINS += $(obj)/log.o UNIT-BUILTINS += $(obj)/string.o UNIT-BUILTINS += $(obj)/unittest/built-in.o $(obj)/unittest/Makefile: ; $(obj)/unittest/%: .FORCE $(obj)/unittest/built-in.o: .FORCE $(Q) $(MAKE) $(call build-as,Makefile,criu/unittest) all $(obj)/unittest/unittest: $(UNIT-BUILTINS) $(call msg-link, $@) $(Q) $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LIBS) $(WRAPFLAGS) -rdynamic -o $@ unittest: $(obj)/unittest/unittest $(Q) 
$(obj)/unittest/$@ .PHONY: unittest # # Clean the most, except generated c files subclean: $(Q) $(RM) $(obj)/*.{gcda,gcno,gcov} $(Q) $(RM) $(obj)/pie/*.{gcda,gcno,gcov} $(Q) $(RM) -r $(obj)/gcov $(Q) $(MAKE) $(build)=$(ARCH_DIR) clean $(Q) $(MAKE) $(call build-as,Makefile.library,$(PIE_DIR)) clean $(Q) $(MAKE) $(call build-as,Makefile.crtools,criu) clean $(Q) $(MAKE) $(call build-as,Makefile,criu/unittest) clean $(Q) $(MAKE) $(build)=$(PIE_DIR) clean .PHONY: subclean cleanup-y += $(obj)/criu clean: subclean # # Delete all generated files subproper: $(Q) $(MAKE) $(build)=$(ARCH_DIR) mrproper $(Q) $(MAKE) $(call build-as,Makefile.library,$(PIE_DIR)) mrproper $(Q) $(MAKE) $(call build-as,Makefile.crtools,criu) mrproper $(Q) $(MAKE) $(build)=$(PIE_DIR) mrproper .PHONY: subproper mrproper: subproper UAPI_HEADERS := criu/include/criu-plugin.h UAPI_HEADERS += criu/include/criu-log.h install: $(obj)/criu $(E) " INSTALL " $(obj)/criu $(Q) mkdir -p $(DESTDIR)$(SBINDIR) $(Q) install -m 755 $(obj)/criu $(DESTDIR)$(SBINDIR)/crac-criu $(Q) mkdir -p $(DESTDIR)$(INCLUDEDIR)/crac-criu/criu/ $(Q) install -m 644 $(UAPI_HEADERS) $(DESTDIR)$(INCLUDEDIR)/crac-criu/criu/ $(Q) mkdir -p $(DESTDIR)$(LIBEXECDIR)/criu/scripts $(Q) install -m 755 scripts/systemd-autofs-restart.sh $(DESTDIR)$(LIBEXECDIR)/criu/scripts $(E) " INSTALL " scripts/criu-ns $(Q) install -m 755 scripts/criu-ns $(DESTDIR)$(SBINDIR)/crac-criu-ns .PHONY: install uninstall: $(E) " UNINSTALL" criu $(Q) $(RM) $(addprefix $(DESTDIR)$(SBINDIR)/,crac-criu) $(Q) $(RM) $(addprefix $(DESTDIR)$(SBINDIR)/,crac-criu-ns) $(Q) $(RM) $(addprefix $(DESTDIR)$(INCLUDEDIR)/crac-criu/criu/,$(notdir $(UAPI_HEADERS))) $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBEXECDIR)/criu/scripts/,systemd-autofs-restart.sh) .PHONY: uninstall all-y += check-packages $(obj)/criu crac-criu-1.5.0/criu/Makefile.crtools000066400000000000000000000052061471504326700174670ustar00rootroot00000000000000CFLAGS_REMOVE_clone-noasan.o += $(CFLAGS-ASAN) CFLAGS_kerndat.o += 
-DKDAT_MAGIC_2=${shell echo $${SOURCE_DATE_EPOCH:-$$(date +%s)}} -DKDAT_RUNDIR=\"$(RUNDIR)\" ldflags-y += -r obj-y += action-scripts.o obj-y += external.o obj-y += aio.o obj-y += apparmor.o obj-y += bfd.o obj-y += bitmap.o obj-y += cgroup.o obj-y += cgroup-props.o obj-y += clone-noasan.o obj-y += cr-check.o obj-y += cr-dedup.o obj-y += cr-dump.o obj-y += cr-errno.o obj-y += cr-restore.o obj-y += cr-service.o obj-y += crtools.o obj-y += eventfd.o obj-y += eventpoll.o obj-y += fault-injection.o obj-y += fifo.o obj-y += file-ids.o obj-y += file-lock.o obj-y += files-ext.o obj-y += files.o obj-y += files-reg.o obj-y += fsnotify.o obj-y += image-desc.o obj-y += image.o obj-y += img-streamer.o obj-y += ipc_ns.o obj-y += irmap.o obj-y += kcmp-ids.o obj-y += kerndat.o obj-y += libnetlink.o obj-y += log.o obj-y += lsm.o obj-y += mem.o obj-y += memfd.o obj-y += mount.o obj-y += mount-v2.o obj-y += filesystems.o obj-y += namespaces.o obj-y += netfilter.o obj-y += net.o obj-y += pagemap-cache.o obj-y += page-pipe.o obj-y += pagemap.o obj-y += page-xfer.o obj-y += parasite-syscall.o obj-y += pie-util.o obj-y += pipes.o obj-y += plugin.o obj-y += proc_parse.o obj-y += protobuf-desc.o obj-y += protobuf.o obj-y += pstree.o obj-y += rbtree.o obj-y += rst-malloc.o obj-y += seccomp.o obj-y += seize.o obj-y += shmem.o obj-y += sigframe.o obj-y += signalfd.o obj-y += sk-inet.o obj-y += sk-netlink.o obj-y += sk-packet.o obj-y += sk-queue.o obj-y += sk-tcp.o obj-y += sk-unix.o obj-y += sockets.o obj-y += stats.o obj-y += string.o obj-y += setproctitle.o obj-y += sysctl.o obj-y += sysfs_parse.o obj-y += timerfd.o obj-$(CONFIG_GNUTLS) += tls.o obj-y += tty.o obj-y += tun.o obj-y += util.o obj-y += uts_ns.o obj-y += path.o obj-y += autofs.o obj-y += fdstore.o obj-y += uffd.o obj-y += config.o obj-y += servicefd.o obj-y += pie-util-vdso.o obj-y += vdso.o obj-y += timens.o obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o CFLAGS_pie-util-vdso-elf32.o += 
-DCONFIG_VDSO_32 obj-$(CONFIG_COMPAT) += vdso-compat.o CFLAGS_REMOVE_vdso-compat.o += $(CFLAGS-ASAN) $(CFLAGS-GCOV) obj-y += pidfd-store.o obj-y += hugetlb.o obj-y += pages-compress.o PROTOBUF_GEN := scripts/protobuf-gen.sh $(obj)/protobuf-desc.d: $(obj)/protobuf-desc-gen.h $(obj)/protobuf-desc-gen.h: $(PROTOBUF_GEN) criu/include/protobuf-desc.h $(call msg-gen, $@) $(Q) $(SH) $(PROTOBUF_GEN) > $@ mrproper-y += $(obj)/protobuf-desc-gen.h crac-criu-1.5.0/criu/Makefile.packages000066400000000000000000000031101471504326700175500ustar00rootroot00000000000000REQ-RPM-PKG-NAMES += protobuf REQ-RPM-PKG-NAMES += protobuf-c REQ-RPM-PKG-NAMES += protobuf-c-devel REQ-RPM-PKG-NAMES += protobuf-compiler REQ-RPM-PKG-NAMES += protobuf-devel REQ-RPM-PKG-NAMES += protobuf-python REQ-RPM-PKG-NAMES += libnl3-devel REQ-RPM-PKG-NAMES += libcap-devel REQ-RPM-PKG-TEST-NAMES += libaio-devel REQ-DEB-PKG-NAMES += libprotobuf-dev REQ-DEB-PKG-NAMES += libprotobuf-c-dev REQ-DEB-PKG-NAMES += protobuf-c-compiler REQ-DEB-PKG-NAMES += protobuf-compiler REQ-DEB-PKG-NAMES += $(PYTHON)-protobuf REQ-DEB-PKG-NAMES += libnl-3-dev REQ-DEB-PKG-NAMES += libcap-dev REQ-DEB-PKG-TEST-NAMES += $(PYTHON)-yaml REQ-DEB-PKG-TEST-NAMES += libaio-dev REQ-DEB-PKG-TEST-NAMES += libaio-dev REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-PyYAML export LIBS += -Wl,-Bstatic -lprotobuf-c -lsoccr -Lsoccr/ -lnet -lnl-3 -Lcriu -llz4io -Llz4/lib -llz4 -Wl,-Bdynamic -lm -ldl -lpthread check-packages-failed: $(warning Can not find some of the required libraries) $(warning Make sure the following packages are installed) $(warning RPM based distros: $(REQ-RPM-PKG-NAMES)) $(warning DEB based distros: $(REQ-DEB-PKG-NAMES)) $(warning To run tests the following packages are needed) $(warning RPM based distros: $(REQ-RPM-PKG-TEST-NAMES)) $(warning DEB based distros: $(REQ-DEB-PKG-TEST-NAMES)) $(error Compilation aborted) # # Make sure all required libs are installed PROGRAM_STUB := int main(int argc, char **argv) { return 0; } check-packages: $(Q) 
$(call try-cc,$(PROGRAM_STUB),$(LIBS)) \ || $(MAKE) -f $(obj)/Makefile.packages check-packages-failed .PHONY: check-packages-failed check-packages crac-criu-1.5.0/criu/action-scripts.c000066400000000000000000000101011471504326700174370ustar00rootroot00000000000000#include #include #include #include #include "cr_options.h" #include "common/list.h" #include "xmalloc.h" #include "log.h" #include "servicefd.h" #include "cr-service.h" #include "action-scripts.h" #include "pstree.h" #include "common/bug.h" #include "util.h" #include #include #include "common/scm.h" static const char *action_names[ACT_MAX] = { [ACT_PRE_STREAM] = "pre-stream", [ACT_PRE_DUMP] = "pre-dump", [ACT_POST_DUMP] = "post-dump", [ACT_PRE_RESTORE] = "pre-restore", [ACT_POST_RESTORE] = "post-restore", [ACT_NET_LOCK] = "network-lock", [ACT_NET_UNLOCK] = "network-unlock", [ACT_SETUP_NS] = "setup-namespaces", [ACT_POST_SETUP_NS] = "post-setup-namespaces", [ACT_PRE_RESUME] = "pre-resume", [ACT_POST_RESUME] = "post-resume", [ACT_ORPHAN_PTS_MASTER] = "orphan-pts-master", [ACT_STATUS_READY] = "status-ready", [ACT_QUERY_EXT_FILES] = "query-ext-files", }; struct script { struct list_head node; char *path; }; enum { SCRIPTS_NONE, SCRIPTS_SHELL, SCRIPTS_RPC }; static int scripts_mode = SCRIPTS_NONE; static LIST_HEAD(scripts); static int run_shell_scripts(const char *action) { int retval = 0; struct script *script; static unsigned env_set = 0; #define ENV_IMGDIR 0x1 #define ENV_ROOTPID 0x2 if (list_empty(&scripts)) return 0; if (setenv("CRTOOLS_SCRIPT_ACTION", action, 1)) { pr_perror("Can't set CRTOOLS_SCRIPT_ACTION=%s", action); return -1; } if (!(env_set & ENV_IMGDIR)) { char image_dir[PATH_MAX]; sprintf(image_dir, "/proc/%ld/fd/%d", (long)getpid(), get_service_fd(IMG_FD_OFF)); if (setenv("CRTOOLS_IMAGE_DIR", image_dir, 1)) { pr_perror("Can't set CRTOOLS_IMAGE_DIR=%s", image_dir); return -1; } env_set |= ENV_IMGDIR; } if (!(env_set & ENV_ROOTPID) && root_item) { int pid; pid = root_item->pid->real; if (pid != 
-1) { char root_item_pid[16]; snprintf(root_item_pid, sizeof(root_item_pid), "%d", pid); if (setenv("CRTOOLS_INIT_PID", root_item_pid, 1)) { pr_perror("Can't set CRTOOLS_INIT_PID=%s", root_item_pid); return -1; } env_set |= ENV_ROOTPID; } } list_for_each_entry(script, &scripts, node) { int err; pr_debug("\t[%s]\n", script->path); err = cr_system(-1, -1, -1, script->path, (char *[]){ script->path, NULL }, 0); if (err) pr_err("Script %s exited with %d\n", script->path, err); retval |= err; } unsetenv("CRTOOLS_SCRIPT_ACTION"); return retval; } int rpc_send_fd(enum script_actions act, int fd) { const char *action = action_names[act]; int rpc_sk; if (scripts_mode != SCRIPTS_RPC) return -1; rpc_sk = get_service_fd(RPC_SK_OFF); if (rpc_sk < 0) return -1; pr_debug("\tRPC\n"); return send_criu_rpc_script(act, (char *)action, rpc_sk, fd); } int rpc_query_external_files(void) { int rpc_sk; if (scripts_mode != SCRIPTS_RPC) return 0; rpc_sk = get_service_fd(RPC_SK_OFF); if (rpc_sk < 0) return -1; return exec_rpc_query_external_files((char *)action_names[ACT_QUERY_EXT_FILES], rpc_sk); } int run_scripts(enum script_actions act) { int ret = 0; const char *action = action_names[act]; pr_debug("Running %s scripts\n", action); switch (scripts_mode) { case SCRIPTS_NONE: return 0; case SCRIPTS_RPC: ret = rpc_send_fd(act, -1); if (ret) break; /* Enable scripts from config file in RPC mode (fallthrough) */ case SCRIPTS_SHELL: ret = run_shell_scripts(action); break; default: BUG(); } if (ret) pr_err("One of more action scripts failed\n"); return ret; } int add_script(char *path) { struct script *script; /* Set shell mode when a script is added but don't overwrite RPC mode */ if (scripts_mode == SCRIPTS_NONE) scripts_mode = SCRIPTS_SHELL; script = xmalloc(sizeof(struct script)); if (script == NULL) return -1; script->path = xstrdup(path); if (!script->path) { xfree(script); return -1; } list_add(&script->node, &scripts); return 0; } int add_rpc_notify(int sk) { int fd; fd = dup(sk); if (fd 
< 0) { pr_perror("dup() failed"); return -1; } scripts_mode = SCRIPTS_RPC; if (install_service_fd(RPC_SK_OFF, fd) < 0) return -1; return 0; } crac-criu-1.5.0/criu/aio.c000066400000000000000000000064351471504326700152640ustar00rootroot00000000000000#include #include #include #include "vma.h" #include "xmalloc.h" #include "pstree.h" #include "restorer.h" #include "aio.h" #include "rst_info.h" #include "rst-malloc.h" #include "parasite.h" #include "parasite-syscall.h" #include "images/mm.pb-c.h" #include "compel/infect.h" #define NR_IOEVENTS_IN_NPAGES(npages) ((PAGE_SIZE * (npages) - sizeof(struct aio_ring)) / sizeof(struct io_event)) int dump_aio_ring(MmEntry *mme, struct vma_area *vma) { int nr = mme->n_aios; AioRingEntry *re; mme->aios = xrealloc(mme->aios, (nr + 1) * sizeof(re)); if (!mme->aios) return -1; re = xmalloc(sizeof(*re)); if (!re) return -1; aio_ring_entry__init(re); re->id = vma->e->start; re->ring_len = vma->e->end - vma->e->start; re->nr_req = aio_estimate_nr_reqs(re->ring_len); if (!re->nr_req) { xfree(re); return -1; } mme->aios[nr] = re; mme->n_aios = nr + 1; pr_info("Dumping AIO ring @%" PRIx64 "-%" PRIx64 "\n", vma->e->start, vma->e->end); return 0; } void free_aios(MmEntry *mme) { int i; if (mme->aios) { for (i = 0; i < mme->n_aios; i++) xfree(mme->aios[i]); xfree(mme->aios); } } unsigned int aio_estimate_nr_reqs(unsigned int size) { unsigned int k_max_reqs = NR_IOEVENTS_IN_NPAGES(size / PAGE_SIZE); if (size & ~PAGE_MASK) { pr_err("Ring size is not aligned\n"); return 0; } /* * Kernel does * * nr_reqs = max(nr_reqs, nr_cpus * 4) * nr_reqs *= 2 * nr_reqs += 2 * ring = roundup(sizeof(head) + nr_reqs * sizeof(req)) * nr_reqs = (ring - sizeof(head)) / sizeof(req) * * And the k_max_reqs here is the resulting value. * * We need to get the initial nr_reqs that would grow * up back to the k_max_reqs. 
*/ return (k_max_reqs - 2) / 2; } unsigned long aio_rings_args_size(struct vm_area_list *vmas) { return sizeof(struct parasite_check_aios_args) + vmas->nr_aios * sizeof(struct parasite_aio); } int parasite_collect_aios(struct parasite_ctl *ctl, struct vm_area_list *vmas) { struct vma_area *vma; struct parasite_check_aios_args *aa; struct parasite_aio *pa; if (!vmas->nr_aios) return 0; pr_info("Checking AIO rings\n"); /* * Go to parasite and * a) check that no requests are currently pengind * b) get the maximum number of requests kernel handles * to estimate what was the user request on ring * creation. */ aa = compel_parasite_args_s(ctl, aio_rings_args_size(vmas)); pa = &aa->ring[0]; list_for_each_entry(vma, &vmas->h, list) { if (!vma_area_is(vma, VMA_AREA_AIORING)) continue; pr_debug(" `- Ring #%ld @%" PRIx64 "\n", (long)(pa - &aa->ring[0]), vma->e->start); pa->ctx = vma->e->start; pa->size = vma->e->end - vma->e->start; pa++; } aa->nr_rings = vmas->nr_aios; if (compel_rpc_call_sync(PARASITE_CMD_CHECK_AIOS, ctl)) return -1; return 0; } int prepare_aios(struct pstree_item *t, struct task_restore_args *ta) { int i; MmEntry *mm = rsti(t)->mm; /* * Put info about AIO rings, they will get remapped */ ta->rings = (struct rst_aio_ring *)rst_mem_align_cpos(RM_PRIVATE); ta->rings_n = mm->n_aios; for (i = 0; i < mm->n_aios; i++) { struct rst_aio_ring *raio; raio = rst_mem_alloc(sizeof(*raio), RM_PRIVATE); if (!raio) return -1; raio->addr = mm->aios[i]->id; raio->nr_req = mm->aios[i]->nr_req; raio->len = mm->aios[i]->ring_len; } return 0; } crac-criu-1.5.0/criu/apparmor.c000066400000000000000000000424421471504326700163330ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "common/config.h" #include "imgset.h" #include "pstree.h" #include "util.h" #include "string.h" #include "lsm.h" #include "cr_options.h" #include "kerndat.h" #include "protobuf.h" #include "images/inventory.pb-c.h" #include 
"images/apparmor.pb-c.h" /* * Apparmor stacked profile checkpoint restore. Previously, we just saved the * profile that was in use by the task, and we expected it to be present on the * target host. Now with stacking, containers are able to load their own * profiles, so we can't rely on this. * * The basic idea here is that there is some (collection) of (potentially * nested) namespaces that a container uses. We don't collect everything on the * host level, but we *do* collect everything inside the namespace; a container * could have loaded a profile but not yet used it when we start to checkpoint. * * Thus, the old code that saves and restores AA profiles is still relevant, we * just need to add the new code in this file to walk the namespace and dump * any blobs in that AA namespace, and then restore these blobs on restore so * that the profiles the old code tries to use are actually present. */ static AaNamespace **namespaces = NULL; static int n_namespaces = 0; static AaNamespace *new_namespace(char *name, AaNamespace *parent) { void *m; AaNamespace *ret; ret = xzalloc(sizeof(*ret)); if (!ret) return NULL; aa_namespace__init(ret); ret->name = xstrdup(name); if (!ret->name) { xfree(ret); return NULL; } if (parent) { m = xrealloc(parent->namespaces, sizeof(*parent->namespaces) * (parent->n_namespaces + 1)); if (!m) { xfree(ret->name); xfree(ret); return NULL; } parent->namespaces = m; parent->namespaces[parent->n_namespaces++] = ret; } m = xrealloc(namespaces, sizeof(*namespaces) * (n_namespaces + 1)); if (!m) { if (parent) parent->n_namespaces--; xfree(ret->name); xfree(ret); return NULL; } namespaces = m; namespaces[n_namespaces++] = ret; return ret; } static int collect_profile(char *path, int offset, char *dir, AaNamespace *ns) { AaPolicy *cur; int fd, my_offset, ret; struct stat sb; ssize_t n; void *m; FILE *f; my_offset = snprintf(path + offset, PATH_MAX - offset, "%s/", dir); if (my_offset < 0 || my_offset >= PATH_MAX - offset) { pr_err("snprintf 
failed\n"); return -1; } my_offset += offset; pr_info("dumping profile %s\n", path); cur = xmalloc(sizeof(*cur)); if (!cur) return -1; aa_policy__init(cur); __strlcat(path + my_offset, "name", PATH_MAX - my_offset); f = fopen(path, "r"); if (!f) { xfree(cur); pr_perror("failed to open %s", path); return -1; } ret = fscanf(f, "%ms", &cur->name); fclose(f); if (ret != 1) { xfree(cur); pr_perror("couldn't scanf %s", path); return -1; } __strlcpy(path + my_offset, "raw_data", PATH_MAX - my_offset); fd = open(path, O_RDONLY); if (fd < 0) { pr_perror("failed to open aa policy %s", path); goto err; } if (fstat(fd, &sb) < 0) { pr_perror("failed to stat %s", path); goto close; } cur->blob.len = sb.st_size; cur->blob.data = xmalloc(sb.st_size); if (!cur->blob.data) goto close; n = read(fd, cur->blob.data, sb.st_size); if (n < 0) { pr_perror("failed to read %s", path); goto close; } if (n != sb.st_size) { pr_err("didn't read all of %s\n", path); goto close; } close(fd); m = xrealloc(ns->policies, sizeof(*ns->policies) * (ns->n_policies + 1)); if (!m) goto err; ns->policies = m; ns->policies[ns->n_policies++] = cur; return 0; close: close(fd); err: xfree(cur->name); xfree(cur); return -1; } char *ns_path; int sort_err; static int no_dirdots(const struct dirent *de) { return !dir_dots(de); } static int by_time(const struct dirent **de1, const struct dirent **de2) { char path[PATH_MAX]; struct stat sb1, sb2; snprintf(path, sizeof(path), "%s/%s", ns_path, (*de1)->d_name); if (stat(path, &sb1) < 0) { pr_perror("couldn't stat %s", path); sort_err = errno; return 0; } snprintf(path, sizeof(path), "%s/%s", ns_path, (*de2)->d_name); if (stat(path, &sb2) < 0) { pr_perror("couldn't state %s", path); sort_err = errno; return 0; } if (sb1.st_mtim.tv_sec == sb2.st_mtim.tv_sec) { if (sb1.st_mtim.tv_nsec < sb2.st_mtim.tv_nsec) return -1; if (sb1.st_mtim.tv_nsec == sb2.st_mtim.tv_nsec) return 0; return 1; } else { if (sb1.st_mtim.tv_sec < sb2.st_mtim.tv_sec) return -1; return 1; } } static 
int walk_namespace(char *path, size_t offset, AaNamespace *ns) { DIR *dir = NULL; struct dirent *de, **namelist = NULL; int ret = -1, n_names = 0, i; size_t my_offset; /* collect all the child namespaces */ strcat(path, "/namespaces/"); my_offset = offset + 12; dir = opendir(path); if (!dir) goto out; while ((de = readdir(dir))) { AaNamespace *cur; if (dir_dots(de)) continue; path[my_offset] = '\0'; strcat(path, de->d_name); cur = new_namespace(de->d_name, ns); if (!cur) goto out; if (walk_namespace(path, my_offset + strlen(de->d_name), cur) < 0) { aa_namespace__free_unpacked(cur, NULL); ns->n_namespaces--; goto out; } } closedir(dir); dir = NULL; /* now collect the profiles for this namespace */ path[offset] = '\0'; strcat(path, "/profiles/"); my_offset = offset + 10; sort_err = 0; ns_path = path; n_names = scandir(path, &namelist, no_dirdots, by_time); if (n_names < 0 || sort_err != 0) { pr_perror("scandir failed"); goto out; } for (i = 0; i < n_names; i++) { de = namelist[i]; path[my_offset] = 0; if (collect_profile(path, my_offset, de->d_name, ns) < 0) goto out; } ret = 0; out: if (dir) closedir(dir); if (namelist) { for (i = 0; i < n_names; i++) xfree(namelist[i]); xfree(namelist); } return ret; } int collect_aa_namespace(char *profile) { char path[PATH_MAX], *namespace, *end; int ret, i; AaNamespace *ns; if (!profile) return 0; namespace = strchr(profile, ':'); if (!namespace) return 0; /* no namespace to dump */ namespace ++; if (!kdat.apparmor_ns_dumping_enabled) { pr_warn("Apparmor namespace present but dumping not enabled\n"); return 0; } /* XXX: this is not strictly correct; if something is using namespace * views, extra //s can indicate a namespace separation. However, I * think only the apparmor developers use this feature :) */ end = strchr(namespace, ':'); if (!end) { pr_err("couldn't find AA namespace end in: %s\n", namespace); return -1; } *end = '\0'; for (i = 0; i < n_namespaces; i++) { /* did we already dump this namespace? 
*/ if (!strcmp(namespaces[i]->name, namespace)) { *end = ':'; return 0; } } pr_info("dumping AA namespace %s\n", namespace); ns = new_namespace(namespace, NULL); *end = ':'; if (!ns) return -1; ret = snprintf(path, sizeof(path), AA_SECURITYFS_PATH "/policy/namespaces/%s", ns->name); if (ret < 0 || ret >= sizeof(path)) { pr_err("snprintf failed?\n"); goto err; } if (walk_namespace(path, ret, ns) < 0) { pr_err("walking AA namespace %s failed\n", ns->name); goto err; } return 0; err: aa_namespace__free_unpacked(ns, NULL); n_namespaces--; return -1; } /* An AA profile that allows everything that the parasite needs to do */ #define PARASITE_PROFILE \ ("profile %s {\n" \ " /** rwmlkix,\n" \ " unix,\n" \ " capability,\n" \ " signal,\n" \ "}\n") char policydir[PATH_MAX] = ".criu.temp-aa-policy.XXXXXX"; char cachedir[PATH_MAX]; struct apparmor_parser_args { char *cache; char *file; }; static int apparmor_parser_exec(void *data) { struct apparmor_parser_args *args = data; execlp("apparmor_parser", "apparmor_parser", "-QWL", args->cache, args->file, NULL); return -1; } static int apparmor_cache_exec(void *data) { execlp("apparmor_parser", "apparmor_parser", "--cache-loc", "/", "--print-cache-dir", (char *)NULL); return -1; } static void *get_suspend_policy(char *name, off_t *len) { char policy[1024], file[PATH_MAX], cache[PATH_MAX], clean_name[PATH_MAX]; void *ret = NULL; int n, fd, policy_len, i; struct stat sb; struct apparmor_parser_args args = { .cache = cache, .file = file, }; *len = 0; policy_len = snprintf(policy, sizeof(policy), PARASITE_PROFILE, name); if (policy_len < 0 || policy_len >= sizeof(policy)) { pr_err("policy name %s too long\n", name); return NULL; } /* policy names can have /s, but file paths can't */ for (i = 0; name[i]; i++) { if (i == PATH_MAX) { pr_err("name %s too long\n", name); return NULL; } clean_name[i] = name[i] == '/' ? '.' 
: name[i]; } clean_name[i] = 0; n = snprintf(file, sizeof(file), "%s/%s", policydir, clean_name); if (n < 0 || n >= sizeof(policy)) { pr_err("policy name %s too long\n", clean_name); return NULL; } n = snprintf(cache, sizeof(cache), "%s/cache", policydir); if (n < 0 || n >= sizeof(policy)) { pr_err("policy dir too long\n"); return NULL; } fd = open(file, O_CREAT | O_WRONLY, 0600); if (fd < 0) { pr_perror("couldn't create %s", file); return NULL; } n = write(fd, policy, policy_len); close(fd); if (n < 0 || n != policy_len) { pr_perror("couldn't write policy for %s", file); return NULL; } n = run_command(cachedir, sizeof(cachedir), apparmor_cache_exec, NULL); if (n < 0) { pr_err("apparmor parsing failed %d\n", n); return NULL; } n = run_command(NULL, 0, apparmor_parser_exec, &args); if (n < 0) { pr_err("apparmor parsing failed %d\n", n); return NULL; } n = snprintf(file, sizeof(file), "%s/cache/%s/%s", policydir, cachedir, clean_name); if (n < 0 || n >= sizeof(policy)) { pr_err("policy name %s too long\n", clean_name); return NULL; } fd = open(file, O_RDONLY); if (fd < 0) { pr_perror("couldn't open %s", file); return NULL; } if (fstat(fd, &sb) < 0) { pr_perror("couldn't stat fd"); goto out; } ret = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (ret == MAP_FAILED) { pr_perror("mmap of %s failed", file); goto out; } *len = sb.st_size; out: close(fd); return ret; } #define NEXT_AA_TOKEN(pos) \ while (*pos) { \ if (*pos == '/' && *(pos + 1) && *(pos + 1) == '/' && *(pos + 2) && *(pos + 2) == '&') { \ pos += 3; \ break; \ } \ if (*pos == ':' && *(pos + 1) && *(pos + 1) == '/' && *(pos + 2) && *(pos + 2) == '/') { \ pos += 3; \ break; \ } \ pos++; \ } static int write_aa_policy(AaNamespace *ns, char *path, int offset, char *rewrite, bool suspend) { int i, my_offset, ret; char *rewrite_pos = rewrite, namespace[PATH_MAX]; if (rewrite && suspend) { pr_err("requesting aa rewriting and suspension at the same time is not supported\n"); return -1; } if (!rewrite) { 
strncpy(namespace, ns->name, sizeof(namespace) - 1); } else { NEXT_AA_TOKEN(rewrite_pos); switch (*rewrite_pos) { case ':': { char tmp, *end; end = strchr(rewrite_pos + 1, ':'); if (!end) { pr_err("invalid namespace %s\n", rewrite_pos); return -1; } tmp = *end; *end = 0; __strlcpy(namespace, rewrite_pos + 1, sizeof(namespace)); *end = tmp; break; } default: __strlcpy(namespace, ns->name, sizeof(namespace)); for (i = 0; i < ns->n_policies; i++) { if (strcmp(ns->policies[i]->name, rewrite_pos)) pr_warn("binary rewriting of apparmor policies not supported right now, not renaming %s to %s\n", ns->policies[i]->name, rewrite_pos); } } } my_offset = snprintf(path + offset, PATH_MAX - offset, "/namespaces/%s", ns->name); if (my_offset < 0 || my_offset >= PATH_MAX - offset) { pr_err("snprintf'd too many characters\n"); return -1; } if (!suspend && mkdir(path, 0755) < 0 && errno != EEXIST) { pr_perror("failed to create namespace %s", path); goto fail; } for (i = 0; i < ns->n_namespaces; i++) { if (write_aa_policy(ns, path, offset + my_offset, rewrite_pos, suspend) < 0) goto fail; } ret = snprintf(path + offset + my_offset, PATH_MAX - offset - my_offset, "/.replace"); if (ret < 0 || ret >= PATH_MAX - offset - my_offset) { pr_err("snprintf failed\n"); goto fail; } for (i = 0; i < ns->n_policies; i++) { AaPolicy *p = ns->policies[i]; void *data = p->blob.data; int fd, n; off_t len = p->blob.len; fd = open(path, O_WRONLY); if (fd < 0) { pr_perror("couldn't open apparmor load file %s", path); goto fail; } if (suspend) { pr_info("suspending policy %s\n", p->name); data = get_suspend_policy(p->name, &len); if (!data) { close(fd); goto fail; } } n = write(fd, data, len); close(fd); if (suspend && munmap(data, len) < 0) { pr_perror("failed to munmap"); goto fail; } if (n != len) { pr_perror("write AA policy %s in %s failed", p->name, namespace); goto fail; } if (!suspend) pr_info("wrote aa policy %s: %s %d\n", path, p->name, n); } return 0; fail: if (!suspend) { path[offset + 
my_offset] = 0; rmdir(path); } pr_err("failed to write policy in AA namespace %s\n", namespace); return -1; } static int do_suspend(bool suspend) { int i; for (i = 0; i < n_namespaces; i++) { AaNamespace *ns = namespaces[i]; char path[PATH_MAX] = AA_SECURITYFS_PATH "/policy"; if (write_aa_policy(ns, path, strlen(path), opts.lsm_profile, suspend) < 0) return -1; } return 0; } int suspend_aa(void) { int ret; if (!mkdtemp(policydir)) { pr_perror("failed to make AA policy dir"); return -1; } ret = do_suspend(true); if (rmrf(policydir) < 0) pr_err("failed removing policy dir %s\n", policydir); return ret; } int unsuspend_aa(void) { return do_suspend(false); } int dump_aa_namespaces(void) { ApparmorEntry *ae = NULL; int ret; if (n_namespaces == 0) return 0; ae = xmalloc(sizeof(*ae)); if (!ae) return -1; apparmor_entry__init(ae); ae->n_namespaces = n_namespaces; ae->namespaces = namespaces; ret = pb_write_one(img_from_set(glob_imgset, CR_FD_APPARMOR), ae, PB_APPARMOR); apparmor_entry__free_unpacked(ae, NULL); n_namespaces = -1; namespaces = NULL; return ret; } bool check_aa_ns_dumping(void) { char contents[49]; int major, minor, ret; FILE *f; f = fopen(AA_SECURITYFS_PATH "/features/domain/stack", "r"); if (!f) return false; ret = fscanf(f, "%48s", contents); fclose(f); if (ret != 1) { pr_err("scanning aa stack feature failed\n"); return false; } if (strcmp("yes", contents)) { pr_warn("aa stack featured disabled: %s\n", contents); return false; } f = fopen(AA_SECURITYFS_PATH "/features/domain/version", "r"); if (!f) return false; ret = fscanf(f, "%d.%d", &major, &minor); fclose(f); if (ret != 2) { pr_err("scanning aa stack version failed\n"); return false; } return major >= 1 && minor >= 2; } int prepare_apparmor_namespaces(void) { struct cr_img *img; int ret, i; ApparmorEntry *ae; img = open_image(CR_FD_APPARMOR, O_RSTR); if (!img) return -1; ret = pb_read_one_eof(img, &ae, PB_APPARMOR); close_image(img); if (ret <= 0) return 0; /* there was no AA namespace entry */ if 
(!ae) { pr_err("missing aa namespace entry\n"); return -1; } /* no real reason we couldn't do this in parallel, but in usually we * expect one namespace so there's probably not a lot to be gained. */ for (i = 0; i < ae->n_namespaces; i++) { char path[PATH_MAX] = AA_SECURITYFS_PATH "/policy"; if (write_aa_policy(ae->namespaces[i], path, strlen(path), opts.lsm_profile, false) < 0) { ret = -1; goto out; } } ret = 0; out: apparmor_entry__free_unpacked(ae, NULL); return ret; } int render_aa_profile(char **out, const char *cur) { const char *pos; int n_namespaces = 0, n_profiles = 0; bool last_namespace = false; /* no rewriting necessary */ if (!opts.lsm_supplied) { *out = xsprintf("changeprofile %s", cur); if (!*out) return -1; return 0; } /* user asked to re-write to an unconfined profile */ if (!opts.lsm_profile) { *out = NULL; return 0; } pos = opts.lsm_profile; while (*pos) { switch (*pos) { case ':': n_namespaces++; break; default: n_profiles++; } NEXT_AA_TOKEN(pos); } /* special case: there is no namespacing or stacking; we can just * changeprofile to the rewritten string */ if (n_profiles == 1 && n_namespaces == 0) { *out = xsprintf("changeprofile %s", opts.lsm_profile); if (!*out) return -1; pr_info("rewrote apparmor profile from %s to %s\n", cur, *out); return 0; } pos = cur; while (*pos) { switch (*pos) { case ':': n_namespaces--; last_namespace = true; break; default: n_profiles--; } NEXT_AA_TOKEN(pos); if (n_profiles == 0 && n_namespaces == 0) break; } *out = xsprintf("changeprofile %s//%s%s", opts.lsm_profile, last_namespace ? 
"" : "&", pos); if (!*out) return -1; pr_info("rewrote apparmor profile from %s to %s\n", cur, *out); return 0; } crac-criu-1.5.0/criu/arch/000077500000000000000000000000001471504326700152555ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/aarch64/000077500000000000000000000000001471504326700165055ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/aarch64/Makefile000066400000000000000000000002071471504326700201440ustar00rootroot00000000000000builtin-name := crtools.built-in.o ldflags-y += -r obj-y += cpu.o obj-y += crtools.o obj-y += sigframe.o obj-y += bitops.o crac-criu-1.5.0/criu/arch/aarch64/bitops.S000066400000000000000000000004721471504326700201340ustar00rootroot00000000000000#include "common/asm/linkage.h" .text ENTRY(test_and_set_bit) and w3, w0, #63 eor w0, w0, w3 mov x2, #1 add x1, x1, x0, lsr #3 lsl x4, x2, x3 1: ldaxr x2, [x1] lsr x0, x2, x3 orr x2, x2, x4 stlxr w5, x2, [x1] cbnz w5, 1b and x0, x0, #1 3: ret END(test_and_set_bit) crac-criu-1.5.0/criu/arch/aarch64/cpu.c000066400000000000000000000006501471504326700174410ustar00rootroot00000000000000#undef LOG_PREFIX #define LOG_PREFIX "cpu: " #include #include "cpu.h" int cpu_init(void) { return 0; } int cpu_dump_cpuinfo(void) { return 0; } int cpu_validate_cpuinfo(void) { return 0; } int cpu_dump_cpuinfo_single(void) { return -ENOTSUP; } int cpu_validate_image_cpuinfo_single(void) { return -ENOTSUP; } int cpuinfo_dump(void) { return -ENOTSUP; } int cpuinfo_check(void) { return -ENOTSUP; } crac-criu-1.5.0/criu/arch/aarch64/crtools.c000066400000000000000000000063111471504326700203370ustar00rootroot00000000000000#include #include #include #include "types.h" #include #include #include "asm/restorer.h" #include "common/compiler.h" #include #include "asm/dump.h" #include "protobuf.h" #include "images/core.pb-c.h" #include "images/creds.pb-c.h" #include "parasite-syscall.h" #include "log.h" #include "util.h" #include "cpu.h" #include "restorer.h" #include "compel/infect.h" #define assign_reg(dst, src, 
e) dst->e = (__typeof__(dst->e))(src)->e int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) { int i; CoreEntry *core = x; // Save the Aarch64 CPU state for (i = 0; i < 31; ++i) assign_reg(core->ti_aarch64->gpregs, regs, regs[i]); assign_reg(core->ti_aarch64->gpregs, regs, sp); assign_reg(core->ti_aarch64->gpregs, regs, pc); assign_reg(core->ti_aarch64->gpregs, regs, pstate); // Save the FP/SIMD state for (i = 0; i < 32; ++i) { core->ti_aarch64->fpsimd->vregs[2 * i] = fpsimd->vregs[i]; core->ti_aarch64->fpsimd->vregs[2 * i + 1] = fpsimd->vregs[i] >> 64; } assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpsr); assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpcr); return 0; } int arch_alloc_thread_info(CoreEntry *core) { ThreadInfoAarch64 *ti_aarch64; UserAarch64RegsEntry *gpregs; UserAarch64FpsimdContextEntry *fpsimd; ti_aarch64 = xmalloc(sizeof(*ti_aarch64)); if (!ti_aarch64) goto err; thread_info_aarch64__init(ti_aarch64); core->ti_aarch64 = ti_aarch64; gpregs = xmalloc(sizeof(*gpregs)); if (!gpregs) goto err; user_aarch64_regs_entry__init(gpregs); gpregs->regs = xmalloc(31 * sizeof(uint64_t)); if (!gpregs->regs) goto err; gpregs->n_regs = 31; ti_aarch64->gpregs = gpregs; fpsimd = xmalloc(sizeof(*fpsimd)); if (!fpsimd) goto err; user_aarch64_fpsimd_context_entry__init(fpsimd); ti_aarch64->fpsimd = fpsimd; fpsimd->vregs = xmalloc(64 * sizeof(fpsimd->vregs[0])); fpsimd->n_vregs = 64; if (!fpsimd->vregs) goto err; return 0; err: return -1; } void arch_free_thread_info(CoreEntry *core) { if (CORE_THREAD_ARCH_INFO(core)) { if (CORE_THREAD_ARCH_INFO(core)->fpsimd) { xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd->vregs); xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd); } xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs); xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); xfree(CORE_THREAD_ARCH_INFO(core)); CORE_THREAD_ARCH_INFO(core) = NULL; } } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { int i; struct fpsimd_context *fpsimd = 
RT_SIGFRAME_FPU(sigframe); if (core->ti_aarch64->fpsimd->n_vregs != 64) return 1; for (i = 0; i < 32; ++i) fpsimd->vregs[i] = (__uint128_t)core->ti_aarch64->fpsimd->vregs[2 * i] | ((__uint128_t)core->ti_aarch64->fpsimd->vregs[2 * i + 1] << 64); assign_reg(fpsimd, core->ti_aarch64->fpsimd, fpsr); assign_reg(fpsimd, core->ti_aarch64->fpsimd, fpcr); fpsimd->head.magic = FPSIMD_MAGIC; fpsimd->head.size = sizeof(*fpsimd); return 0; } int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r) { #define CPREG1(d) f->uc.uc_mcontext.d = r->d int i; for (i = 0; i < 31; ++i) CPREG1(regs[i]); CPREG1(sp); CPREG1(pc); CPREG1(pstate); #undef CPREG1 return 0; } crac-criu-1.5.0/criu/arch/aarch64/include/000077500000000000000000000000001471504326700201305ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/aarch64/include/asm/000077500000000000000000000000001471504326700207105ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/aarch64/include/asm/dump.h000066400000000000000000000006161471504326700220310ustar00rootroot00000000000000#ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); static inline void core_put_tls(CoreEntry *core, tls_t tls) { core->ti_aarch64->tls = tls; } #define get_task_futex_robust_list_compat(pid, info) -1 #endif crac-criu-1.5.0/criu/arch/aarch64/include/asm/int.h000066400000000000000000000001571471504326700216560ustar00rootroot00000000000000#ifndef __CR_ASM_INT_H__ #define __CR_ASM_INT_H__ #include "asm-generic/int.h" #endif /* __CR_ASM_INT_H__ */ crac-criu-1.5.0/criu/arch/aarch64/include/asm/kerndat.h000066400000000000000000000002341471504326700225100ustar00rootroot00000000000000#ifndef __CR_ASM_KERNDAT_H__ #define __CR_ASM_KERNDAT_H__ #define kdat_compatible_cr() 0 #define kdat_can_map_vdso() 0 #endif /* __CR_ASM_KERNDAT_H__ */ 
crac-criu-1.5.0/criu/arch/aarch64/include/asm/parasite-syscall.h000066400000000000000000000001521471504326700243370ustar00rootroot00000000000000#ifndef __CR_ASM_PARASITE_SYSCALL_H__ #define __CR_ASM_PARASITE_SYSCALL_H__ struct parasite_ctl; #endif crac-criu-1.5.0/criu/arch/aarch64/include/asm/parasite.h000066400000000000000000000002611471504326700226700ustar00rootroot00000000000000#ifndef __ASM_PARASITE_H__ #define __ASM_PARASITE_H__ static inline void arch_get_tls(tls_t *ptls) { tls_t tls; asm("mrs %0, tpidr_el0" : "=r"(tls)); *ptls = tls; } #endif crac-criu-1.5.0/criu/arch/aarch64/include/asm/restore.h000066400000000000000000000011751471504326700225500ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORE_H__ #define __CR_ASM_RESTORE_H__ #include "asm/restorer.h" #include "images/core.pb-c.h" /* clang-format off */ #define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ task_args) \ asm volatile( \ "and sp, %0, #~15 \n" \ "mov x0, %2 \n" \ "br %1 \n" \ : \ : "r"(new_sp), \ "r"(restore_task_exec_start), \ "r"(task_args) \ : "x0", "memory") /* clang-format on */ static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) { *ptls = pcore->ti_aarch64->tls; } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); #endif crac-criu-1.5.0/criu/arch/aarch64/include/asm/restorer.h000066400000000000000000000102421471504326700227250ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ #include #include #include "asm/types.h" #include "images/core.pb-c.h" #include /* clang-format off */ #define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ thread_args, clone_restore_fn) \ asm volatile( \ "clone_emul: \n" \ "ldr x1, %2 \n" \ "and x1, x1, #~15 \n" \ "sub x1, x1, #16 \n" \ "stp %5, %6, [x1] \n" \ "mov x0, %1 \n" \ "mov x2, %3 \n" \ "mov x3, %4 \n" \ "mov x8, #"__stringify(__NR_clone)" \n" \ "svc #0 \n" \ \ "cbz x0, thread_run \n" \ \ "mov %0, x0 \n" \ "b clone_end \n" \ \ "thread_run: \n" \ "mov x8, 
#"__stringify(__NR_gettid)" \n" \ "svc #0 \n" \ "cmp x0, %7 \n" \ "beq cont \n" \ "mov x0, #2 \n" \ "mov x8, #"__stringify(__NR_exit)" \n" \ "svc #0 \n" \ \ "cont: \n" \ "ldp x1, x0, [sp] \n" \ "br x1 \n" \ \ "clone_end: \n" \ : "=r"(ret) \ : "r"(clone_flags), \ "m"(new_sp), \ "r"(&parent_tid), \ "r"(&thread_args[i].pid), \ "r"(clone_restore_fn), \ "r"(&thread_args[i]), \ "r"(thread_args[i].pid) \ : "x0", "x1", "x2", "x3", "x8", "memory") /* * Based on sysdeps/unix/sysv/linux/aarch64/clone.S * * int clone(int (*fn)(void *arg), x0 * void *child_stack, x1 * int flags, x2 * void *arg, x3 * pid_t *ptid, x4 * struct user_desc *tls, x5 * pid_t *ctid); x6 * * int clone3(struct clone_args *args, x0 * size_t size); x1 * * Always consult the CLONE3 wrappers for other architectures * for additional details. * */ #define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ clone_restore_fn) \ asm volatile( \ /* In contrast to the clone() wrapper above this does not put * the thread function and its arguments on the child stack, * but uses registers to pass these parameters to the child process. * Based on the glibc clone() wrapper at * sysdeps/unix/sysv/linux/aarch64/clone.S. */ \ "clone3_emul: \n" \ /* * Based on the glibc clone() wrapper, which uses x10 and x11 * to save the arguments for the child process, this does the same. * x10 for the thread function and x11 for the thread arguments. 
*/ \ "mov x10, %3 /* clone_restore_fn */ \n" \ "mov x11, %4 /* args */ \n" \ "mov x0, %1 /* &clone_args */ \n" \ "mov x1, %2 /* size */ \n" \ /* Load syscall number */ \ "mov x8, #"__stringify(__NR_clone3)" \n" \ /* Do the syscall */ \ "svc #0 \n" \ \ "cbz x0, clone3_thread_run \n" \ \ "mov %0, x0 \n" \ "b clone3_end \n" \ \ "clone3_thread_run: \n" \ /* Move args to x0 */ \ "mov x0, x11 \n" \ /* Jump to clone_restore_fn */ \ "br x10 \n" \ \ "clone3_end: \n" \ : "=r"(ret) \ : "r"(&clone_args), \ "r"(size), \ "r"(clone_restore_fn), \ "r"(args) \ : "x0", "x1", "x8", "x10", "x11", "memory") #define ARCH_FAIL_CORE_RESTORE \ asm volatile( \ "mov sp, %0 \n" \ "mov x0, #0 \n" \ "b x0 \n" \ : \ : "r"(ret) \ : "sp", "x0", "memory") /* clang-format on */ #define arch_map_vdso(map, compat) -1 int restore_gpregs(struct rt_sigframe *f, UserAarch64RegsEntry *r); int restore_nonsigframe_gpregs(UserAarch64RegsEntry *r); static inline void restore_tls(tls_t *ptls) { asm("msr tpidr_el0, %0" : : "r"(*ptls)); } static inline void *alloc_compat_syscall_stack(void) { return NULL; } static inline void free_compat_syscall_stack(void *stack32) { } static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; } static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) { return -1; } #endif crac-criu-1.5.0/criu/arch/aarch64/include/asm/thread_pointer.h000066400000000000000000000017751471504326700241020ustar00rootroot00000000000000/* __thread_pointer definition. Generic version. Copyright (C) 2021 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library. If not, see . */ #ifndef _SYS_THREAD_POINTER_H #define _SYS_THREAD_POINTER_H static inline void *__criu_thread_pointer(void) { return __builtin_thread_pointer(); } #endif /* _SYS_THREAD_POINTER_H */ crac-criu-1.5.0/criu/arch/aarch64/include/asm/types.h000066400000000000000000000014521471504326700222270ustar00rootroot00000000000000#ifndef __CR_ASM_TYPES_H__ #define __CR_ASM_TYPES_H__ #include #include #include #include "images/core.pb-c.h" #include "page.h" #include "bitops.h" #include "asm/int.h" #include #define core_is_compat(core) false typedef UserAarch64RegsEntry UserRegsEntry; #define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__AARCH64 #define CORE_THREAD_ARCH_INFO(core) core->ti_aarch64 #define TI_SP(core) ((core)->ti_aarch64->gpregs->sp) #define TI_IP(core) ((core)->ti_aarch64->gpregs->pc) static inline void *decode_pointer(uint64_t v) { return (void *)v; } static inline uint64_t encode_pointer(void *p) { return (uint64_t)p; } #define AT_VECTOR_SIZE 40 typedef uint64_t auxv_t; typedef uint64_t tls_t; #endif /* __CR_ASM_TYPES_H__ */ crac-criu-1.5.0/criu/arch/aarch64/include/asm/vdso.h000066400000000000000000000016321471504326700220360ustar00rootroot00000000000000#ifndef __CR_ASM_VDSO_H__ #define __CR_ASM_VDSO_H__ #include "asm/int.h" #include "common/compiler.h" #include "asm-generic/vdso.h" /* * This is a minimal amount of symbols * we should support at the moment. */ #define VDSO_SYMBOL_MAX 4 #define VDSO_SYMBOL_GTOD 2 /* * Workaround for VDSO array symbol table's relocation. * XXX: remove when compel/piegen will support aarch64. 
*/ #define ARCH_VDSO_SYMBOLS_LIST \ const char *aarch_vdso_symbol1 = "__kernel_clock_getres"; \ const char *aarch_vdso_symbol2 = "__kernel_clock_gettime"; \ const char *aarch_vdso_symbol3 = "__kernel_gettimeofday"; \ const char *aarch_vdso_symbol4 = "__kernel_rt_sigreturn"; #define ARCH_VDSO_SYMBOLS aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4 extern void write_intraprocedure_branch(unsigned long to, unsigned long from); #endif /* __CR_ASM_VDSO_H__ */ crac-criu-1.5.0/criu/arch/aarch64/intraprocedure.S000066400000000000000000000010071471504326700216550ustar00rootroot00000000000000.global write_intraprocedure_branch /* to is x0, from is x1 */ write_intraprocedure_branch: /* load two 32-bit instructions */ ldr x2, loadbranch /* store 64 bits of instructions and 64 bits of destination address */ stp x2, x0, [x1] /* perform required cache maintenance and synronization operations */ dc cvau, x1 dsb ish ic ivau, x1 dsb ish isb ret /* intraprocedure trampoline instructions */ loadbranch: ldr x16, =destination br x16 /* label to get relative position of literal pool */ destination: crac-criu-1.5.0/criu/arch/aarch64/restorer.c000066400000000000000000000003551471504326700205210ustar00rootroot00000000000000#include #include "restorer.h" #include "asm/restorer.h" #include #include "log.h" #include #include "cpu.h" int restore_nonsigframe_gpregs(UserRegsEntry *r) { return 0; } crac-criu-1.5.0/criu/arch/aarch64/sigframe.c000066400000000000000000000003001471504326700204370ustar00rootroot00000000000000#include "asm/types.h" #include #include "asm/sigframe.h" int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } crac-criu-1.5.0/criu/arch/aarch64/vdso-pie.c000066400000000000000000000013711471504326700204010ustar00rootroot00000000000000#include #include "asm/types.h" #include #include "parasite-vdso.h" #include "log.h" #include "common/bug.h" #ifdef LOG_PREFIX #undef LOG_PREFIX #endif #define 
LOG_PREFIX "vdso: " int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *to, struct vdso_symtable *from, bool __always_unused compat_vdso) { unsigned int i; for (i = 0; i < ARRAY_SIZE(to->symbols); i++) { if (vdso_symbol_empty(&from->symbols[i])) continue; pr_debug("br: %lx/%lx -> %lx/%lx (index %d)\n", base_from, from->symbols[i].offset, base_to, to->symbols[i].offset, i); write_intraprocedure_branch(base_to + to->symbols[i].offset, base_from + from->symbols[i].offset); } return 0; } crac-criu-1.5.0/criu/arch/arm/000077500000000000000000000000001471504326700160345ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/arm/Makefile000066400000000000000000000002261471504326700174740ustar00rootroot00000000000000builtin-name := crtools.built-in.o ldflags-y += -r -z noexecstack obj-y += cpu.o obj-y += crtools.o obj-y += sigframe.o obj-y += bitops.o crac-criu-1.5.0/criu/arch/arm/aeabi-helpers.S000066400000000000000000000036461471504326700206720ustar00rootroot00000000000000/* * Code borrowed from gcc, arm/lib1funcs.S * and adapted to CRIU macros. */ #if defined(__thumb__) /* * We don't support compiling PIEs in Thumb mode, * see top Makefile for details (ARM CFLAGS_PIE section). */ #error Unsupported Thumb mode #endif #include "common/asm/linkage.h" #define RET bx lr #define RETc(x) bx##x lr #define LSYM(x) .x .macro do_it cond, suffix="" .endm .macro ARM_DIV2_ORDER divisor, order clz \order, \divisor rsb \order, \order, #31 .endm .macro ARM_DIV_BODY dividend, divisor, result, curbit clz \curbit, \dividend clz \result, \divisor sub \curbit, \result, \curbit rsbs \curbit, \curbit, #31 addne \curbit, \curbit, \curbit, lsl #1 mov \result, #0 addne pc, pc, \curbit, lsl #2 nop .set shift, 32 .rept 32 .set shift, shift - 1 cmp \dividend, \divisor, lsl #shift adc \result, \result, \result subcs \dividend, \dividend, \divisor, lsl #shift .endr .endm /* * XXX: as an optimization add udiv instruction based version. 
* It's possible to check if CPU supports the instruction by * reading Instruction Set Attribute Register (ID_ISAR0) * and checking fields "Divide_instrs". */ ENTRY(__aeabi_uidiv) /* Note: if called via udivsi3_skip_div0_test, this will unnecessarily check for division-by-zero a second time. */ LSYM(udivsi3_skip_div0_test): subs r2, r1, #1 do_it eq RETc(eq) bcc LSYM(Ldiv0) cmp r0, r1 bls 11f tst r1, r2 beq 12f ARM_DIV_BODY r0, r1, r2, r3 mov r0, r2 RET 11: do_it eq, e moveq r0, #1 movne r0, #0 RET 12: ARM_DIV2_ORDER r1, r2 mov r0, r0, lsr r2 RET LSYM(Ldiv0): .byte 0xf0, 0x01, 0xf0, 0xe7 @ the instruction UDF #32 generates the signal SIGTRAP in Linux END(__aeabi_uidiv) ALIAS(__udivsi3, __aeabi_uidiv) ENTRY(__aeabi_uidivmod) cmp r1, #0 beq LSYM(Ldiv0) stmfd sp!, { r0, r1, lr } bl LSYM(udivsi3_skip_div0_test) ldmfd sp!, { r1, r2, lr } mul r3, r2, r0 sub r1, r1, r3 RET END(__aeabi_uidivmod) ALIAS(__umodsi3, __aeabi_uidiv) crac-criu-1.5.0/criu/arch/arm/bitops.S000066400000000000000000000011331471504326700174560ustar00rootroot00000000000000#include "common/asm/linkage.h" .syntax unified ENTRY(test_and_set_bit) ands ip, r1, #3 strbne r1, [ip] @ assert word-aligned mov r2, #1 and r3, r0, #31 @ Get bit offset mov r0, r0, lsr #5 add r1, r1, r0, lsl #2 @ Get word offset mov r3, r2, lsl r3 @ create mask dmb ish 1: ldrex r2, [r1] ands r0, r2, r3 @ save old value of bit orreq r2, r2, r3 @ toggle bit strex ip, r2, [r1] cmp ip, #0 bne 1b dmb ish cmp r0, #0 movne r0, #1 2: bx lr END(test_and_set_bit) crac-criu-1.5.0/criu/arch/arm/cpu.c000066400000000000000000000006501471504326700167700ustar00rootroot00000000000000#undef LOG_PREFIX #define LOG_PREFIX "cpu: " #include #include "cpu.h" int cpu_init(void) { return 0; } int cpu_dump_cpuinfo(void) { return 0; } int cpu_validate_cpuinfo(void) { return 0; } int cpu_dump_cpuinfo_single(void) { return -ENOTSUP; } int cpu_validate_image_cpuinfo_single(void) { return -ENOTSUP; } int cpuinfo_dump(void) { return -ENOTSUP; } int 
cpuinfo_check(void) { return -ENOTSUP; } crac-criu-1.5.0/criu/arch/arm/crtools.c000066400000000000000000000067571471504326700177040ustar00rootroot00000000000000#include #include #include "types.h" #include #include #include "asm/restorer.h" #include "common/compiler.h" #include "asm/dump.h" #include #include "protobuf.h" #include "images/core.pb-c.h" #include "images/creds.pb-c.h" #include "log.h" #include "util.h" #include "cpu.h" #include "elf.h" #include "parasite-syscall.h" #include "restorer.h" #include "compel/infect.h" #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))((src)->ARM_##e) int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; // Save the ARM CPU state assign_reg(core->ti_arm->gpregs, regs, r0); assign_reg(core->ti_arm->gpregs, regs, r1); assign_reg(core->ti_arm->gpregs, regs, r2); assign_reg(core->ti_arm->gpregs, regs, r3); assign_reg(core->ti_arm->gpregs, regs, r4); assign_reg(core->ti_arm->gpregs, regs, r5); assign_reg(core->ti_arm->gpregs, regs, r6); assign_reg(core->ti_arm->gpregs, regs, r7); assign_reg(core->ti_arm->gpregs, regs, r8); assign_reg(core->ti_arm->gpregs, regs, r9); assign_reg(core->ti_arm->gpregs, regs, r10); assign_reg(core->ti_arm->gpregs, regs, fp); assign_reg(core->ti_arm->gpregs, regs, ip); assign_reg(core->ti_arm->gpregs, regs, sp); assign_reg(core->ti_arm->gpregs, regs, lr); assign_reg(core->ti_arm->gpregs, regs, pc); assign_reg(core->ti_arm->gpregs, regs, cpsr); core->ti_arm->gpregs->orig_r0 = regs->ARM_ORIG_r0; // Save the VFP state memcpy(CORE_THREAD_ARCH_INFO(core)->fpstate->vfp_regs, &fpregs->fpregs, sizeof(fpregs->fpregs)); CORE_THREAD_ARCH_INFO(core)->fpstate->fpscr = fpregs->fpscr; return 0; } int arch_alloc_thread_info(CoreEntry *core) { ThreadInfoArm *ti_arm; UserArmRegsEntry *gpregs; UserArmVfpstateEntry *fpstate; ti_arm = xmalloc(sizeof(*ti_arm)); if (!ti_arm) goto err; thread_info_arm__init(ti_arm); core->ti_arm = ti_arm; gpregs = 
xmalloc(sizeof(*gpregs)); user_arm_regs_entry__init(gpregs); ti_arm->gpregs = gpregs; fpstate = xmalloc(sizeof(*fpstate)); if (!fpstate) goto err; user_arm_vfpstate_entry__init(fpstate); ti_arm->fpstate = fpstate; fpstate->vfp_regs = xmalloc(32 * sizeof(unsigned long long)); fpstate->n_vfp_regs = 32; if (!fpstate->vfp_regs) goto err; return 0; err: return -1; } void arch_free_thread_info(CoreEntry *core) { if (CORE_THREAD_ARCH_INFO(core)) { if (CORE_THREAD_ARCH_INFO(core)->fpstate) { xfree(CORE_THREAD_ARCH_INFO(core)->fpstate->vfp_regs); xfree(CORE_THREAD_ARCH_INFO(core)->fpstate); } xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); xfree(CORE_THREAD_ARCH_INFO(core)); CORE_THREAD_ARCH_INFO(core) = NULL; } } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { struct aux_sigframe *aux = (struct aux_sigframe *)&sigframe->sig.uc.uc_regspace; memcpy(&aux->vfp.ufp.fpregs, CORE_THREAD_ARCH_INFO(core)->fpstate->vfp_regs, sizeof(aux->vfp.ufp.fpregs)); aux->vfp.ufp.fpscr = CORE_THREAD_ARCH_INFO(core)->fpstate->fpscr; aux->vfp.magic = VFP_MAGIC; aux->vfp.size = VFP_STORAGE_SIZE; return 0; } int restore_gpregs(struct rt_sigframe *f, UserArmRegsEntry *r) { #define CPREG1(d) f->sig.uc.uc_mcontext.arm_##d = r->d #define CPREG2(d, s) f->sig.uc.uc_mcontext.arm_##d = r->s CPREG1(r0); CPREG1(r1); CPREG1(r2); CPREG1(r3); CPREG1(r4); CPREG1(r5); CPREG1(r6); CPREG1(r7); CPREG1(r8); CPREG1(r9); CPREG1(r10); CPREG1(fp); CPREG1(ip); CPREG1(sp); CPREG1(lr); CPREG1(pc); CPREG1(cpsr); #undef CPREG1 #undef CPREG2 return 0; } crac-criu-1.5.0/criu/arch/arm/include/000077500000000000000000000000001471504326700174575ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/arm/include/asm/000077500000000000000000000000001471504326700202375ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/arm/include/asm/dump.h000066400000000000000000000006121471504326700213540ustar00rootroot00000000000000#ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ extern int save_task_regs(void *, 
user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); static inline void core_put_tls(CoreEntry *core, tls_t tls) { core->ti_arm->tls = tls; } #define get_task_futex_robust_list_compat(pid, info) -1 #endif crac-criu-1.5.0/criu/arch/arm/include/asm/int.h000066400000000000000000000001571471504326700212050ustar00rootroot00000000000000#ifndef __CR_ASM_INT_H__ #define __CR_ASM_INT_H__ #include "asm-generic/int.h" #endif /* __CR_ASM_INT_H__ */ crac-criu-1.5.0/criu/arch/arm/include/asm/kerndat.h000066400000000000000000000002341471504326700220370ustar00rootroot00000000000000#ifndef __CR_ASM_KERNDAT_H__ #define __CR_ASM_KERNDAT_H__ #define kdat_compatible_cr() 0 #define kdat_can_map_vdso() 0 #endif /* __CR_ASM_KERNDAT_H__ */ crac-criu-1.5.0/criu/arch/arm/include/asm/parasite-syscall.h000066400000000000000000000001521471504326700236660ustar00rootroot00000000000000#ifndef __CR_ASM_PARASITE_SYSCALL_H__ #define __CR_ASM_PARASITE_SYSCALL_H__ struct parasite_ctl; #endif crac-criu-1.5.0/criu/arch/arm/include/asm/parasite.h000066400000000000000000000003441471504326700222210ustar00rootroot00000000000000#ifndef __ASM_PARASITE_H__ #define __ASM_PARASITE_H__ /* kuser_get_tls() kernel-provided user-helper, the address is emulated */ static inline void arch_get_tls(tls_t *ptls) { *ptls = ((tls_t(*)(void))0xffff0fe0)(); } #endif crac-criu-1.5.0/criu/arch/arm/include/asm/restore.h000066400000000000000000000013201471504326700220670ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORE_H__ #define __CR_ASM_RESTORE_H__ #include "asm/restorer.h" #include "images/core.pb-c.h" /* clang-format off */ #define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ task_args) \ asm volatile( \ "mov sp, %0 \n" \ "mov r1, %1 \n" \ "mov r0, %2 \n" \ "bx r1 \n" \ : \ : "r"(new_sp), \ "r"(restore_task_exec_start), \ "r"(task_args) \ : "r0", "r1", "memory") /* clang-format on */ static inline void 
core_get_tls(CoreEntry *pcore, tls_t *ptls) { *ptls = pcore->ti_arm->tls; } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); #endif crac-criu-1.5.0/criu/arch/arm/include/asm/restorer.h000066400000000000000000000076411471504326700222650ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ #include "asm/types.h" #include "images/core.pb-c.h" #include /* clang-format off */ #define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ thread_args, clone_restore_fn) \ asm volatile( \ "clone_emul: \n" \ "ldr r1, %2 \n" \ "sub r1, #16 \n" \ "mov r0, %6 \n" \ "str r0, [r1, #4] \n" \ "mov r0, %5 \n" \ "str r0, [r1] \n" \ "mov r0, %1 \n" \ "mov r2, %3 \n" \ "mov r3, %4 \n" \ "mov r7, #"__stringify(__NR_clone)" \n" \ "svc #0 \n" \ \ "cmp r0, #0 \n" \ "beq thread_run \n" \ \ "mov %0, r0 \n" \ "b clone_end \n" \ \ "thread_run: \n" \ "pop { r1 } \n" \ "pop { r0 } \n" \ "bx r1 \n" \ \ "clone_end: \n" \ : "=r"(ret) \ : "r"(clone_flags), \ "m"(new_sp), \ "r"(&parent_tid), \ "r"(&thread_args[i].pid), \ "r"(clone_restore_fn), \ "r"(&thread_args[i]) \ : "r0", "r1", "r2", "r3", "r7", "memory") /* * The clone3() assembler wrapper is based on the clone() wrapper above * and on code from the glibc wrapper at * sysdeps/unix/sysv/linux/arm/clone.S * * For arm it is necessary to change the child stack as on x86_64 as * it seems there are not registers which stay the same over a syscall * like on s390x, ppc64le and aarch64. * * Changing the child stack means that this code has to deal with the * kernel doing stack + stack_size implicitly. 
* * int clone3(struct clone_args *args, size_t size) */ #define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ clone_restore_fn) \ asm volatile( \ "clone3_emul: \n" \ /* Load thread stack pointer */ \ "ldr r1, [%3] \n" \ /* Load thread stack size */ \ "mov r2, %4 \n" \ /* Goto to the end of stack */ \ "add r1, r1, r2 \n" \ /* Load thread function and arguments and push on stack */ \ "mov r2, %6 /* args */ \n" \ "str r2, [r1, #4] /* args */ \n" \ "mov r2, %5 /* function */ \n" \ "str r2, [r1] /* function */ \n" \ "mov r0, %1 /* clone_args */ \n" \ "mov r1, %2 /* size */ \n" \ "mov r7, #"__stringify(__NR_clone3)" \n" \ "svc #0 \n" \ \ "cmp r0, #0 \n" \ "beq thread3_run \n" \ \ "mov %0, r0 \n" \ "b clone3_end \n" \ \ "thread3_run: \n" \ "pop { r1 } \n" \ "pop { r0 } \n" \ "bx r1 \n" \ \ "clone3_end: \n" \ : "=r"(ret) \ : "r"(&clone_args), \ "r"(size), \ "r"(&clone_args.stack), \ "r"(clone_args.stack_size), \ "r"(clone_restore_fn), \ "r"(args) \ : "r0", "r1", "r2", "r7", "memory") #define ARCH_FAIL_CORE_RESTORE \ asm volatile( \ "mov sp, %0 \n" \ "mov r0, #0 \n" \ "bx r0 \n" \ : \ : "r"(ret) \ : "memory") /* clang-format on */ #define arch_map_vdso(map, compat) -1 int restore_gpregs(struct rt_sigframe *f, UserArmRegsEntry *r); int restore_nonsigframe_gpregs(UserArmRegsEntry *r); #define ARCH_HAS_SHMAT_HOOK unsigned long arch_shmat(int shmid, void *shmaddr, int shmflg, unsigned long size); static inline void restore_tls(tls_t *ptls) { asm("mov r7, #15 \n" "lsl r7, #16 \n" "mov r0, #5 \n" "add r7, r0 \n" /* r7 = 0xF005 */ "ldr r0, [%0] \n" "svc #0 \n" : : "r"(ptls) : "r0", "r7"); } static inline void *alloc_compat_syscall_stack(void) { return NULL; } static inline void free_compat_syscall_stack(void *stack32) { } static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; } static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) { return -1; } #endif 
crac-criu-1.5.0/criu/arch/arm/include/asm/thread_pointer.h000066400000000000000000000017751471504326700234310ustar00rootroot00000000000000/* __thread_pointer definition. Generic version. Copyright (C) 2021 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library. If not, see . */ #ifndef _SYS_THREAD_POINTER_H #define _SYS_THREAD_POINTER_H static inline void *__criu_thread_pointer(void) { return __builtin_thread_pointer(); } #endif /* _SYS_THREAD_POINTER_H */ crac-criu-1.5.0/criu/arch/arm/include/asm/types.h000066400000000000000000000013641471504326700215600ustar00rootroot00000000000000#ifndef __CR_ASM_TYPES_H__ #define __CR_ASM_TYPES_H__ #include #include #include "images/core.pb-c.h" #include "page.h" #include "bitops.h" #include "asm/int.h" #include #define core_is_compat(core) false typedef UserArmRegsEntry UserRegsEntry; #define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__ARM #define CORE_THREAD_ARCH_INFO(core) core->ti_arm #define TI_SP(core) ((core)->ti_arm->gpregs->sp) #define TI_IP(core) ((core)->ti_arm->gpregs->ip) static inline void *decode_pointer(u64 v) { return (void *)(u32)v; } static inline u64 encode_pointer(void *p) { return (u32)p; } #define AT_VECTOR_SIZE 40 typedef uint32_t auxv_t; typedef uint32_t tls_t; #endif /* __CR_ASM_TYPES_H__ */ 
crac-criu-1.5.0/criu/arch/arm/include/asm/vdso.h000066400000000000000000000011421471504326700213610ustar00rootroot00000000000000#ifndef __CR_ASM_VDSO_H__ #define __CR_ASM_VDSO_H__ #include "asm/int.h" #include "asm-generic/vdso.h" /* This definition is used in pie/util-vdso.c to initialize the vdso symbol * name string table 'vdso_symbols' * * Poke from kernel file arch/arm/vdso/vdso.lds.S */ #define VDSO_SYMBOL_MAX 2 #define VDSO_SYMBOL_GTOD 1 #define ARCH_VDSO_SYMBOLS_LIST \ const char *aarch_vdso_symbol1 = "__vdso_clock_gettime"; \ const char *aarch_vdso_symbol2 = "__vdso_gettimeofday"; #define ARCH_VDSO_SYMBOLS aarch_vdso_symbol1, aarch_vdso_symbol2, #endif /* __CR_ASM_VDSO_H__ */ crac-criu-1.5.0/criu/arch/arm/pie-cacheflush.c000066400000000000000000000002621471504326700210600ustar00rootroot00000000000000#include /* That's __builtin___clear_cache() to flush CPU cache */ void __clear_cache(void *start, void *end) { sys_cacheflush(start, end, 0); } crac-criu-1.5.0/criu/arch/arm/restorer.c000066400000000000000000000050071471504326700200470ustar00rootroot00000000000000#include #include "restorer.h" #include "asm/restorer.h" #include #include "log.h" #include #include "cpu.h" #include "page.h" #include "common/err.h" int restore_nonsigframe_gpregs(UserArmRegsEntry *r) { return 0; } /* * On ARMv6 CPUs with VIPT caches there are aliasing issues: * if two different cache line indexes correspond to the same physical * address, then changes made to one of the alias might be lost or they * can overwrite each other. To overcome aliasing issues, page coloring * with 4 pages align for shared mappings was introduced (SHMLBA) in kernel. * Which resulted in unique physical address after any tag in cache * (because two upper bits corresponding to page address get unused in tags). * * The problem here is in shmat() syscall: * 1. if shmaddr is NULL then do_shmat() uses arch_get_unmapped_area() * to allocate shared mapping. 
Which checks if CPU cache is VIPT * and only then use SHMLBA alignment. * 2. if shmaddr is specified then do_shmat() checks that address has * SHMLBA alignment regardless to CPU cache aliasing. * * All above means that on non-VIPT CPU (like any ARMv7) we can get * non-SHMLBA, but page-aligned address with shmat(shmid, NULL, shmflg), * but we can't restore it with shmat(shmid, shmaddr, shmflg). * Which results that we can dump e.g., application with shmem aligned * on 2 pages, but can't restore it on the same ARMv7 CPU. * * To workaround this kernel feature, use mremap() on shmem mapping, * allocated with shmat(shmid, NULL, shmflg). */ #define SHMLBA (4UL * PAGE_SIZE) unsigned long arch_shmat(int shmid, void *shmaddr, int shmflg, unsigned long size) { unsigned long smap; /* SHMLBA-aligned, direct call shmat() */ if (!((unsigned long)shmaddr & (SHMLBA - 1))) return sys_shmat(shmid, shmaddr, shmflg); smap = sys_shmat(shmid, NULL, shmflg); if (IS_ERR_VALUE(smap)) { pr_err("shmat() with NULL shmaddr failed: %d\n", (int)smap); return smap; } /* We're lucky! 
*/ if (smap == (unsigned long)shmaddr) return smap; /* Warn ALOUD */ pr_warn("Restoring shmem %p unaligned to SHMLBA.\n", shmaddr); pr_warn("Make sure that you don't migrate shmem from non-VIPT cached CPU to VIPT cached (e.g., ARMv7 -> ARMv6)\n"); pr_warn("Otherwise YOU HAVE A CHANCE OF DATA CORRUPTIONS in writeable shmem\n"); smap = sys_mremap(smap, size, size, MREMAP_FIXED | MREMAP_MAYMOVE, (unsigned long)shmaddr); if (IS_ERR_VALUE(smap)) pr_err("mremap() for shmem failed: %d\n", (int)smap); return smap; } crac-criu-1.5.0/criu/arch/arm/sigframe.c000066400000000000000000000003001471504326700177660ustar00rootroot00000000000000#include "asm/types.h" #include #include "asm/sigframe.h" int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } crac-criu-1.5.0/criu/arch/arm/vdso-pie.c000066400000000000000000000023711471504326700177310ustar00rootroot00000000000000#include #include "asm/types.h" #include #include #include "parasite-vdso.h" #include "log.h" #include "common/bug.h" #ifdef LOG_PREFIX #undef LOG_PREFIX #endif #define LOG_PREFIX "vdso: " static void insert_trampoline(uintptr_t from, uintptr_t to) { struct { uint32_t ldr_pc; uint32_t imm32; uint32_t guards; } __packed jmp = { .ldr_pc = 0xe51ff004, /* ldr pc, [pc, #-4] */ .imm32 = to, .guards = 0xe1200070, /* bkpt 0x0000 */ }; void *iflush_start = (void *)from; void *iflush_end = iflush_start + sizeof(jmp); memcpy((void *)from, &jmp, sizeof(jmp)); __builtin___clear_cache(iflush_start, iflush_end); } int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *sto, struct vdso_symtable *sfrom, bool compat_vdso) { unsigned int i; for (i = 0; i < ARRAY_SIZE(sto->symbols); i++) { uintptr_t from, to; if (vdso_symbol_empty(&sfrom->symbols[i])) continue; pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n", base_from, sfrom->symbols[i].offset, base_to, sto->symbols[i].offset, i); from = base_from + sfrom->symbols[i].offset; to = base_to + 
sto->symbols[i].offset; insert_trampoline(from, to); } return 0; } crac-criu-1.5.0/criu/arch/loongarch64/000077500000000000000000000000001471504326700174035ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/loongarch64/Makefile000066400000000000000000000006041471504326700210430ustar00rootroot00000000000000builtin-name := crtools.built-in.o ccflags-y += -iquote $(obj)/include ccflags-y += -iquote criu/include -iquote include ccflags-y += $(COMPEL_UAPI_INCLUDES) asflags-y += -Wstrict-prototypes asflags-y += -D__ASSEMBLY__ -nostdlib -fomit-frame-pointer asflags-y += -iquote $(obj)/include ldflags-y += -r -z noexecstack obj-y += cpu.o obj-y += crtools.o obj-y += sigframe.o crac-criu-1.5.0/criu/arch/loongarch64/cpu.c000066400000000000000000000004751471504326700203440ustar00rootroot00000000000000#undef LOG_PREFIX #define LOG_PREFIX "cpu: " int cpu_init(void) { return 0; } int cpu_dump_cpuinfo(void) { return 0; } int cpu_validate_cpuinfo(void) { return 0; } int cpuinfo_dump(void) { if (cpu_init()) return -1; if (cpu_dump_cpuinfo()) return -1; return 0; } int cpuinfo_check(void) { return 0; } crac-criu-1.5.0/criu/arch/loongarch64/crtools.c000066400000000000000000000056001471504326700212350ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "types.h" #include "log.h" #include "asm/restorer.h" #include "asm/parasite-syscall.h" #include #include "asm/dump.h" #include "cr_options.h" #include "common/compiler.h" #include "restorer.h" #include "parasite-syscall.h" #include "util.h" #include "cpu.h" #include #include "kerndat.h" #include "protobuf.h" #include "images/core.pb-c.h" #include "images/creds.pb-c.h" #define assign_reg(dst, src, e) (dst)->e = (__typeof__(dst->e))(src)->e int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { int i; CoreEntry *core = x; UserLoongarch64GpregsEntry *gprs = core->ti_loongarch64->gpregs; UserLoongarch64FpregsEntry *fprs = 
core->ti_loongarch64->fpregs; for (i = 0; i < GPR_NUM; i++) assign_reg(gprs, regs, regs[i]); assign_reg(gprs, regs, pc); for (i = 0; i < FPR_NUM; i++) assign_reg(fpregs, fpregs, regs[i]); assign_reg(fprs, fpregs, fcc); assign_reg(fprs, fpregs, fcsr); return 0; } int arch_alloc_thread_info(CoreEntry *core) { ThreadInfoLoongarch64 *ti_loongarch64; UserLoongarch64GpregsEntry *gpregs; UserLoongarch64FpregsEntry *fpregs; ti_loongarch64 = xmalloc(sizeof(*ti_loongarch64)); thread_info_loongarch64__init(ti_loongarch64); core->ti_loongarch64 = ti_loongarch64; gpregs = xmalloc(sizeof(*gpregs)); if (!gpregs) goto err; user_loongarch64_gpregs_entry__init(gpregs); gpregs->n_regs = GPR_NUM; gpregs->regs = xmalloc(GPR_NUM * sizeof(uint64_t)); if (!gpregs->regs) goto err; ti_loongarch64->gpregs = gpregs; fpregs = xmalloc(sizeof(*fpregs)); if (!fpregs) goto err; user_loongarch64_fpregs_entry__init(fpregs); fpregs->n_regs = FPR_NUM; fpregs->regs = xmalloc(FPR_NUM * sizeof(uint64_t)); if (!fpregs->regs) goto err; ti_loongarch64->fpregs = fpregs; return 0; err: return -1; } void arch_free_thread_info(CoreEntry *core) { if (CORE_THREAD_ARCH_INFO(core)) { if (CORE_THREAD_ARCH_INFO(core)->fpregs) { xfree(CORE_THREAD_ARCH_INFO(core)->fpregs->regs); xfree(CORE_THREAD_ARCH_INFO(core)->fpregs); } xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs); xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); xfree(CORE_THREAD_ARCH_INFO(core)); CORE_THREAD_ARCH_INFO(core) = NULL; } } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { fpu_context_t *fpu = RT_SIGFRAME_FPU(sigframe); UserLoongarch64FpregsEntry *fpregs = core->ti_loongarch64->fpregs; memcpy(fpu->regs, fpregs->regs, sizeof(fpu->regs)); fpu->fcc = fpregs->fcc; fpu->fcsr = fpregs->fcsr; return 0; } int restore_gpregs(struct rt_sigframe *sigframe, UserRegsEntry *r) { sigcontext_t *sc = RT_SIGFRAME_SIGCTX(sigframe); memcpy(sc->regs, r->regs, sizeof(sc->regs)); sc->pc = r->pc; return 0; } 
crac-criu-1.5.0/criu/arch/loongarch64/include/000077500000000000000000000000001471504326700210265ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/loongarch64/include/asm/000077500000000000000000000000001471504326700216065ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/loongarch64/include/asm/dump.h000066400000000000000000000006221471504326700227240ustar00rootroot00000000000000#ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); static inline void core_put_tls(CoreEntry *core, tls_t tls) { core->ti_loongarch64->tls = tls; } #define get_task_futex_robust_list_compat(pid, info) -1 #endif crac-criu-1.5.0/criu/arch/loongarch64/include/asm/int.h000066400000000000000000000001571471504326700225540ustar00rootroot00000000000000#ifndef __CR_ASM_INT_H__ #define __CR_ASM_INT_H__ #include "asm-generic/int.h" #endif /* __CR_ASM_INT_H__ */ crac-criu-1.5.0/criu/arch/loongarch64/include/asm/kerndat.h000066400000000000000000000002341471504326700234060ustar00rootroot00000000000000#ifndef __CR_ASM_KERNDAT_H__ #define __CR_ASM_KERNDAT_H__ #define kdat_compatible_cr() 0 #define kdat_can_map_vdso() 0 #endif /* __CR_ASM_KERNDAT_H__ */ crac-criu-1.5.0/criu/arch/loongarch64/include/asm/parasite-syscall.h000066400000000000000000000001521471504326700252350ustar00rootroot00000000000000#ifndef __CR_ASM_PARASITE_SYSCALL_H__ #define __CR_ASM_PARASITE_SYSCALL_H__ struct parasite_ctl; #endif crac-criu-1.5.0/criu/arch/loongarch64/include/asm/parasite.h000066400000000000000000000002721471504326700235700ustar00rootroot00000000000000#ifndef __ASM_PARASITE_H__ #define __ASM_PARASITE_H__ static inline void arch_get_tls(tls_t *ptls) { tls_t tls; asm volatile("or %0, $zero, $tp" : "=r"(tls)); *ptls = tls; } #endif 
crac-criu-1.5.0/criu/arch/loongarch64/include/asm/restore.h000066400000000000000000000016451471504326700234500ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORE_H__ #define __CR_ASM_RESTORE_H__ #include "asm/restorer.h" #include "images/core.pb-c.h" /* clang-format off */ #define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, task_args) \ ({ \ uint64_t save_sp; \ asm volatile("or %0, $zero, $sp" : "=r"(save_sp) : :"memory"); \ asm volatile( \ "or $a0, $zero, %2 \n" \ "or $sp, $zero, %0 \n" \ "jirl $ra, %1, 0 \n" \ : \ : "r"(new_sp & ~15), \ "r"(restore_task_exec_start), \ "r"(task_args) \ : "$a0", "memory"); \ asm volatile("or $sp, $zero, %0" : : "r"(save_sp) : "memory"); \ }) /* clang-format on */ static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) { *ptls = pcore->ti_loongarch64->tls; } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); #endif crac-criu-1.5.0/criu/arch/loongarch64/include/asm/restorer.h000066400000000000000000000052311471504326700236250ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ #include "asm/types.h" #include #include "images/core.pb-c.h" #include #include /* clang-format off */ #define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ thread_args, clone_restore_fn) \ asm volatile( \ "clone_emul: \n" \ "ld.d $a1, %2 \n" \ "addi.d $a1, $a1, -16 \n" \ "st.d %5, $a1, 0 \n" \ "st.d %6, $a1, 8 \n" \ "or $a0, $zero, %1 \n" \ "or $a2, $zero, %3 \n" \ "or $a3, $zero, %4 \n" \ "ori $a7, $zero, "__stringify(__NR_clone)" \n" \ "syscall 0 \n" \ \ "beqz $a0, thread_run \n" \ \ "or %0, $zero, $a0 \n" \ "b clone_end \n" \ \ "thread_run: \n" \ "ld.d $a1, $sp, 0 \n" \ "ld.d $a0, $sp, 8 \n" \ "jirl $ra, $a1, 0 \n" \ \ "clone_end: \n" \ : "=r"(ret) \ : "r"(clone_flags), \ "ZB"(new_sp), \ "r"(&parent_tid), \ "r"(&thread_args[i].pid), \ "r"(&clone_restore_fn), \ "r"(&thread_args[i]) \ : "$a0", "$a1", "$a2", "$a3", "$a7", "memory") #define RUN_CLONE3_RESTORE_FN(ret, clone_args, 
size, args, \ clone_restore_fn) \ asm volatile( \ "clone3_emul: \n" \ "or $a0, $zero, %1 \n" \ "or $a1, $zero, %2 \n" \ "or $a2, $zero, %3 \n" \ "or $a3, $zero, %4 \n" \ "ori $a7, $zero, "__stringify(__NR_clone3)" \n" \ "syscall 0 \n" \ \ "beqz $a0, clone3_thread_run \n" \ \ "or %0, $zero, $a0 \n" \ "b clone3_end \n" \ \ "clone3_thread_run: \n" \ "or $a0, $zero, $a3 \n" \ "jirl $ra, $a2, 0 \n" \ "clone3_end: \n" \ : "=r"(ret) \ : "r"(&clone_args), \ "r"(size), \ "r"(clone_restore_fn), \ "r"(args) \ : "$a0", "$a1", "$a2", "$a3", "$a7", "memory") /* clang-format on */ static inline void restore_tls(tls_t *ptls) { asm volatile("or $tp, $zero, %0" : : "r"(*ptls)); } static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; } static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) { return -1; } static inline void *alloc_compat_syscall_stack(void) { return NULL; } static inline void free_compat_syscall_stack(void *stack32) { } int restore_gpregs(struct rt_sigframe *f, UserLoongarch64GpregsEntry *r); int restore_nonsigframe_gpregs(UserLoongarch64GpregsEntry *r); #define arch_map_vdso(map, compat) -1 #endif crac-criu-1.5.0/criu/arch/loongarch64/include/asm/thread_pointer.h000066400000000000000000000017751471504326700250000ustar00rootroot00000000000000/* __thread_pointer definition. Generic version. Copyright (C) 2021 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with the GNU C Library. If not, see . */ #ifndef _SYS_THREAD_POINTER_H #define _SYS_THREAD_POINTER_H static inline void *__criu_thread_pointer(void) { return __builtin_thread_pointer(); } #endif /* _SYS_THREAD_POINTER_H */ crac-criu-1.5.0/criu/arch/loongarch64/include/asm/types.h000066400000000000000000000014551471504326700231300ustar00rootroot00000000000000#ifndef __CR_ASM_TYPES_H__ #define __CR_ASM_TYPES_H__ #include #include #include "page.h" #include "bitops.h" #include "asm/int.h" #include "images/core.pb-c.h" #include #define core_is_compat(core) false #define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__LOONGARCH64 #define CORE_THREAD_ARCH_INFO(core) core->ti_loongarch64 #define TI_SP(core) ((core)->ti_loongarch64->gpregs->regs[4]) #define TI_IP(core) ((core)->ti_loongarch64->gpregs->pc) typedef UserLoongarch64GpregsEntry UserRegsEntry; static inline uint64_t encode_pointer(void *p) { return (uint64_t)p; } static inline void *decode_pointer(uint64_t v) { return (void *)v; } #define AT_VECTOR_SIZE 44 typedef uint64_t auxv_t; typedef uint64_t tls_t; #endif /* __CR_ASM_TYPES_H__ */ crac-criu-1.5.0/criu/arch/loongarch64/include/asm/vdso.h000066400000000000000000000015331471504326700227340ustar00rootroot00000000000000#ifndef __CR_ASM_VDSO_H__ #define __CR_ASM_VDSO_H__ #include "asm/int.h" #include "asm-generic/vdso.h" /* This definition is used in pie/util-vdso.c to initialize the vdso symbol * name string table 'vdso_symbols' */ /* * This is a minimal amount of symbols * we should support at the moment. 
*/ #define VDSO_SYMBOL_MAX 5 #define VDSO_SYMBOL_GTOD 3 #define ARCH_VDSO_SYMBOLS_LIST \ const char *aarch_vdso_symbol1 = "__vdso_getcpu"; \ const char *aarch_vdso_symbol2 = "__vdso_clock_getres"; \ const char *aarch_vdso_symbol3 = "__vdso_clock_gettime"; \ const char *aarch_vdso_symbol4 = "__vdso_gettimeofday"; \ const char *aarch_vdso_symbol5 = "__vdso_rt_sigreturn"; #define ARCH_VDSO_SYMBOLS \ aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4, aarch_vdso_symbol5 #endif crac-criu-1.5.0/criu/arch/loongarch64/restorer.c000066400000000000000000000003721471504326700214160ustar00rootroot00000000000000#include #include "restorer.h" #include "asm/restorer.h" #include #include #include "log.h" #include "cpu.h" int restore_nonsigframe_gpregs(UserLoongarch64GpregsEntry *r) { return 0; } crac-criu-1.5.0/criu/arch/loongarch64/sigframe.c000066400000000000000000000003501471504326700213420ustar00rootroot00000000000000#include #include #include "asm/sigframe.h" #include "asm/types.h" #include "log.h" #include int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } crac-criu-1.5.0/criu/arch/loongarch64/vdso-pie.c000066400000000000000000000024031471504326700212740ustar00rootroot00000000000000#include #include "asm/types.h" #include #include #include "parasite-vdso.h" #include "log.h" #include "common/bug.h" #ifdef LOG_PREFIX #undef LOG_PREFIX #endif #define LOG_PREFIX "vdso: " static void insert_trampoline(uintptr_t from, uintptr_t to) { struct { uint32_t pcaddi; uint32_t ldptr; uint32_t jirl; uint32_t guards; uint64_t imm64; } __packed jmp = { .pcaddi = 0x18000095, /* pcaddi $x, 4 */ .ldptr = 0x260002b5, /* ldptr.d $x, $x, 0 */ .jirl = 0x4c0002a0, /* jirl $zero, $x, 0 */ .guards = 0x002a0000, /* break 0 */ .imm64 = to, }; memcpy((void *)from, &jmp, sizeof(jmp)); } int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *sto, struct vdso_symtable *sfrom, bool 
compat_vdso) { unsigned int i; unsigned long from, to; for (i = 0; i < ARRAY_SIZE(sto->symbols); i++) { if (vdso_symbol_empty(&sfrom->symbols[i])) continue; pr_debug("br: %lx/%lx -> %lx/%lx (index %d)\n", base_from, sfrom->symbols[i].offset, base_to, sto->symbols[i].offset, i); from = base_from + sfrom->symbols[i].offset; to = base_to + sto->symbols[i].offset; insert_trampoline(from, to); } return 0; } crac-criu-1.5.0/criu/arch/mips/000077500000000000000000000000001471504326700162255ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/mips/Makefile000066400000000000000000000006041471504326700176650ustar00rootroot00000000000000builtin-name := crtools.built-in.o ccflags-y += -iquote $(obj)/include ccflags-y += -iquote criu/include -iquote include ccflags-y += $(COMPEL_UAPI_INCLUDES) asflags-y += -Wstrict-prototypes asflags-y += -D__ASSEMBLY__ -nostdlib -fomit-frame-pointer asflags-y += -iquote $(obj)/include ldflags-y += -r -z noexecstack obj-y += cpu.o obj-y += crtools.o obj-y += sigframe.o crac-criu-1.5.0/criu/arch/mips/cpu.c000066400000000000000000000013401471504326700171560ustar00rootroot00000000000000#include #include #include #include #include #include #include "bitops.h" #include "asm/types.h" #include "asm/cpu.h" #include #include #include "common/compiler.h" #include "cr_options.h" #include "image.h" #include "util.h" #include "log.h" #include "cpu.h" #include "protobuf.h" #include "images/cpuinfo.pb-c.h" #undef LOG_PREFIX #define LOG_PREFIX "cpu: " int cpu_init(void) { return 0; } int cpu_dump_cpuinfo(void) { return 0; } int cpu_validate_cpuinfo(void) { return 0; } int cpuinfo_dump(void) { if (cpu_init()) return -1; if (cpu_dump_cpuinfo()) return -1; return 0; } int cpuinfo_check(void) { return 0; } crac-criu-1.5.0/criu/arch/mips/crtools.c000066400000000000000000000201301471504326700200520ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "types.h" #include "log.h" #include 
"asm/parasite-syscall.h" #include "asm/restorer.h" #include #include "asm/dump.h" #include "cr_options.h" #include "common/compiler.h" #include "restorer.h" #include "parasite-syscall.h" #include "util.h" #include "cpu.h" #include #include "kerndat.h" #include "protobuf.h" #include "images/core.pb-c.h" #include "images/creds.pb-c.h" int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; /* Save the MIPS CPU state */ core->ti_mips->gpregs->r0 = regs->regs[0]; core->ti_mips->gpregs->r1 = regs->regs[1]; core->ti_mips->gpregs->r2 = regs->regs[2]; core->ti_mips->gpregs->r3 = regs->regs[3]; core->ti_mips->gpregs->r4 = regs->regs[4]; core->ti_mips->gpregs->r5 = regs->regs[5]; core->ti_mips->gpregs->r6 = regs->regs[6]; core->ti_mips->gpregs->r7 = regs->regs[7]; core->ti_mips->gpregs->r8 = regs->regs[8]; core->ti_mips->gpregs->r9 = regs->regs[9]; core->ti_mips->gpregs->r10 = regs->regs[10]; core->ti_mips->gpregs->r11 = regs->regs[11]; core->ti_mips->gpregs->r12 = regs->regs[12]; core->ti_mips->gpregs->r13 = regs->regs[13]; core->ti_mips->gpregs->r14 = regs->regs[14]; core->ti_mips->gpregs->r15 = regs->regs[15]; core->ti_mips->gpregs->r16 = regs->regs[16]; core->ti_mips->gpregs->r17 = regs->regs[17]; core->ti_mips->gpregs->r18 = regs->regs[18]; core->ti_mips->gpregs->r19 = regs->regs[19]; core->ti_mips->gpregs->r20 = regs->regs[20]; core->ti_mips->gpregs->r21 = regs->regs[21]; core->ti_mips->gpregs->r22 = regs->regs[22]; core->ti_mips->gpregs->r23 = regs->regs[23]; core->ti_mips->gpregs->r24 = regs->regs[24]; core->ti_mips->gpregs->r25 = regs->regs[25]; core->ti_mips->gpregs->r26 = regs->regs[26]; core->ti_mips->gpregs->r27 = regs->regs[27]; core->ti_mips->gpregs->r28 = regs->regs[28]; core->ti_mips->gpregs->r29 = regs->regs[29]; core->ti_mips->gpregs->r30 = regs->regs[30]; core->ti_mips->gpregs->r31 = regs->regs[31]; core->ti_mips->gpregs->lo = regs->lo; core->ti_mips->gpregs->hi = regs->hi; core->ti_mips->gpregs->cp0_epc 
= regs->cp0_epc; core->ti_mips->gpregs->cp0_badvaddr = regs->cp0_badvaddr; core->ti_mips->gpregs->cp0_status = regs->cp0_status; core->ti_mips->gpregs->cp0_cause = regs->cp0_cause; core->ti_mips->fpregs->r0 = fpregs->regs[0]; core->ti_mips->fpregs->r1 = fpregs->regs[1]; core->ti_mips->fpregs->r2 = fpregs->regs[2]; core->ti_mips->fpregs->r3 = fpregs->regs[3]; core->ti_mips->fpregs->r4 = fpregs->regs[4]; core->ti_mips->fpregs->r5 = fpregs->regs[5]; core->ti_mips->fpregs->r6 = fpregs->regs[6]; core->ti_mips->fpregs->r7 = fpregs->regs[7]; core->ti_mips->fpregs->r8 = fpregs->regs[8]; core->ti_mips->fpregs->r9 = fpregs->regs[9]; core->ti_mips->fpregs->r10 = fpregs->regs[10]; core->ti_mips->fpregs->r11 = fpregs->regs[11]; core->ti_mips->fpregs->r12 = fpregs->regs[12]; core->ti_mips->fpregs->r13 = fpregs->regs[13]; core->ti_mips->fpregs->r14 = fpregs->regs[14]; core->ti_mips->fpregs->r15 = fpregs->regs[15]; core->ti_mips->fpregs->r16 = fpregs->regs[16]; core->ti_mips->fpregs->r17 = fpregs->regs[17]; core->ti_mips->fpregs->r18 = fpregs->regs[18]; core->ti_mips->fpregs->r19 = fpregs->regs[19]; core->ti_mips->fpregs->r20 = fpregs->regs[20]; core->ti_mips->fpregs->r21 = fpregs->regs[21]; core->ti_mips->fpregs->r22 = fpregs->regs[22]; core->ti_mips->fpregs->r23 = fpregs->regs[23]; core->ti_mips->fpregs->r24 = fpregs->regs[24]; core->ti_mips->fpregs->r25 = fpregs->regs[25]; core->ti_mips->fpregs->r26 = fpregs->regs[26]; core->ti_mips->fpregs->r27 = fpregs->regs[27]; core->ti_mips->fpregs->r28 = fpregs->regs[28]; core->ti_mips->fpregs->r29 = fpregs->regs[29]; core->ti_mips->fpregs->r30 = fpregs->regs[30]; core->ti_mips->fpregs->r31 = fpregs->regs[31]; core->ti_mips->fpregs->fpu_fcr31 = fpregs->fpu_fcr31; core->ti_mips->fpregs->fpu_id = fpregs->fpu_id; return 0; } int arch_alloc_thread_info(CoreEntry *core) { ThreadInfoMips *ti_mips; UserMipsRegsEntry *gpregs; UserMipsFpregsEntry *fpregs; ti_mips = xmalloc(sizeof(*ti_mips)); if (!ti_mips) goto err; thread_info_mips__init(ti_mips); 
core->ti_mips = ti_mips; gpregs = xmalloc(sizeof(*gpregs)); if (!gpregs) { xfree(ti_mips); goto err; } user_mips_regs_entry__init(gpregs); ti_mips->gpregs = gpregs; fpregs = xmalloc(sizeof(*fpregs)); if (!fpregs) { xfree(ti_mips); xfree(gpregs); goto err; } user_mips_fpregs_entry__init(fpregs); ti_mips->fpregs = fpregs; return 0; err: return -1; } void arch_free_thread_info(CoreEntry *core) { if (!core->ti_mips) return; if (core->ti_mips->gpregs) xfree(core->ti_mips->gpregs); if (core->ti_mips->fpregs) xfree(core->ti_mips->fpregs); xfree(core->ti_mips); } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { struct rt_sigframe *f = sigframe; UserMipsFpregsEntry *r = core->ti_mips->fpregs; f->rs_uc.uc_mcontext.sc_fpregs[0] = r->r0; f->rs_uc.uc_mcontext.sc_fpregs[1] = r->r1; f->rs_uc.uc_mcontext.sc_fpregs[2] = r->r2; f->rs_uc.uc_mcontext.sc_fpregs[3] = r->r3; f->rs_uc.uc_mcontext.sc_fpregs[4] = r->r4; f->rs_uc.uc_mcontext.sc_fpregs[5] = r->r5; f->rs_uc.uc_mcontext.sc_fpregs[6] = r->r6; f->rs_uc.uc_mcontext.sc_fpregs[7] = r->r7; f->rs_uc.uc_mcontext.sc_fpregs[8] = r->r8; f->rs_uc.uc_mcontext.sc_fpregs[9] = r->r9; f->rs_uc.uc_mcontext.sc_fpregs[10] = r->r10; f->rs_uc.uc_mcontext.sc_fpregs[11] = r->r11; f->rs_uc.uc_mcontext.sc_fpregs[12] = r->r12; f->rs_uc.uc_mcontext.sc_fpregs[13] = r->r13; f->rs_uc.uc_mcontext.sc_fpregs[14] = r->r14; f->rs_uc.uc_mcontext.sc_fpregs[15] = r->r15; f->rs_uc.uc_mcontext.sc_fpregs[16] = r->r16; f->rs_uc.uc_mcontext.sc_fpregs[17] = r->r17; f->rs_uc.uc_mcontext.sc_fpregs[18] = r->r18; f->rs_uc.uc_mcontext.sc_fpregs[19] = r->r19; f->rs_uc.uc_mcontext.sc_fpregs[20] = r->r20; f->rs_uc.uc_mcontext.sc_fpregs[21] = r->r21; f->rs_uc.uc_mcontext.sc_fpregs[22] = r->r22; f->rs_uc.uc_mcontext.sc_fpregs[23] = r->r23; f->rs_uc.uc_mcontext.sc_fpregs[24] = r->r24; f->rs_uc.uc_mcontext.sc_fpregs[25] = r->r25; f->rs_uc.uc_mcontext.sc_fpregs[26] = r->r26; f->rs_uc.uc_mcontext.sc_fpregs[27] = r->r27; f->rs_uc.uc_mcontext.sc_fpregs[28] = r->r28; 
f->rs_uc.uc_mcontext.sc_fpregs[29] = r->r29; f->rs_uc.uc_mcontext.sc_fpregs[30] = r->r30; f->rs_uc.uc_mcontext.sc_fpregs[31] = r->r31; return 0; } int restore_gpregs(struct rt_sigframe *f, UserMipsRegsEntry *r) { f->rs_uc.uc_mcontext.sc_regs[0] = r->r0; f->rs_uc.uc_mcontext.sc_regs[1] = r->r1; f->rs_uc.uc_mcontext.sc_regs[2] = r->r2; f->rs_uc.uc_mcontext.sc_regs[3] = r->r3; f->rs_uc.uc_mcontext.sc_regs[4] = r->r4; f->rs_uc.uc_mcontext.sc_regs[5] = r->r5; f->rs_uc.uc_mcontext.sc_regs[6] = r->r6; f->rs_uc.uc_mcontext.sc_regs[7] = r->r7; f->rs_uc.uc_mcontext.sc_regs[8] = r->r8; f->rs_uc.uc_mcontext.sc_regs[9] = r->r9; f->rs_uc.uc_mcontext.sc_regs[10] = r->r10; f->rs_uc.uc_mcontext.sc_regs[11] = r->r11; f->rs_uc.uc_mcontext.sc_regs[12] = r->r12; f->rs_uc.uc_mcontext.sc_regs[13] = r->r13; f->rs_uc.uc_mcontext.sc_regs[14] = r->r14; f->rs_uc.uc_mcontext.sc_regs[15] = r->r15; f->rs_uc.uc_mcontext.sc_regs[16] = r->r16; f->rs_uc.uc_mcontext.sc_regs[17] = r->r17; f->rs_uc.uc_mcontext.sc_regs[18] = r->r18; f->rs_uc.uc_mcontext.sc_regs[19] = r->r19; f->rs_uc.uc_mcontext.sc_regs[20] = r->r20; f->rs_uc.uc_mcontext.sc_regs[21] = r->r21; f->rs_uc.uc_mcontext.sc_regs[22] = r->r22; f->rs_uc.uc_mcontext.sc_regs[23] = r->r23; f->rs_uc.uc_mcontext.sc_regs[24] = r->r24; f->rs_uc.uc_mcontext.sc_regs[25] = r->r25; f->rs_uc.uc_mcontext.sc_regs[26] = r->r26; f->rs_uc.uc_mcontext.sc_regs[27] = r->r27; f->rs_uc.uc_mcontext.sc_regs[28] = r->r28; f->rs_uc.uc_mcontext.sc_regs[29] = r->r29; f->rs_uc.uc_mcontext.sc_regs[30] = r->r30; f->rs_uc.uc_mcontext.sc_regs[31] = r->r31; f->rs_uc.uc_mcontext.sc_mdlo = r->lo; f->rs_uc.uc_mcontext.sc_mdhi = r->hi; f->rs_uc.uc_mcontext.sc_pc = r->cp0_epc; return 0; } int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info) { return 0; } 
crac-criu-1.5.0/criu/arch/mips/include/000077500000000000000000000000001471504326700176505ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/mips/include/asm/000077500000000000000000000000001471504326700204305ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/mips/include/asm/dump.h000066400000000000000000000006421471504326700215500ustar00rootroot00000000000000#ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); static inline void core_put_tls(CoreEntry *core, tls_t tls) { core->ti_mips->tls = tls; } #endif crac-criu-1.5.0/criu/arch/mips/include/asm/int.h000066400000000000000000000001571471504326700213760ustar00rootroot00000000000000#ifndef __CR_ASM_INT_H__ #define __CR_ASM_INT_H__ #include "asm-generic/int.h" #endif /* __CR_ASM_INT_H__ */ crac-criu-1.5.0/criu/arch/mips/include/asm/kerndat.h000066400000000000000000000002341471504326700222300ustar00rootroot00000000000000#ifndef __CR_ASM_KERNDAT_H__ #define __CR_ASM_KERNDAT_H__ #define kdat_compatible_cr() 0 #define kdat_can_map_vdso() 0 #endif /* __CR_ASM_KERNDAT_H__ */ crac-criu-1.5.0/criu/arch/mips/include/asm/parasite-syscall.h000066400000000000000000000002021471504326700240530ustar00rootroot00000000000000#ifndef __CR_ASM_PARASITE_SYSCALL_H__ #define __CR_ASM_PARASITE_SYSCALL_H__ #include "asm/types.h" struct parasite_ctl; #endif crac-criu-1.5.0/criu/arch/mips/include/asm/parasite.h000066400000000000000000000002251471504326700224100ustar00rootroot00000000000000#ifndef __ASM_PARASITE_H__ #define __ASM_PARASITE_H__ static inline void arch_get_tls(tls_t *ptls) { asm("rdhwr %0, $29" : "=r"(*ptls)); } #endif crac-criu-1.5.0/criu/arch/mips/include/asm/restore.h000066400000000000000000000012571471504326700222710ustar00rootroot00000000000000#ifndef 
__CR_ASM_RESTORE_H__ #define __CR_ASM_RESTORE_H__ #include "asm/restorer.h" #include "images/core.pb-c.h" /* clang-format off */ #define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, task_args) \ asm volatile( \ "move $4, %0 \n" \ "move $25, %1 \n" \ "move $5, %2 \n" \ "move $29, $5 \n" \ "jalr $25 \n" \ "nop \n" \ : \ :"r"(task_args),"r"(restore_task_exec_start), \ "g"(new_sp) \ : "$25", "$4","$5") /* clang-format on */ static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) { *ptls = pcore->ti_mips->tls; } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); #endif crac-criu-1.5.0/criu/arch/mips/include/asm/restorer.h000066400000000000000000000050731471504326700224530ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ #include "asm/types.h" #include #include "images/core.pb-c.h" #include #include static inline void restore_tls(tls_t *ptls) { /* clang-format off */ asm volatile("move $4, %0 \n" "li $2, " __stringify(__NR_set_thread_area) " \n" "syscall \n" : : "r"(*ptls) : "$4", "$2", "memory"); /* clang-format on */ } static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; } static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) { return -1; } /* clang-format off */ #define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ thread_args, clone_restore_fn) \ asm volatile( \ "ld $5,%2 \n" /* a1 = new_sp */ \ "dsubu $5,32 \n" \ "sd %5,0($5) \n" \ "sd %6,8($5) \n" \ "sd %1,16($5) \n" \ "move $4,%1 \n" /* a0=flags */ \ "move $6,%3 \n" /* a2=parent_tid */ \ "li $7,0 \n" /* a3 = tls is 0 */ \ "move $8,%4 \n" /* a4 = child_tid */ \ "li $2, "__stringify(__NR_clone)" \n" \ "syscall \n" /* syscall */ \ "sync \n" \ "bnez $7,err \n" \ "nop \n" \ "beqz $2,thread_start \n" \ "nop \n" \ "move %0,$2 \n" \ "b end \n" \ "err:break \n" \ "thread_start: \n" \ "ld $25,0($29) \n" \ "ld $4,8($29) \n" \ "jal $25 \n" \ "nop \n" \ "end: \n" \ : "=r"(ret) \ : 
"r"(clone_flags), \ "m"(new_sp), \ "r"(&parent_tid), \ "r"(&thread_args[i].pid), \ "r"(clone_restore_fn), \ "r"(&thread_args[i]) \ :"$2","$4","$5","$6","$7","$8","$25","memory") #define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ clone_restore_fn) do { \ pr_err("This architecture does not support clone3() with set_tid, yet!\n"); \ ret = -1; \ } while (0) /* clang-format on */ #define kdat_compatible_cr() 0 #define arch_map_vdso(map, compat) -1 static inline void *alloc_compat_syscall_stack(void) { return NULL; } static inline void free_compat_syscall_stack(void *stack32) { } int restore_gpregs(struct rt_sigframe *f, UserMipsRegsEntry *r); int restore_nonsigframe_gpregs(UserMipsRegsEntry *r); #define ARCH_HAS_SHMAT_HOOK unsigned long arch_shmat(int shmid, void *shmaddr, int shmflg, unsigned long size); #endif crac-criu-1.5.0/criu/arch/mips/include/asm/syscall32.h000066400000000000000000000020551471504326700224220ustar00rootroot00000000000000#ifndef __CR_SYSCALL32_H__ #define __CR_SYSCALL32_H__ extern long sys_socket(int domain, int type, int protocol); extern long sys_connect(int sockfd, struct sockaddr *addr, int addrlen); extern long sys_sendto(int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len); extern long sys_recvfrom(int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len); extern long sys_sendmsg(int sockfd, const struct msghdr *msg, int flags); extern long sys_recvmsg(int sockfd, struct msghdr *msg, int flags); extern long sys_shutdown(int sockfd, int how); extern long sys_bind(int sockfd, const struct sockaddr *addr, int addrlen); extern long sys_setsockopt(int sockfd, int level, int optname, const void *optval, unsigned int optlen); extern long sys_getsockopt(int sockfd, int level, int optname, const void *optval, unsigned int *optlen); extern long sys_shmat(int shmid, void *shmaddr, int shmflag); extern long sys_pread(unsigned int fd, char *ubuf, u32 count, u64 pos); 
#endif /* __CR_SYSCALL32_H__ */ crac-criu-1.5.0/criu/arch/mips/include/asm/thread_pointer.h000066400000000000000000000017751471504326700236220ustar00rootroot00000000000000/* __thread_pointer definition. Generic version. Copyright (C) 2021 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library. If not, see . */ #ifndef _SYS_THREAD_POINTER_H #define _SYS_THREAD_POINTER_H static inline void *__criu_thread_pointer(void) { return __builtin_thread_pointer(); } #endif /* _SYS_THREAD_POINTER_H */ crac-criu-1.5.0/criu/arch/mips/include/asm/types.h000066400000000000000000000013141471504326700217440ustar00rootroot00000000000000#ifndef __CR_ASM_TYPES_H__ #define __CR_ASM_TYPES_H__ #include #include #include "page.h" #include "bitops.h" #include "asm/int.h" #include #include "images/core.pb-c.h" #define core_is_compat(core) false #define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__MIPS #define CORE_THREAD_ARCH_INFO(core) core->ti_mips #define TI_IP(core) ((core)->ti_mips->gpregs->cp0_epc) typedef UserMipsRegsEntry UserRegsEntry; static inline u64 encode_pointer(void *p) { return (u64)p; } static inline void *decode_pointer(u64 v) { return (void *)v; } #define AT_VECTOR_SIZE 44 typedef uint64_t auxv_t; typedef unsigned long tls_t; #endif /* __CR_ASM_TYPES_H__ */ 
crac-criu-1.5.0/criu/arch/mips/include/asm/vdso.h000066400000000000000000000013201471504326700215500ustar00rootroot00000000000000#ifndef __CR_ASM_VDSO_H__ #define __CR_ASM_VDSO_H__ #include "asm/int.h" #include "asm-generic/vdso.h" /* This definition is used in pie/util-vdso.c to initialize the vdso symbol * name string table 'vdso_symbols' */ /* * This is a minimal amount of symbols * we should support at the moment. */ #define VDSO_SYMBOL_MAX 3 #define VDSO_SYMBOL_GTOD 0 #define ARCH_VDSO_SYMBOLS_LIST \ const char *aarch_vdso_symbol1 = "__vdso_clock_gettime"; \ const char *aarch_vdso_symbol2 = "__vdso_gettimeofday"; \ const char *aarch_vdso_symbol3 = "__vdso_clock_getres"; #define ARCH_VDSO_SYMBOLS aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, #endif /* __CR_ASM_VDSO_H__ */ crac-criu-1.5.0/criu/arch/mips/restorer.c000066400000000000000000000024031471504326700202350ustar00rootroot00000000000000#include #include "types.h" #include "restorer.h" #include "asm/restorer.h" #include #include #include #include #include "log.h" #include "cpu.h" int restore_nonsigframe_gpregs(UserMipsRegsEntry *r) { return 0; } #define SHMLBA 0x40000 unsigned long arch_shmat(int shmid, void *shmaddr, int shmflg, unsigned long size) { unsigned long smap; /* SHMLBA-aligned, direct call shmat() */ if (!((unsigned long)shmaddr & (SHMLBA - 1))) return sys_shmat(shmid, shmaddr, shmflg); smap = sys_shmat(shmid, NULL, shmflg); if (IS_ERR_VALUE(smap)) { pr_err("shmat() with NULL shmaddr failed: %d\n", (int)smap); return smap; } /* We're lucky! 
*/ if (smap == (unsigned long)shmaddr) return smap; /* Warn ALOUD */ pr_warn("Restoring shmem %p unaligned to SHMLBA.\n", shmaddr); pr_warn("Make sure that you don't migrate shmem from non-VIPT cached CPU to VIPT cached \n"); pr_warn("Otherwise YOU HAVE A CHANCE OF DATA CORRUPTIONS in writeable shmem\n"); smap = sys_mremap(smap, size, size, MREMAP_FIXED | MREMAP_MAYMOVE, (unsigned long)shmaddr); if (IS_ERR_VALUE(smap)) pr_err("mremap() for shmem failed: %d\n", (int)smap); return smap; } crac-criu-1.5.0/criu/arch/mips/sigaction_compat.c000066400000000000000000000005441471504326700217170ustar00rootroot00000000000000#include "log.h" #include "asm/restorer.h" #include #include "asm/compat.h" #include #ifdef CR_NOGLIBC #include #endif #include "cpu.h" extern char restore_rt_sigaction; int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act) { return 0; } crac-criu-1.5.0/criu/arch/mips/sigframe.c000066400000000000000000000003501471504326700201640ustar00rootroot00000000000000#include #include #include "asm/sigframe.h" #include "asm/types.h" #include "log.h" #include int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } crac-criu-1.5.0/criu/arch/mips/vdso-pie.c000066400000000000000000000023371471504326700201240ustar00rootroot00000000000000#include #include "asm/types.h" #include #include #include "parasite-vdso.h" #include "log.h" #include "common/bug.h" #ifdef LOG_PREFIX #undef LOG_PREFIX #endif #define LOG_PREFIX "vdso: " static void insert_trampoline(uintptr_t from, uintptr_t to) { struct { uint32_t ldr_pc; uint32_t imm32; uint32_t guards; } __packed jmp = { .ldr_pc = 0x1000fffe, /* b -4 */ .imm32 = to, .guards = 0x0000000d, /* break */ }; void *iflush_start = (void *)from; void *iflush_end = iflush_start + sizeof(jmp); memcpy((void *)from, &jmp, sizeof(jmp)); sys_cacheflush(iflush_start, sizeof(jmp), 0); } int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct 
vdso_symtable *sto, struct vdso_symtable *sfrom, bool compat_vdso) { unsigned int i; for (i = 0; i < ARRAY_SIZE(sto->symbols); i++) { uintptr_t from, to; if (vdso_symbol_empty(&sfrom->symbols[i])) continue; pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n", base_from, sfrom->symbols[i].offset, base_to, sto->symbols[i].offset, i); from = base_from + sfrom->symbols[i].offset; to = base_to + sto->symbols[i].offset; insert_trampoline(from, to); } return 0; } crac-criu-1.5.0/criu/arch/ppc64/000077500000000000000000000000001471504326700162115ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/ppc64/Makefile000066400000000000000000000001631471504326700176510ustar00rootroot00000000000000builtin-name := crtools.built-in.o ldflags-y += -r obj-y += cpu.o obj-y += crtools.o obj-y += sigframe.o crac-criu-1.5.0/criu/arch/ppc64/cpu.c000066400000000000000000000066631471504326700171570ustar00rootroot00000000000000#undef LOG_PREFIX #define LOG_PREFIX "cpu: " #include #include #include #include "asm/types.h" #include "cr_options.h" #include "image.h" #include "util.h" #include "log.h" #include "cpu.h" #include "protobuf.h" #include "images/cpuinfo.pb-c.h" static compel_cpuinfo_t rt_cpuinfo; #ifdef __LITTLE_ENDIAN__ #define CURRENT_ENDIANNESS CPUINFO_PPC64_ENTRY__ENDIANNESS__LITTLEENDIAN #else #define CURRENT_ENDIANNESS CPUINFO_PPC64_ENTRY__ENDIANESS__BIGENDIAN #endif int cpu_init(void) { return compel_cpuid(&rt_cpuinfo); } int cpu_dump_cpuinfo(void) { CpuinfoEntry cpu_info = CPUINFO_ENTRY__INIT; CpuinfoPpc64Entry cpu_ppc64_info = CPUINFO_PPC64_ENTRY__INIT; CpuinfoPpc64Entry *cpu_ppc64_info_ptr = &cpu_ppc64_info; struct cr_img *img; int ret = -1; img = open_image(CR_FD_CPUINFO, O_DUMP); if (!img) return -1; cpu_info.ppc64_entry = &cpu_ppc64_info_ptr; cpu_info.n_ppc64_entry = 1; cpu_ppc64_info.endian = CURRENT_ENDIANNESS; cpu_ppc64_info.n_hwcap = 2; cpu_ppc64_info.hwcap = rt_cpuinfo.hwcap; ret = pb_write_one(img, &cpu_info, PB_CPUINFO); close_image(img); return ret; } int 
cpu_validate_cpuinfo(void) { CpuinfoEntry *cpu_info; CpuinfoPpc64Entry *cpu_ppc64_entry; struct cr_img *img; int ret = -1; img = open_image(CR_FD_CPUINFO, O_RSTR); if (!img) return -1; if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; if (cpu_info->n_ppc64_entry != 1) { pr_err("No PPC64 related entry in image\n"); goto error; } cpu_ppc64_entry = cpu_info->ppc64_entry[0]; if (cpu_ppc64_entry->endian != CURRENT_ENDIANNESS) { pr_err("Bad endianness\n"); goto error; } if (cpu_ppc64_entry->n_hwcap != 2) { pr_err("Hardware capabilities information missing\n"); goto error; } #define CHECK_FEATURE(s, f) \ do { \ if ((cpu_ppc64_entry->hwcap[s] & f) && !(rt_cpuinfo.hwcap[s] & f)) { \ pr_err("CPU Feature %s required by image " \ "is not supported on host.\n", \ #f); \ goto error; \ } \ } while (0) #define REQUIRE_FEATURE(s, f) \ do { \ if (!(cpu_ppc64_entry->hwcap[s] & f)) { \ pr_err("CPU Feature %s missing in image.\n", #f); \ goto error; \ } \ } while (0) REQUIRE_FEATURE(0, PPC_FEATURE_64); REQUIRE_FEATURE(0, PPC_FEATURE_HAS_FPU); REQUIRE_FEATURE(0, PPC_FEATURE_HAS_MMU); REQUIRE_FEATURE(0, PPC_FEATURE_HAS_VSX); REQUIRE_FEATURE(1, PPC_FEATURE2_ARCH_2_07); CHECK_FEATURE(0, PPC_FEATURE_TRUE_LE); CHECK_FEATURE(1, PPC_FEATURE2_HTM); CHECK_FEATURE(1, PPC_FEATURE2_DSCR); CHECK_FEATURE(1, PPC_FEATURE2_EBB); CHECK_FEATURE(1, PPC_FEATURE2_ISEL); CHECK_FEATURE(1, PPC_FEATURE2_TAR); CHECK_FEATURE(1, PPC_FEATURE2_VEC_CRYPTO); ret = 0; error: close_image(img); return ret; } int cpuinfo_dump(void) { if (cpu_init()) return -1; if (cpu_dump_cpuinfo()) return -1; return 0; } int cpuinfo_check(void) { if (cpu_init()) return -1; if (cpu_validate_cpuinfo()) return 1; return 0; } crac-criu-1.5.0/criu/arch/ppc64/crtools.c000066400000000000000000000302701471504326700200440ustar00rootroot00000000000000#include #include #include #include #include #include #include "types.h" #include #include "asm/restorer.h" #include "asm/dump.h" #include "cr_options.h" #include "common/compiler.h" 
#include #include "parasite-syscall.h" #include "log.h" #include "util.h" #include "cpu.h" #include "compel/infect.h" #include "protobuf.h" #include "images/core.pb-c.h" #include "images/creds.pb-c.h" static UserPpc64FpstateEntry *copy_fp_regs(uint64_t *fpregs) { UserPpc64FpstateEntry *fpe; int i; fpe = xmalloc(sizeof(UserPpc64FpstateEntry)); if (!fpe) return NULL; user_ppc64_fpstate_entry__init(fpe); fpe->n_fpregs = NFPREG; fpe->fpregs = xmalloc(fpe->n_fpregs * sizeof(fpe->fpregs[0])); if (!fpe->fpregs) { xfree(fpe); return NULL; } /* FPSRC is the last (33th) register in the set */ for (i = 0; i < NFPREG; i++) fpe->fpregs[i] = fpregs[i]; return fpe; } static void put_fpu_regs(mcontext_t *mc, UserPpc64FpstateEntry *fpe) { uint64_t *mcfp = (uint64_t *)mc->fp_regs; size_t i; for (i = 0; i < fpe->n_fpregs; i++) mcfp[i] = fpe->fpregs[i]; } static UserPpc64VrstateEntry *copy_altivec_regs(__vector128 *vrregs) { UserPpc64VrstateEntry *vse; uint64_t *p64; uint32_t *p32; int i; vse = xmalloc(sizeof(*vse)); if (!vse) return NULL; user_ppc64_vrstate_entry__init(vse); /* protocol buffer store only 64bit entries and we need 128bit */ vse->n_vrregs = (NVRREG - 1) * 2; vse->vrregs = xmalloc(vse->n_vrregs * sizeof(vse->vrregs[0])); if (!vse->vrregs) { xfree(vse); return NULL; } /* Vectors are 2*64bits entries */ for (i = 0; i < (NVRREG - 1); i++) { p64 = (uint64_t *)&vrregs[i]; vse->vrregs[i * 2] = p64[0]; vse->vrregs[i * 2 + 1] = p64[1]; } p32 = (uint32_t *)&vrregs[NVRREG - 1]; vse->vrsave = *p32; return vse; } static int put_altivec_regs(mcontext_t *mc, UserPpc64VrstateEntry *vse) { vrregset_t *v_regs = (vrregset_t *)(((unsigned long)mc->vmx_reserve + 15) & ~0xful); pr_debug("Restoring Altivec registers\n"); if (vse->n_vrregs != (NVRREG - 1) * 2) { pr_err("Corrupted Altivec dump data\n"); return -1; } /* Note that this should only be done in the case MSR_VEC is set but * this is not a big deal to do that in all cases. 
*/ memcpy(&v_regs->vrregs[0][0], vse->vrregs, sizeof(uint64_t) * 2 * (NVRREG - 1)); /* vscr has been restored with the previous memcpy which copied 32 * 128bits registers + a 128bits field containing the vscr value in * the low part. */ v_regs->vrsave = vse->vrsave; mc->v_regs = v_regs; return 0; } static UserPpc64VsxstateEntry *copy_vsx_regs(uint64_t *vsregs) { UserPpc64VsxstateEntry *vse; int i; vse = xmalloc(sizeof(*vse)); if (!vse) return NULL; user_ppc64_vsxstate_entry__init(vse); vse->n_vsxregs = NVSXREG; vse->vsxregs = xmalloc(vse->n_vsxregs * sizeof(vse->vsxregs[0])); if (!vse->vsxregs) { xfree(vse); return NULL; } for (i = 0; i < vse->n_vsxregs; i++) vse->vsxregs[i] = vsregs[i]; return vse; } static int put_vsx_regs(mcontext_t *mc, UserPpc64VsxstateEntry *vse) { uint64_t *buf; int i; pr_debug("Restoring VSX registers\n"); if (!mc->v_regs) { /* VSX implies Altivec so v_regs should be set */ pr_err("Internal error\n"); return -1; } /* point after the Altivec registers */ buf = (uint64_t *)(mc->v_regs + 1); /* Copy the value saved by get_vsx_regs in the sigframe */ for (i = 0; i < vse->n_vsxregs; i++) buf[i] = vse->vsxregs[i]; return 0; } static void copy_gp_regs(UserPpc64RegsEntry *dst, user_regs_struct_t *src) { int i; #define assign_reg(e) \ do { \ dst->e = (__typeof__(dst->e))src->e; \ } while (0) for (i = 0; i < 32; i++) assign_reg(gpr[i]); assign_reg(nip); assign_reg(msr); assign_reg(orig_gpr3); assign_reg(ctr); assign_reg(link); assign_reg(xer); assign_reg(ccr); assign_reg(trap); #undef assign_reg } static void restore_gp_regs(mcontext_t *dst, UserPpc64RegsEntry *src) { int i; /* r0 to r31 */ for (i = 0; i < 32; i++) dst->gp_regs[i] = src->gpr[i]; dst->gp_regs[PT_NIP] = src->nip; dst->gp_regs[PT_MSR] = src->msr; dst->gp_regs[PT_ORIG_R3] = src->orig_gpr3; dst->gp_regs[PT_CTR] = src->ctr; dst->gp_regs[PT_LNK] = src->link; dst->gp_regs[PT_XER] = src->xer; dst->gp_regs[PT_CCR] = src->ccr; dst->gp_regs[PT_TRAP] = src->trap; } static UserPpc64RegsEntry 
*allocate_gp_regs(void) { UserPpc64RegsEntry *gpregs; gpregs = xmalloc(sizeof(*gpregs)); if (!gpregs) return NULL; user_ppc64_regs_entry__init(gpregs); gpregs->n_gpr = 32; gpregs->gpr = xmalloc(32 * sizeof(uint64_t)); if (!gpregs->gpr) { xfree(gpregs); return NULL; } return gpregs; } /**************************************************************************** * TRANSACTIONAL MEMORY SUPPORT */ static void xfree_tm_state(UserPpc64TmRegsEntry *tme) { if (tme) { if (tme->fpstate) { xfree(tme->fpstate->fpregs); xfree(tme->fpstate); } if (tme->vrstate) { xfree(tme->vrstate->vrregs); xfree(tme->vrstate); } if (tme->vsxstate) { xfree(tme->vsxstate->vsxregs); xfree(tme->vsxstate); } if (tme->gpregs) { if (tme->gpregs->gpr) xfree(tme->gpregs->gpr); xfree(tme->gpregs); } xfree(tme); } } static int put_tm_regs(struct rt_sigframe *f, UserPpc64TmRegsEntry *tme) { /* * WARNING: As stated in kernel's restore_tm_sigcontexts, TEXASR has to be * restored by the process itself : * TEXASR was set by the signal delivery reclaim, as was TFIAR. * Users doing anything abhorrent like thread-switching w/ signals for * TM-Suspended code will have to back TEXASR/TFIAR up themselves. * For the case of getting a signal and simply returning from it, * we don't need to re-copy them here. 
*/ ucontext_t *tm_uc = &f->uc_transact; pr_debug("Restoring TM registers FP:%d VR:%d VSX:%d\n", !!(tme->fpstate), !!(tme->vrstate), !!(tme->vsxstate)); restore_gp_regs(&tm_uc->uc_mcontext, tme->gpregs); if (tme->fpstate) put_fpu_regs(&tm_uc->uc_mcontext, tme->fpstate); if (tme->vrstate && put_altivec_regs(&tm_uc->uc_mcontext, tme->vrstate)) return -1; if (tme->vsxstate && put_vsx_regs(&tm_uc->uc_mcontext, tme->vsxstate)) return -1; f->uc.uc_link = tm_uc; return 0; } /****************************************************************************/ static int copy_tm_regs(user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, CoreEntry *core) { UserPpc64TmRegsEntry *tme; UserPpc64RegsEntry *gpregs = core->ti_ppc64->gpregs; pr_debug("Copying TM registers\n"); tme = xmalloc(sizeof(*tme)); if (!tme) return -1; user_ppc64_tm_regs_entry__init(tme); tme->gpregs = allocate_gp_regs(); if (!tme->gpregs) goto out_free; gpregs->has_tfhar = true; gpregs->tfhar = fpregs->tm.tm_spr_regs.tfhar; gpregs->has_texasr = true; gpregs->texasr = fpregs->tm.tm_spr_regs.texasr; gpregs->has_tfiar = true; gpregs->tfiar = fpregs->tm.tm_spr_regs.tfiar; /* This is the checkpointed state, we must save it in place of the * current state because the signal handler is made in this way. * We invert the 2 states instead of when building the signal frame, * because we can't modify the gpregs manipulated by the common layer. */ copy_gp_regs(gpregs, &fpregs->tm.regs); if (fpregs->tm.flags & USER_FPREGS_FL_FP) { core->ti_ppc64->fpstate = copy_fp_regs(fpregs->tm.fpregs); if (!core->ti_ppc64->fpstate) goto out_free; } if (fpregs->tm.flags & USER_FPREGS_FL_ALTIVEC) { core->ti_ppc64->vrstate = copy_altivec_regs(fpregs->tm.vrregs); if (!core->ti_ppc64->vrstate) goto out_free; /* * Force the MSR_VEC bit of the restored MSR otherwise the * kernel will not restore them from the signal frame. 
*/ gpregs->msr |= MSR_VEC; if (fpregs->tm.flags & USER_FPREGS_FL_VSX) { core->ti_ppc64->vsxstate = copy_vsx_regs(fpregs->tm.vsxregs); if (!core->ti_ppc64->vsxstate) goto out_free; /* * Force the MSR_VSX bit of the restored MSR otherwise * the kernel will not restore them from the signal * frame. */ gpregs->msr |= MSR_VSX; } } core->ti_ppc64->tmstate = tme; return 0; out_free: xfree_tm_state(tme); return -1; } static int __copy_task_regs(user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, CoreEntry *core) { UserPpc64RegsEntry *gpregs; UserPpc64FpstateEntry **fpstate; UserPpc64VrstateEntry **vrstate; UserPpc64VsxstateEntry **vsxstate; /* Copy retrieved registers in the proto data * If TM is in the loop we switch the saved register set because * the signal frame is built with checkpointed registers on top to not * confused TM unaware process, while ptrace is retrieving the * checkpointed set through the TM specific ELF notes. */ if (fpregs->flags & USER_FPREGS_FL_TM) { if (copy_tm_regs(regs, fpregs, core)) return -1; gpregs = core->ti_ppc64->tmstate->gpregs; fpstate = &(core->ti_ppc64->tmstate->fpstate); vrstate = &(core->ti_ppc64->tmstate->vrstate); vsxstate = &(core->ti_ppc64->tmstate->vsxstate); } else { gpregs = core->ti_ppc64->gpregs; fpstate = &(core->ti_ppc64->fpstate); vrstate = &(core->ti_ppc64->vrstate); vsxstate = &(core->ti_ppc64->vsxstate); } copy_gp_regs(gpregs, regs); if (fpregs->flags & USER_FPREGS_FL_FP) { *fpstate = copy_fp_regs(fpregs->fpregs); if (!*fpstate) return -1; } if (fpregs->flags & USER_FPREGS_FL_ALTIVEC) { *vrstate = copy_altivec_regs(fpregs->vrregs); if (!*vrstate) return -1; /* * Force the MSR_VEC bit of the restored MSR otherwise the * kernel will not restore them from the signal frame. 
*/ gpregs->msr |= MSR_VEC; if (fpregs->flags & USER_FPREGS_FL_VSX) { *vsxstate = copy_vsx_regs(fpregs->vsxregs); if (!*vsxstate) return -1; /* * Force the MSR_VSX bit of the restored MSR otherwise * the kernel will not restore them from the signal * frame. */ gpregs->msr |= MSR_VSX; } } return 0; } int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) { return __copy_task_regs(u, f, (CoreEntry *)arg); } /****************************************************************************/ int arch_alloc_thread_info(CoreEntry *core) { ThreadInfoPpc64 *ti_ppc64; ti_ppc64 = xmalloc(sizeof(*ti_ppc64)); if (!ti_ppc64) return -1; thread_info_ppc64__init(ti_ppc64); ti_ppc64->gpregs = allocate_gp_regs(); if (!ti_ppc64->gpregs) { xfree(ti_ppc64); return -1; } CORE_THREAD_ARCH_INFO(core) = ti_ppc64; return 0; } void arch_free_thread_info(CoreEntry *core) { if (CORE_THREAD_ARCH_INFO(core)) { if (CORE_THREAD_ARCH_INFO(core)->fpstate) { xfree(CORE_THREAD_ARCH_INFO(core)->fpstate->fpregs); xfree(CORE_THREAD_ARCH_INFO(core)->fpstate); } if (CORE_THREAD_ARCH_INFO(core)->vrstate) { xfree(CORE_THREAD_ARCH_INFO(core)->vrstate->vrregs); xfree(CORE_THREAD_ARCH_INFO(core)->vrstate); } if (CORE_THREAD_ARCH_INFO(core)->vsxstate) { xfree(CORE_THREAD_ARCH_INFO(core)->vsxstate->vsxregs); xfree(CORE_THREAD_ARCH_INFO(core)->vsxstate); } xfree_tm_state(CORE_THREAD_ARCH_INFO(core)->tmstate); xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->gpr); xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); xfree(CORE_THREAD_ARCH_INFO(core)); CORE_THREAD_ARCH_INFO(core) = NULL; } } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { int ret = 0; if (CORE_THREAD_ARCH_INFO(core)->fpstate) put_fpu_regs(&sigframe->uc.uc_mcontext, CORE_THREAD_ARCH_INFO(core)->fpstate); if (CORE_THREAD_ARCH_INFO(core)->vrstate) ret = put_altivec_regs(&sigframe->uc.uc_mcontext, CORE_THREAD_ARCH_INFO(core)->vrstate); else if (core->ti_ppc64->gpregs->msr & MSR_VEC) { pr_err("Register's data mismatch, corrupted image 
?\n"); ret = -1; } if (!ret && CORE_THREAD_ARCH_INFO(core)->vsxstate) ret = put_vsx_regs(&sigframe->uc.uc_mcontext, CORE_THREAD_ARCH_INFO(core)->vsxstate); else if (core->ti_ppc64->gpregs->msr & MSR_VSX) { pr_err("VSX register's data mismatch, corrupted image ?\n"); ret = -1; } if (!ret && CORE_THREAD_ARCH_INFO(core)->tmstate) ret = put_tm_regs(sigframe, CORE_THREAD_ARCH_INFO(core)->tmstate); else if (MSR_TM_ACTIVE(core->ti_ppc64->gpregs->msr)) { pr_err("TM register's data mismatch, corrupted image ?\n"); ret = -1; } return ret; } int restore_gpregs(struct rt_sigframe *f, UserPpc64RegsEntry *r) { restore_gp_regs(&f->uc.uc_mcontext, r); return 0; } crac-criu-1.5.0/criu/arch/ppc64/include/000077500000000000000000000000001471504326700176345ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/ppc64/include/asm/000077500000000000000000000000001471504326700204145ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/ppc64/include/asm/dump.h000066400000000000000000000005201471504326700215270ustar00rootroot00000000000000#ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); #define core_put_tls(core, tls) #define get_task_futex_robust_list_compat(pid, info) -1 #endif crac-criu-1.5.0/criu/arch/ppc64/include/asm/int.h000066400000000000000000000001571471504326700213620ustar00rootroot00000000000000#ifndef __CR_ASM_INT_H__ #define __CR_ASM_INT_H__ #include "asm-generic/int.h" #endif /* __CR_ASM_INT_H__ */ crac-criu-1.5.0/criu/arch/ppc64/include/asm/kerndat.h000066400000000000000000000002341471504326700222140ustar00rootroot00000000000000#ifndef __CR_ASM_KERNDAT_H__ #define __CR_ASM_KERNDAT_H__ #define kdat_compatible_cr() 0 #define kdat_can_map_vdso() 0 #endif /* __CR_ASM_KERNDAT_H__ */ 
crac-criu-1.5.0/criu/arch/ppc64/include/asm/parasite-syscall.h000066400000000000000000000001521471504326700240430ustar00rootroot00000000000000#ifndef __CR_ASM_PARASITE_SYSCALL_H__ #define __CR_ASM_PARASITE_SYSCALL_H__ struct parasite_ctl; #endif crac-criu-1.5.0/criu/arch/ppc64/include/asm/parasite.h000066400000000000000000000002731471504326700223770ustar00rootroot00000000000000#ifndef __ASM_PARASITE_H__ #define __ASM_PARASITE_H__ /* TLS is accessed through r13, which is already processed */ static inline void arch_get_tls(tls_t *ptls) { (void)ptls; } #endif crac-criu-1.5.0/criu/arch/ppc64/include/asm/restore.h000066400000000000000000000015151471504326700222520ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORE_H__ #define __CR_ASM_RESTORE_H__ #include "asm/restorer.h" #include "images/core.pb-c.h" /* * Set R2 to blob + 8000 which is the default value * Jump to restore_task_exec_start + 8 since R2 is already set (local call) */ /* clang-format off */ #define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ task_args) \ asm volatile( \ "mr 1,%0 \n" \ "mr 12,%1 \n" \ "mtctr 12 \n" \ "mr 3,%2 \n" \ "bctr \n" \ : \ : "r"(new_sp), \ "r"((unsigned long)restore_task_exec_start), \ "r"(task_args) \ : "3", "12") /* clang-format on */ /* There is nothing to do since TLS is accessed through r13 */ #define core_get_tls(pcore, ptls) int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); #endif /* __CR_ASM_RESTORE_H__ */ crac-criu-1.5.0/criu/arch/ppc64/include/asm/restorer.h000066400000000000000000000071171471504326700224400ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ #include #include #include #include "asm/types.h" #include #include /* * Clone trampoline * * See glibc sysdeps/powerpc/powerpc64/sysdep.h for FRAME_MIN_SIZE defines */ /* clang-format off */ #define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ thread_args, clone_restore_fn) \ asm volatile( \ "clone_emul: \n" \ "/* Save fn, args, stack 
across syscall. */ \n" \ "mr 14, %5 /* clone_restore_fn in r14 */ \n" \ "mr 15, %6 /* &thread_args[i] in r15 */ \n" \ "mr 3, %1 /* clone_flags */ \n" \ "ld 4, %2 /* new_sp */ \n" \ "mr 5, %3 /* &parent_tid */ \n" \ "li 6, 0 /* tls = 0 ? */ \n" \ "mr 7, %4 /* &thread_args[i].pid */ \n" \ "li 0,"__stringify(__NR_clone)" \n" \ "sc \n" \ "/* Check for child process. */ \n" \ "cmpdi cr1,3,0 \n" \ "crandc cr1*4+eq,cr1*4+eq,cr0*4+so \n" \ "bne- cr1,clone_end \n" \ "/* child */ \n" \ "addi 14, 14, 8 /* jump over r2 fixup */ \n" \ "mtctr 14 \n" \ "mr 3,15 \n" \ "bctr \n" \ "clone_end: \n" \ "mr %0,3 \n" \ : "=r"(ret) /* %0 */ \ : "r"(clone_flags), /* %1 */ \ "m"(new_sp), /* %2 */ \ "r"(&parent_tid), /* %3 */ \ "r"(&thread_args[i].pid), /* %4 */ \ "r"(clone_restore_fn), /* %5 */ \ "r"(&thread_args[i]) /* %6 */ \ : "memory","0","3","4","5","6","7","14","15") #define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ clone_restore_fn) \ /* * The clone3() function accepts following parameters: * int clone3(struct clone_args *args, size_t size) * * Always consult the CLONE3 wrappers for other architectures * for additional details. * * For PPC64LE the first parameter (clone_args) is passed in r3 and * the second parameter (size) is passed in r4. * * This clone3() wrapper is based on the clone() wrapper from above. */ \ asm volatile( \ "clone3_emul: \n" \ "/* Save fn, args across syscall. */ \n" \ "mr 14, %3 /* clone_restore_fn in r14 */ \n" \ "mr 15, %4 /* &thread_args[i] in r15 */ \n" \ "mr 3, %1 /* clone_args */ \n" \ "mr 4, %2 /* size */ \n" \ "li 0,"__stringify(__NR_clone3)" \n" \ "sc \n" \ "/* Check for child process. 
*/ \n" \ "cmpdi cr1,3,0 \n" \ "crandc cr1*4+eq,cr1*4+eq,cr0*4+so \n" \ "bne- cr1,clone3_end \n" \ "/* child */ \n" \ "addi 14, 14, 8 /* jump over r2 fixup */ \n" \ "mtctr 14 \n" \ "mr 3,15 \n" \ "bctr \n" \ "clone3_end: \n" \ "mr %0,3 \n" \ : "=r"(ret) /* %0 */ \ : "r"(&clone_args), /* %1 */ \ "r"(size), /* %2 */ \ "r"(clone_restore_fn), /* %3 */ \ "r"(args) /* %4 */ \ : "memory","0","3","4","5","14","15") /* clang-format on */ #define arch_map_vdso(map, compat) -1 int restore_gpregs(struct rt_sigframe *f, UserPpc64RegsEntry *r); int restore_nonsigframe_gpregs(UserPpc64RegsEntry *r); /* Nothing to do, TLS is accessed through r13 */ static inline void restore_tls(tls_t *ptls) { (void)ptls; } /* * Defined in arch/ppc64/syscall-common-ppc64.S */ unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg); static inline void *alloc_compat_syscall_stack(void) { return NULL; } static inline void free_compat_syscall_stack(void *stack32) { } static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; } static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) { return -1; } #endif /*__CR_ASM_RESTORER_H__*/ crac-criu-1.5.0/criu/arch/ppc64/include/asm/thread_pointer.h000066400000000000000000000021571471504326700236010ustar00rootroot00000000000000/* __thread_pointer definition. powerpc version. Copyright (C) 2021 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with the GNU C Library. If not, see . */ #ifndef _SYS_THREAD_POINTER_H #define _SYS_THREAD_POINTER_H #ifdef __powerpc64__ register void *__thread_register asm("r13"); #else register void *__thread_register asm("r2"); #endif static inline void *__criu_thread_pointer(void) { return __thread_register; } #endif /* _SYS_THREAD_POINTER_H */crac-criu-1.5.0/criu/arch/ppc64/include/asm/types.h000066400000000000000000000021241471504326700217300ustar00rootroot00000000000000#ifndef __CR_ASM_TYPES_H__ #define __CR_ASM_TYPES_H__ #include #include #include "images/core.pb-c.h" #include "page.h" #include "bitops.h" #include "asm/int.h" #include typedef UserPpc64RegsEntry UserRegsEntry; #define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__PPC64 #define core_is_compat(core) false #define CORE_THREAD_ARCH_INFO(core) core->ti_ppc64 #define TI_IP(core) ((core)->ti_ppc64->gpregs->nip) static inline void *decode_pointer(uint64_t v) { return (void *)v; } static inline uint64_t encode_pointer(void *p) { return (uint64_t)p; } /* * Copied from the following kernel header files : * include/linux/auxvec.h * arch/powerpc/include/uapi/asm/auxvec.h * include/linux/mm_types.h */ #define AT_VECTOR_SIZE_BASE 20 #if !defined AT_VECTOR_SIZE_ARCH #define AT_VECTOR_SIZE_ARCH 6 #endif #define AT_VECTOR_SIZE (2 * (AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) typedef uint64_t auxv_t; /* Not used but the structure parasite_dump_thread needs a tls_t field */ typedef uint64_t tls_t; #endif /* __CR_ASM_TYPES_H__ */ crac-criu-1.5.0/criu/arch/ppc64/include/asm/vdso.h000066400000000000000000000030051471504326700215360ustar00rootroot00000000000000#ifndef __CR_ASM_VDSO_H__ #define __CR_ASM_VDSO_H__ #include "asm/int.h" #include "asm-generic/vdso.h" /* This definition is used in pie/util-vdso.c to initialize the vdso symbol * name string table 'vdso_symbols' * * Poke from kernel file arch/powerpc/kernel/vdso64/vdso64.lds.S * * Note that 
'__kernel_datapage_offset' is not a service but mostly a data * inside the text page which should not be used as is from user space. */ #define VDSO_SYMBOL_MAX 10 #define VDSO_SYMBOL_GTOD 5 #define ARCH_VDSO_SYMBOLS_LIST \ const char *aarch_vdso_symbol1 = "__kernel_clock_getres"; \ const char *aarch_vdso_symbol2 = "__kernel_clock_gettime"; \ const char *aarch_vdso_symbol3 = "__kernel_get_syscall_map"; \ const char *aarch_vdso_symbol4 = "__kernel_get_tbfreq"; \ const char *aarch_vdso_symbol5 = "__kernel_getcpu"; \ const char *aarch_vdso_symbol6 = "__kernel_gettimeofday"; \ const char *aarch_vdso_symbol7 = "__kernel_sigtramp_rt64"; \ const char *aarch_vdso_symbol8 = "__kernel_sync_dicache"; \ const char *aarch_vdso_symbol9 = "__kernel_sync_dicache_p5"; \ const char *aarch_vdso_symbol10 = "__kernel_time"; #define ARCH_VDSO_SYMBOLS \ aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4, aarch_vdso_symbol5, \ aarch_vdso_symbol6, aarch_vdso_symbol7, aarch_vdso_symbol8, aarch_vdso_symbol9, aarch_vdso_symbol10 #endif /* __CR_ASM_VDSO_H__ */ crac-criu-1.5.0/criu/arch/ppc64/misc.S000066400000000000000000000104031471504326700172660ustar00rootroot00000000000000/* * This is from linux/arch/powerpc/lib/crtsavres.S: * * Special support for eabi and SVR4 * * Copyright (C) 1995, 1996, 1998, 2000, 2001 Free Software Foundation, Inc. * Copyright 2008 Freescale Semiconductor, Inc. * Written By Michael Meissner * * Based on gcc/config/rs6000/crtsavres.asm from gcc * 64 bit additions from reading the PPC elf64abi document. * * This file is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2, or (at your option) any * later version. 
* * In addition to the permissions in the GNU General Public License, the * Free Software Foundation gives you unlimited permission to link the * compiled version of this file with other programs, and to distribute * those programs without any restriction coming from the use of this * file. (The General Public License restrictions do apply in other * respects; for example, they cover modification of the file, and * distribution when not linked into another program.) * * This file is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; see the file COPYING. If not, write to * the Free Software Foundation, 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. * * As a special exception, if you link this library with files * compiled with GCC to produce an executable, this does not cause * the resulting executable to be covered by the GNU General Public License. * This exception does not however invalidate any other reasons why * the executable file might be covered by the GNU General Public License. 
*/ #define r0 0 #define r1 1 #define r2 2 #define r3 3 #define r4 4 #define r5 5 #define r6 6 #define r7 7 #define r8 8 #define r9 9 #define r10 10 #define r11 11 #define r12 12 #define r13 13 #define r14 14 #define r15 15 #define r16 16 #define r17 17 #define r18 18 #define r19 19 #define r20 20 #define r21 21 #define r22 22 #define r23 23 #define r24 24 #define r25 25 #define r26 26 #define r27 27 #define r28 28 #define r29 29 #define r30 30 #define r31 31 .text .globl _savegpr0_14 _savegpr0_14: std r14,-144(r1) .globl _savegpr0_15 _savegpr0_15: std r15,-136(r1) .globl _savegpr0_16 _savegpr0_16: std r16,-128(r1) .globl _savegpr0_17 _savegpr0_17: std r17,-120(r1) .globl _savegpr0_18 _savegpr0_18: std r18,-112(r1) .globl _savegpr0_19 _savegpr0_19: std r19,-104(r1) .globl _savegpr0_20 _savegpr0_20: std r20,-96(r1) .globl _savegpr0_21 _savegpr0_21: std r21,-88(r1) .globl _savegpr0_22 _savegpr0_22: std r22,-80(r1) .globl _savegpr0_23 _savegpr0_23: std r23,-72(r1) .globl _savegpr0_24 _savegpr0_24: std r24,-64(r1) .globl _savegpr0_25 _savegpr0_25: std r25,-56(r1) .globl _savegpr0_26 _savegpr0_26: std r26,-48(r1) .globl _savegpr0_27 _savegpr0_27: std r27,-40(r1) .globl _savegpr0_28 _savegpr0_28: std r28,-32(r1) .globl _savegpr0_29 _savegpr0_29: std r29,-24(r1) .globl _savegpr0_30 _savegpr0_30: std r30,-16(r1) .globl _savegpr0_31 _savegpr0_31: std r31,-8(r1) std r0,16(r1) blr .globl _restgpr0_14 _restgpr0_14: ld r14,-144(r1) .globl _restgpr0_15 _restgpr0_15: ld r15,-136(r1) .globl _restgpr0_16 _restgpr0_16: ld r16,-128(r1) .globl _restgpr0_17 _restgpr0_17: ld r17,-120(r1) .globl _restgpr0_18 _restgpr0_18: ld r18,-112(r1) .globl _restgpr0_19 _restgpr0_19: ld r19,-104(r1) .globl _restgpr0_20 _restgpr0_20: ld r20,-96(r1) .globl _restgpr0_21 _restgpr0_21: ld r21,-88(r1) .globl _restgpr0_22 _restgpr0_22: ld r22,-80(r1) .globl _restgpr0_23 _restgpr0_23: ld r23,-72(r1) .globl _restgpr0_24 _restgpr0_24: ld r24,-64(r1) .globl _restgpr0_25 _restgpr0_25: ld r25,-56(r1) .globl 
_restgpr0_26 _restgpr0_26: ld r26,-48(r1) .globl _restgpr0_27 _restgpr0_27: ld r27,-40(r1) .globl _restgpr0_28 _restgpr0_28: ld r28,-32(r1) .globl _restgpr0_29 _restgpr0_29: ld r0,16(r1) ld r29,-24(r1) mtlr r0 ld r30,-16(r1) ld r31,-8(r1) blr .globl _restgpr0_30 _restgpr0_30: ld r30,-16(r1) .globl _restgpr0_31 _restgpr0_31: ld r0,16(r1) ld r31,-8(r1) mtlr r0 blr crac-criu-1.5.0/criu/arch/ppc64/restorer.c000066400000000000000000000022011471504326700202150ustar00rootroot00000000000000#include #include "restorer.h" #include "asm/restorer.h" #include #include #include "log.h" int restore_nonsigframe_gpregs(UserPpc64RegsEntry *r) { #define SPRN_TFHAR 128 #define SPRN_TFIAR 129 #define SPRN_TEXASR 130 if (r->has_tfhar) { asm __volatile__("ld 3, %[value] ;" "mtspr %[sprn],3 ;" : [value] "=m"(r->tfhar) : [sprn] "i"(SPRN_TFHAR) : "r3"); } if (r->has_tfiar) { asm __volatile__("ld 3, %[value] ;" "mtspr %[sprn],3 ;" : [value] "=m"(r->tfiar) : [sprn] "i"(SPRN_TFIAR) : "r3"); } if (r->has_texasr) { asm __volatile__("ld 3, %[value] ;" "mtspr %[sprn],3 ;" : [value] "=m"(r->texasr) : [sprn] "i"(SPRN_TEXASR) : "r3"); } return 0; } unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg) { unsigned long raddr; int ret; ret = sys_ipc(21 /*SHMAT */, shmid, /* first */ shmflg, /* second */ (unsigned long)&raddr, /* third */ shmaddr, /* ptr */ 0 /* fifth not used */); if (ret) raddr = (unsigned long)ret; return raddr; } crac-criu-1.5.0/criu/arch/ppc64/sigframe.c000066400000000000000000000024461471504326700201600ustar00rootroot00000000000000#include #include #include "asm/sigframe.h" #include "asm/types.h" #include "log.h" #include "common/bug.h" /* * The signal frame has been built using local addresses. Since it has to be * used in the context of the checkpointed process, the v_regs pointer in the * signal frame must be updated to match the address in the remote stack. 
*/ static inline void update_vregs(mcontext_t *lcontext, mcontext_t *rcontext) { if (lcontext->v_regs) { uint64_t offset = (uint64_t)(lcontext->v_regs) - (uint64_t)lcontext; lcontext->v_regs = (vrregset_t *)((uint64_t)rcontext + offset); pr_debug("Updated v_regs:%llx (rcontext:%llx)\n", (unsigned long long)lcontext->v_regs, (unsigned long long)rcontext); } } int sigreturn_prep_fpu_frame(struct rt_sigframe *frame, struct rt_sigframe *rframe) { uint64_t msr = frame->uc.uc_mcontext.gp_regs[PT_MSR]; update_vregs(&frame->uc.uc_mcontext, &rframe->uc.uc_mcontext); /* Sanity check: If TM so uc_link should be set, otherwise not */ if (MSR_TM_ACTIVE(msr) ^ (!!(frame->uc.uc_link))) { BUG(); return 1; } /* Updating the transactional state address if any */ if (frame->uc.uc_link) { update_vregs(&frame->uc_transact.uc_mcontext, &rframe->uc_transact.uc_mcontext); frame->uc.uc_link = &rframe->uc_transact; } return 0; } crac-criu-1.5.0/criu/arch/ppc64/vdso-pie.c000066400000000000000000000077671471504326700201240ustar00rootroot00000000000000#include #include "asm/types.h" #include #include #include "parasite-vdso.h" #include "log.h" #include "common/bug.h" #ifdef LOG_PREFIX #undef LOG_PREFIX #endif #define LOG_PREFIX "vdso: " /* This symbols are defined in vdso-trampoline.S */ extern char *vdso_trampoline, *vdso_trampoline_end; static inline void invalidate_caches(unsigned long at) { asm volatile("isync \n" "li 3,0 \n" "dcbf 3,%0 \n" "sync \n" "icbi 3,%0 \n" "isync \n" : /* no output */ : "r"(at) : "memory", "r3"); } /* This is the size of the trampoline call : * mlfr r0 * bl trampoline * <64 bit address> */ #define TRAMP_CALL_SIZE (2 * sizeof(uint32_t) + sizeof(uint64_t)) /* * put_trampoline does 2 things : * * 1. it looks for a place in the checkpointed vDSO where to put the * trampoline code (see vdso-trampoline.S). * * 2. 
for each symbol from the checkpointed vDSO, it checks that there are * enough place to put the call to the vDSO trampoline (see * TRAMP_CALL_SIZE's comment above). * This done by checking that there is no interesting symbols in the range * of current one's offset -> (current one's offset + TRAMP_CALL_SIZE). * Unfortunately the symbols are not sorted by address so we have to look * for the complete table all the time. Since the vDSO is small, this is * not a big issue. */ static unsigned long put_trampoline(unsigned long at, struct vdso_symtable *sym) { int i, j; unsigned long size; unsigned long trampoline = 0; /* First of all we have to find a place where to put the trampoline * code. */ size = (unsigned long)&vdso_trampoline_end - (unsigned long)&vdso_trampoline; for (i = 0; i < ARRAY_SIZE(sym->symbols); i++) { if (vdso_symbol_empty(&sym->symbols[i])) continue; pr_debug("Checking '%s' at %lx\n", sym->symbols[i].name, sym->symbols[i].offset); /* find the nearest following symbol we are interested in */ for (j = 0; j < ARRAY_SIZE(sym->symbols); j++) { if (i == j || vdso_symbol_empty(&sym->symbols[j])) continue; if (sym->symbols[j].offset <= sym->symbols[i].offset) /* this symbol is above the current one */ continue; if ((sym->symbols[i].offset + TRAMP_CALL_SIZE) > sym->symbols[j].offset) { /* we have a major issue here since we cannot * even put the trampoline call for this symbol */ pr_err("Can't handle small vDSO symbol %s\n", sym->symbols[i].name); return 0; } if (trampoline) /* no need to put it twice */ continue; if ((sym->symbols[j].offset - (sym->symbols[i].offset + TRAMP_CALL_SIZE)) <= size) /* not enough place */ continue; /* We can put the trampoline there */ trampoline = at + sym->symbols[i].offset; trampoline += TRAMP_CALL_SIZE; pr_debug("Putting vDSO trampoline in %s at %lx\n", sym->symbols[i].name, trampoline); memcpy((void *)trampoline, &vdso_trampoline, size); invalidate_caches(trampoline); } } return trampoline; } static inline void 
put_trampoline_call(unsigned long at, unsigned long to, unsigned long tr) { uint32_t *addr = (uint32_t *)at; *addr++ = 0x7C0802a6; /* mflr r0 */ *addr++ = 0x48000001 | ((long)(tr - at - 4) & 0x3fffffc); /* bl tr */ *(uint64_t *)addr = to; /* the address to read by the trampoline */ invalidate_caches(at); } int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *to, struct vdso_symtable *from, bool __always_unused compat_vdso) { unsigned int i; unsigned long trampoline; trampoline = (unsigned long)put_trampoline(base_from, from); if (!trampoline) return 1; for (i = 0; i < ARRAY_SIZE(to->symbols); i++) { if (vdso_symbol_empty(&from->symbols[i])) continue; pr_debug("br: %lx/%lx -> %lx/%lx (index %d) '%s'\n", base_from, from->symbols[i].offset, base_to, to->symbols[i].offset, i, from->symbols[i].name); put_trampoline_call(base_from + from->symbols[i].offset, base_to + to->symbols[i].offset, trampoline); } return 0; } crac-criu-1.5.0/criu/arch/ppc64/vdso-trampoline.S000066400000000000000000000004041471504326700214560ustar00rootroot00000000000000#include "common/asm/linkage.h" .section .text GLOBAL(vdso_trampoline) mflr r12 /* r12 vdso_ptr's address */ mtlr r0 /* restore lr */ ld r12,0(r12) /* read value store in vdso_ptr */ mtctr r12 /* branch to it */ bctr GLOBAL(vdso_trampoline_end) crac-criu-1.5.0/criu/arch/s390/000077500000000000000000000000001471504326700157535ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/s390/Makefile000066400000000000000000000001631471504326700174130ustar00rootroot00000000000000builtin-name := crtools.built-in.o ldflags-y += -r obj-y += cpu.o obj-y += crtools.o obj-y += sigframe.o crac-criu-1.5.0/criu/arch/s390/cpu.c000066400000000000000000000061161471504326700167120ustar00rootroot00000000000000#undef LOG_PREFIX #define LOG_PREFIX "cpu: " #include #include #include "asm/types.h" #include "cr_options.h" #include "image.h" #include "util.h" #include "log.h" #include "cpu.h" #include "protobuf.h" 
#include "images/cpuinfo.pb-c.h" static compel_cpuinfo_t rt_cpuinfo; static const char *hwcap_str1[64] = { "HWCAP_S390_ESAN3", "HWCAP_S390_ZARCH", "HWCAP_S390_STFLE", "HWCAP_S390_MSA", "HWCAP_S390_LDISP", "HWCAP_S390_EIMM", "HWCAP_S390_DFP", "HWCAP_S390_HPAGE", "HWCAP_S390_ETF3EH", "HWCAP_S390_HIGH_GPRS", "HWCAP_S390_TE", "HWCAP_S390_VXRS", "HWCAP_S390_VXRS_BCD", "HWCAP_S390_VXRS_EXT", }; static const char *hwcap_str2[64] = {}; static const char **hwcap_str[2] = { hwcap_str1, hwcap_str2 }; static void print_hwcaps(const char *msg, unsigned long hwcap[2]) { int nr, cap; pr_debug("%s: Capabilities: %016lx %016lx\n", msg, hwcap[0], hwcap[1]); for (nr = 0; nr < 2; nr++) { for (cap = 0; cap < 64; cap++) { if (!(hwcap[nr] & (1 << cap))) continue; if (hwcap_str[nr][cap]) pr_debug("%s\n", hwcap_str[nr][cap]); else pr_debug("Capability %d/0x%x\n", nr, 1 << cap); } } } int cpu_init(void) { int ret; ret = compel_cpuid(&rt_cpuinfo); print_hwcaps("Host (init)", rt_cpuinfo.hwcap); return ret; } int cpu_dump_cpuinfo(void) { CpuinfoS390Entry cpu_s390_info = CPUINFO_S390_ENTRY__INIT; CpuinfoS390Entry *cpu_s390_info_ptr = &cpu_s390_info; CpuinfoEntry cpu_info = CPUINFO_ENTRY__INIT; struct cr_img *img; int ret = -1; img = open_image(CR_FD_CPUINFO, O_DUMP); if (!img) return -1; cpu_info.s390_entry = &cpu_s390_info_ptr; cpu_info.n_s390_entry = 1; cpu_s390_info.n_hwcap = 2; cpu_s390_info.hwcap = rt_cpuinfo.hwcap; ret = pb_write_one(img, &cpu_info, PB_CPUINFO); close_image(img); return ret; } int cpu_validate_cpuinfo(void) { CpuinfoS390Entry *cpu_s390_entry; CpuinfoEntry *cpu_info; struct cr_img *img; int cap, nr, ret; img = open_image(CR_FD_CPUINFO, O_RSTR); if (!img) return -1; ret = 0; if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; if (cpu_info->n_s390_entry != 1) { pr_err("No S390 related entry in image\n"); goto error; } cpu_s390_entry = cpu_info->s390_entry[0]; if (cpu_s390_entry->n_hwcap != 2) { pr_err("Hardware capabilities information missing\n"); ret = -1; goto 
error; } print_hwcaps("Host", rt_cpuinfo.hwcap); print_hwcaps("Image", cpu_s390_entry->hwcap); for (nr = 0; nr < 2; nr++) { for (cap = 0; cap < 64; cap++) { if (!(cpu_s390_entry->hwcap[nr] & (1 << cap))) continue; if (rt_cpuinfo.hwcap[nr] & (1 << cap)) continue; if (hwcap_str[nr][cap]) pr_err("CPU Feature %s not supported on host\n", hwcap_str[nr][cap]); else pr_err("CPU Feature %d/%x not supported on host\n", nr, 1 << cap); ret = -1; } } if (ret == -1) pr_err("See also: /usr/include/bits/hwcap.h\n"); error: close_image(img); return ret; } int cpuinfo_dump(void) { if (cpu_init()) return -1; if (cpu_dump_cpuinfo()) return -1; return 0; } int cpuinfo_check(void) { if (cpu_init()) return 1; if (cpu_validate_cpuinfo()) return 1; return 0; } crac-criu-1.5.0/criu/arch/s390/crtools.c000066400000000000000000000432301471504326700176060ustar00rootroot00000000000000#include #include #include #include #include #include #include "types.h" #include #include "asm/restorer.h" #include "asm/dump.h" #include "cr_options.h" #include "common/compiler.h" #include #include "parasite-syscall.h" #include "log.h" #include "util.h" #include "cpu.h" #include "compel/infect.h" #include "protobuf.h" #include "images/core.pb-c.h" #include "images/creds.pb-c.h" #include "ptrace.h" #include "pstree.h" #include "image.h" #define NT_PRFPREG 2 #define NT_S390_VXRS_LOW 0x309 #define NT_S390_VXRS_HIGH 0x30a #define NT_S390_GS_CB 0x30b #define NT_S390_GS_BC 0x30c #define NT_S390_RI_CB 0x30d /* * Print general purpose and access registers */ static void print_core_gpregs(const char *msg, UserS390RegsEntry *gpregs) { int i; pr_debug("%s: General purpose registers\n", msg); pr_debug(" psw %016lx %016lx\n", gpregs->psw_mask, gpregs->psw_addr); pr_debug(" orig_gpr2 %016lx\n", gpregs->orig_gpr2); for (i = 0; i < 16; i++) pr_debug(" g%02d %016lx\n", i, gpregs->gprs[i]); for (i = 0; i < 16; i++) pr_debug(" a%02d %08x\n", i, gpregs->acrs[i]); } /* * Print vector registers */ static void 
print_core_vx_regs(CoreEntry *core) { UserS390VxrsHighEntry *vxrs_high; UserS390VxrsLowEntry *vxrs_low; int i; vxrs_high = CORE_THREAD_ARCH_INFO(core)->vxrs_high; vxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low; if (vxrs_low == NULL) { pr_debug(" No VXRS\n"); return; } for (i = 0; i < 16; i++) pr_debug(" vx_low%02d %016lx\n", i, vxrs_low->regs[i]); for (i = 0; i < 32; i += 2) pr_debug(" vx_high%02d %016lx %016lx\n", i / 2, vxrs_high->regs[i], vxrs_high->regs[i + 1]); } /* * Print guarded-storage control block */ static void print_core_gs_cb(CoreEntry *core) { UserS390GsCbEntry *gs_cb; int i; gs_cb = CORE_THREAD_ARCH_INFO(core)->gs_cb; if (!gs_cb) { pr_debug(" No GS_CB\n"); return; } for (i = 0; i < 4; i++) pr_debug(" gs_cb%d %lx\n", i, gs_cb->regs[i]); } /* * Print guarded-storage broadcast control block */ static void print_core_gs_bc(CoreEntry *core) { UserS390GsCbEntry *gs_bc; int i; gs_bc = CORE_THREAD_ARCH_INFO(core)->gs_bc; if (!gs_bc) { pr_debug(" No GS_BC\n"); return; } for (i = 0; i < 4; i++) pr_debug(" gs_bc%d %lx\n", i, gs_bc->regs[i]); } /* * Print runtime-instrumentation control block */ static void print_core_ri_cb(CoreEntry *core) { UserS390RiEntry *ri_cb; int i; ri_cb = CORE_THREAD_ARCH_INFO(core)->ri_cb; if (!ri_cb) { pr_debug(" No RI_CB\n"); return; } for (i = 0; i < 8; i++) pr_debug(" ri_cb%d %lx\n", i, ri_cb->regs[i]); } /* * Print architecture registers */ static void print_core_fp_regs(const char *msg, CoreEntry *core) { UserS390FpregsEntry *fpregs; int i; fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; pr_debug("%s: Floating point registers\n", msg); pr_debug(" fpc %08x\n", fpregs->fpc); for (i = 0; i < 16; i++) pr_debug(" f%02d %016lx\n", i, fpregs->fprs[i]); print_core_vx_regs(core); print_core_gs_cb(core); print_core_gs_bc(core); print_core_ri_cb(core); } /* * Allocate VxrsLow registers */ static UserS390VxrsLowEntry *allocate_vxrs_low_regs(void) { UserS390VxrsLowEntry *vxrs_low; vxrs_low = xmalloc(sizeof(*vxrs_low)); if (!vxrs_low) return 
NULL; user_s390_vxrs_low_entry__init(vxrs_low); vxrs_low->n_regs = 16; vxrs_low->regs = xzalloc(16 * sizeof(uint64_t)); if (!vxrs_low->regs) goto fail_free_vxrs_low; return vxrs_low; fail_free_vxrs_low: xfree(vxrs_low); return NULL; } /* * Free VxrsLow registers */ static void free_vxrs_low_regs(UserS390VxrsLowEntry *vxrs_low) { if (vxrs_low) { xfree(vxrs_low->regs); xfree(vxrs_low); } } /* * Allocate VxrsHigh registers */ static UserS390VxrsHighEntry *allocate_vxrs_high_regs(void) { UserS390VxrsHighEntry *vxrs_high; vxrs_high = xmalloc(sizeof(*vxrs_high)); if (!vxrs_high) return NULL; user_s390_vxrs_high_entry__init(vxrs_high); vxrs_high->n_regs = 32; vxrs_high->regs = xzalloc(32 * sizeof(uint64_t)); if (!vxrs_high->regs) goto fail_free_vxrs_high; return vxrs_high; fail_free_vxrs_high: xfree(vxrs_high); return NULL; } /* * Free VxrsHigh registers */ static void free_vxrs_high_regs(UserS390VxrsHighEntry *vxrs_high) { if (vxrs_high) { xfree(vxrs_high->regs); xfree(vxrs_high); } } /* * Allocate guarded-storage control block (GS_CB and GS_BC) */ static UserS390GsCbEntry *allocate_gs_cb(void) { UserS390GsCbEntry *gs_cb; gs_cb = xmalloc(sizeof(*gs_cb)); if (!gs_cb) return NULL; user_s390_gs_cb_entry__init(gs_cb); gs_cb->n_regs = 4; gs_cb->regs = xzalloc(4 * sizeof(uint64_t)); if (!gs_cb->regs) goto fail_free_gs_cb; return gs_cb; fail_free_gs_cb: xfree(gs_cb); return NULL; } /* * Free Guarded Storage control blocks */ static void free_gs_cb(UserS390GsCbEntry *gs_cb) { if (gs_cb) { xfree(gs_cb->regs); xfree(gs_cb); } } /* * Allocate runtime-instrumentation control block */ static UserS390RiEntry *allocate_ri_cb(void) { UserS390RiEntry *ri_cb; ri_cb = xmalloc(sizeof(*ri_cb)); if (!ri_cb) return NULL; user_s390_ri_entry__init(ri_cb); ri_cb->ri_on = 0; ri_cb->n_regs = 8; ri_cb->regs = xzalloc(8 * sizeof(uint64_t)); if (!ri_cb->regs) goto fail_free_ri_cb; return ri_cb; fail_free_ri_cb: xfree(ri_cb); return NULL; } /* * Free runtime-instrumentation control block */ static void 
free_ri_cb(UserS390RiEntry *ri_cb) { if (ri_cb) { xfree(ri_cb->regs); xfree(ri_cb); } } /* * Copy internal structures into Google Protocol Buffers */ int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) { UserS390VxrsHighEntry *vxrs_high = NULL; UserS390VxrsLowEntry *vxrs_low = NULL; UserS390FpregsEntry *fpregs = NULL; UserS390RegsEntry *gpregs = NULL; UserS390GsCbEntry *gs_cb = NULL; UserS390GsCbEntry *gs_bc = NULL; UserS390RiEntry *ri_cb = NULL; CoreEntry *core = arg; gpregs = CORE_THREAD_ARCH_INFO(core)->gpregs; fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; /* Vector registers */ if (f->flags & USER_FPREGS_VXRS) { vxrs_low = allocate_vxrs_low_regs(); if (!vxrs_low) return -1; vxrs_high = allocate_vxrs_high_regs(); if (!vxrs_high) goto fail_free_vxrs_low; memcpy(vxrs_low->regs, &f->vxrs_low, sizeof(f->vxrs_low)); memcpy(vxrs_high->regs, &f->vxrs_high, sizeof(f->vxrs_high)); CORE_THREAD_ARCH_INFO(core)->vxrs_low = vxrs_low; CORE_THREAD_ARCH_INFO(core)->vxrs_high = vxrs_high; } /* Guarded-storage control block */ if (f->flags & USER_GS_CB) { gs_cb = allocate_gs_cb(); if (!gs_cb) goto fail_free_gs_cb; memcpy(gs_cb->regs, &f->gs_cb, sizeof(f->gs_cb)); CORE_THREAD_ARCH_INFO(core)->gs_cb = gs_cb; } /* Guarded-storage broadcast control block */ if (f->flags & USER_GS_BC) { gs_bc = allocate_gs_cb(); if (!gs_bc) goto fail_free_gs_bc; memcpy(gs_bc->regs, &f->gs_bc, sizeof(f->gs_bc)); CORE_THREAD_ARCH_INFO(core)->gs_bc = gs_bc; } /* Runtime-instrumentation control block */ if (f->flags & USER_RI_CB) { ri_cb = allocate_ri_cb(); if (!ri_cb) goto fail_free_ri_cb; memcpy(ri_cb->regs, &f->ri_cb, sizeof(f->ri_cb)); CORE_THREAD_ARCH_INFO(core)->ri_cb = ri_cb; /* We need to remember that the RI bit was on */ if (f->flags & USER_RI_ON) ri_cb->ri_on = 1; } /* General purpose registers */ memcpy(gpregs->gprs, u->prstatus.gprs, sizeof(u->prstatus.gprs)); gpregs->psw_mask = u->prstatus.psw.mask; gpregs->psw_addr = u->prstatus.psw.addr; /* Access registers */ 
memcpy(gpregs->acrs, u->prstatus.acrs, sizeof(u->prstatus.acrs)); /* System call */ gpregs->system_call = u->system_call; /* Floating point registers */ fpregs->fpc = f->prfpreg.fpc; memcpy(fpregs->fprs, f->prfpreg.fprs, sizeof(f->prfpreg.fprs)); return 0; fail_free_ri_cb: free_ri_cb(ri_cb); fail_free_gs_cb: free_gs_cb(gs_cb); fail_free_gs_bc: free_gs_cb(gs_bc); fail_free_vxrs_low: free_vxrs_low_regs(vxrs_low); return -1; } /* * Copy general and access registers to signal frame */ int restore_gpregs(struct rt_sigframe *f, UserS390RegsEntry *src) { _sigregs *dst = &f->uc.uc_mcontext; dst->regs.psw.mask = src->psw_mask; dst->regs.psw.addr = src->psw_addr; memcpy(dst->regs.gprs, src->gprs, sizeof(dst->regs.gprs)); memcpy(dst->regs.acrs, src->acrs, sizeof(dst->regs.acrs)); print_core_gpregs("restore_gpregs_regs", src); return 0; } /* * Copy floating point and vector registers to mcontext */ int restore_fpu(struct rt_sigframe *f, CoreEntry *core) { UserS390VxrsHighEntry *vxrs_high; UserS390VxrsLowEntry *vxrs_low; UserS390FpregsEntry *fpregs; _sigregs *dst = &f->uc.uc_mcontext; _sigregs_ext *dst_ext = &f->uc.uc_mcontext_ext; fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; vxrs_high = CORE_THREAD_ARCH_INFO(core)->vxrs_high; vxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low; dst->fpregs.fpc = fpregs->fpc; memcpy(dst->fpregs.fprs, fpregs->fprs, sizeof(dst->fpregs.fprs)); if (vxrs_low) { memcpy(&dst_ext->vxrs_low, vxrs_low->regs, sizeof(dst_ext->vxrs_low)); memcpy(&dst_ext->vxrs_high, vxrs_high->regs, sizeof(dst_ext->vxrs_high)); } return 0; } /* * Allocate floating point registers */ static UserS390FpregsEntry *allocate_fp_regs(void) { UserS390FpregsEntry *fpregs; fpregs = xmalloc(sizeof(*fpregs)); if (!fpregs) return NULL; user_s390_fpregs_entry__init(fpregs); fpregs->n_fprs = 16; fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); if (!fpregs->fprs) goto fail_free_fpregs; return fpregs; fail_free_fpregs: xfree(fpregs); return NULL; } /* * Free floating point registers */ static 
void free_fp_regs(UserS390FpregsEntry *fpregs) { xfree(fpregs->fprs); xfree(fpregs); } /* * Allocate general purpose and access registers */ static UserS390RegsEntry *allocate_gp_regs(void) { UserS390RegsEntry *gpregs; gpregs = xmalloc(sizeof(*gpregs)); if (!gpregs) return NULL; user_s390_regs_entry__init(gpregs); gpregs->n_gprs = 16; gpregs->gprs = xzalloc(16 * sizeof(uint64_t)); if (!gpregs->gprs) goto fail_free_gpregs; gpregs->n_acrs = 16; gpregs->acrs = xzalloc(16 * sizeof(uint32_t)); if (!gpregs->acrs) goto fail_free_gprs; return gpregs; fail_free_gprs: xfree(gpregs->gprs); fail_free_gpregs: xfree(gpregs); return NULL; } /* * Free general purpose and access registers */ static void free_gp_regs(UserS390RegsEntry *gpregs) { xfree(gpregs->gprs); xfree(gpregs->acrs); xfree(gpregs); } /* * Allocate thread info */ int arch_alloc_thread_info(CoreEntry *core) { ThreadInfoS390 *ti_s390; ti_s390 = xmalloc(sizeof(*ti_s390)); if (!ti_s390) return -1; thread_info_s390__init(ti_s390); ti_s390->gpregs = allocate_gp_regs(); if (!ti_s390->gpregs) goto fail_free_ti_s390; ti_s390->fpregs = allocate_fp_regs(); if (!ti_s390->fpregs) goto fail_free_gp_regs; CORE_THREAD_ARCH_INFO(core) = ti_s390; return 0; fail_free_gp_regs: free_gp_regs(ti_s390->gpregs); fail_free_ti_s390: xfree(ti_s390); return -1; } /* * Free thread info */ void arch_free_thread_info(CoreEntry *core) { if (!CORE_THREAD_ARCH_INFO(core)) return; free_gp_regs(CORE_THREAD_ARCH_INFO(core)->gpregs); free_fp_regs(CORE_THREAD_ARCH_INFO(core)->fpregs); free_vxrs_low_regs(CORE_THREAD_ARCH_INFO(core)->vxrs_low); free_vxrs_high_regs(CORE_THREAD_ARCH_INFO(core)->vxrs_high); free_gs_cb(CORE_THREAD_ARCH_INFO(core)->gs_cb); free_gs_cb(CORE_THREAD_ARCH_INFO(core)->gs_bc); free_ri_cb(CORE_THREAD_ARCH_INFO(core)->ri_cb); xfree(CORE_THREAD_ARCH_INFO(core)); CORE_THREAD_ARCH_INFO(core) = NULL; } /* * Set regset for pid */ static int setregset(int pid, int set, const char *set_str, struct iovec *iov) { if (ptrace(PTRACE_SETREGSET, 
pid, set, iov) == 0) return 0; pr_perror("Couldn't set %s registers for pid %d", set_str, pid); return -1; } /* * Set floating point registers for pid from fpregs */ static int set_fp_regs(pid_t pid, user_fpregs_struct_t *fpregs) { struct iovec iov; iov.iov_base = &fpregs->prfpreg; iov.iov_len = sizeof(fpregs->prfpreg); return setregset(pid, NT_PRFPREG, "PRFPREG", &iov); } /* * Set vector registers */ static int set_vx_regs(pid_t pid, user_fpregs_struct_t *fpregs) { struct iovec iov; if (!(fpregs->flags & USER_FPREGS_VXRS)) return 0; iov.iov_base = &fpregs->vxrs_low; iov.iov_len = sizeof(fpregs->vxrs_low); if (setregset(pid, NT_S390_VXRS_LOW, "S390_VXRS_LOW", &iov)) return -1; iov.iov_base = &fpregs->vxrs_high; iov.iov_len = sizeof(fpregs->vxrs_high); return setregset(pid, NT_S390_VXRS_HIGH, "S390_VXRS_HIGH", &iov); } /* * Set guarded-storage control block */ static int set_gs_cb(pid_t pid, user_fpregs_struct_t *fpregs) { struct iovec iov; if (fpregs->flags & USER_GS_CB) { iov.iov_base = &fpregs->gs_cb; iov.iov_len = sizeof(fpregs->gs_cb); if (setregset(pid, NT_S390_GS_CB, "S390_GS_CB", &iov)) return -1; } if (!(fpregs->flags & USER_GS_BC)) return 0; iov.iov_base = &fpregs->gs_bc; iov.iov_len = sizeof(fpregs->gs_bc); return setregset(pid, NT_S390_GS_BC, "S390_GS_BC", &iov); } /* * Set runtime-instrumentation control block */ static int set_ri_cb(pid_t pid, user_fpregs_struct_t *fpregs) { struct iovec iov; if (!(fpregs->flags & USER_RI_CB)) return 0; iov.iov_base = &fpregs->ri_cb; iov.iov_len = sizeof(fpregs->ri_cb); return setregset(pid, NT_S390_RI_CB, "S390_RI_CB", &iov); } /* * Set runtime-instrumentation bit * * The CPU collects information when the RI bit of the PSW is set. * The RI control block is not part of the signal frame. Therefore during * sigreturn it is not set. If the RI control block is present, the CPU * writes into undefined storage. 
Hence, we have disabled the RI bit in * the sigreturn PSW and set this bit after sigreturn by modifying the PSW * of the task. */ static int set_ri_bit(pid_t pid) { user_regs_struct_t regs; struct iovec iov; psw_t *psw; iov.iov_base = ®s.prstatus; iov.iov_len = sizeof(regs.prstatus); if (ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov) < 0) { pr_perror("Fail to activate RI bit"); return -1; } psw = ®s.prstatus.psw; psw->mask |= PSW_MASK_RI; return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); } /* * Restore registers not present in sigreturn signal frame */ static int set_task_regs_nosigrt(pid_t pid, CoreEntry *core) { user_fpregs_struct_t fpregs; UserS390GsCbEntry *cgs_cb; UserS390GsCbEntry *cgs_bc; UserS390RiEntry *cri_cb; int ret = 0; memset(&fpregs, 0, sizeof(fpregs)); /* Guarded-storage control block (optional) */ cgs_cb = CORE_THREAD_ARCH_INFO(core)->gs_cb; if (cgs_cb != NULL) { fpregs.flags |= USER_GS_CB; memcpy(&fpregs.gs_cb, cgs_cb->regs, sizeof(fpregs.gs_cb)); } /* Guarded-storage broadcast control block (optional) */ cgs_bc = CORE_THREAD_ARCH_INFO(core)->gs_bc; if (cgs_bc != NULL) { fpregs.flags |= USER_GS_BC; memcpy(&fpregs.gs_bc, cgs_bc->regs, sizeof(fpregs.gs_bc)); } if (set_gs_cb(pid, &fpregs) < 0) return -1; /* Runtime-instrumentation control block (optional) */ cri_cb = CORE_THREAD_ARCH_INFO(core)->ri_cb; if (cri_cb != NULL) { fpregs.flags |= USER_RI_CB; memcpy(&fpregs.ri_cb, cri_cb->regs, sizeof(fpregs.ri_cb)); if (set_ri_cb(pid, &fpregs) < 0) return -1; if (cri_cb->ri_on) { fpregs.flags |= USER_RI_ON; ret = set_ri_bit(pid); } } return ret; } /* * Restore registers for pid from core */ static int set_task_regs(pid_t pid, CoreEntry *core) { UserS390VxrsHighEntry *cvxrs_high; UserS390VxrsLowEntry *cvxrs_low; UserS390FpregsEntry *cfpregs; user_fpregs_struct_t fpregs; memset(&fpregs, 0, sizeof(fpregs)); /* Floating point registers */ cfpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; if (!cfpregs) return -1; fpregs.prfpreg.fpc = cfpregs->fpc; 
memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); if (set_fp_regs(pid, &fpregs) < 0) return -1; /* Vector registers (optional) */ cvxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low; if (cvxrs_low != NULL) { cvxrs_high = CORE_THREAD_ARCH_INFO(core)->vxrs_high; if (!cvxrs_high) return -1; fpregs.flags |= USER_FPREGS_VXRS; memcpy(&fpregs.vxrs_low, cvxrs_low->regs, sizeof(fpregs.vxrs_low)); memcpy(&fpregs.vxrs_high, cvxrs_high->regs, sizeof(fpregs.vxrs_high)); if (set_vx_regs(pid, &fpregs) < 0) return -1; } return set_task_regs_nosigrt(pid, core); } /* * Restore registers for all threads: * - Floating point registers * - Vector registers * - Guarded-storage control block * - Guarded-storage broadcast control block * - Runtime-instrumentation control block */ int arch_set_thread_regs(struct pstree_item *item, bool with_threads) { int i; for_each_pstree_item(item) { if (item->pid->state == TASK_DEAD || item->pid->state == TASK_ZOMBIE) continue; for (i = 0; i < item->nr_threads; i++) { if (item->threads[i].state == TASK_DEAD || item->threads[i].state == TASK_ZOMBIE) continue; if (!with_threads && i > 0) continue; if (set_task_regs(item->threads[i].real, item->core[i])) { pr_perror("Not set registers for task %d", item->threads[i].real); return -1; } } } return 0; } static int open_core(int pid, CoreEntry **pcore) { struct cr_img *img; int ret; img = open_image(CR_FD_CORE, O_RSTR, pid); if (!img) { pr_err("Can't open core data for %d\n", pid); return -1; } ret = pb_read_one(img, pcore, PB_CORE); close_image(img); return ret <= 0 ? 
-1 : 0; } /* * Restore all registers not present in sigreturn signal frame * * - Guarded-storage control block * - Guarded-storage broadcast control block * - Runtime-instrumentation control block */ int arch_set_thread_regs_nosigrt(struct pid *pid) { CoreEntry *core; core = xmalloc(sizeof(*core)); if (open_core(pid->ns[0].virt, &core) < 0) { pr_perror("Cannot open core for virt pid %d", pid->ns[0].virt); return -1; } if (set_task_regs_nosigrt(pid->real, core) < 0) { pr_perror("Set register for pid %d", pid->real); return -1; } print_core_fp_regs("restore_fp_regs", core); return 0; } crac-criu-1.5.0/criu/arch/s390/include/000077500000000000000000000000001471504326700173765ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/s390/include/asm/000077500000000000000000000000001471504326700201565ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/s390/include/asm/dump.h000066400000000000000000000005401471504326700212730ustar00rootroot00000000000000#ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f); int arch_alloc_thread_info(CoreEntry *core); void arch_free_thread_info(CoreEntry *core); static inline void core_put_tls(CoreEntry *core, tls_t tls) { } #define get_task_futex_robust_list_compat(pid, info) -1 #endif crac-criu-1.5.0/criu/arch/s390/include/asm/int.h000066400000000000000000000001571471504326700211240ustar00rootroot00000000000000#ifndef __CR_ASM_INT_H__ #define __CR_ASM_INT_H__ #include "asm-generic/int.h" #endif /* __CR_ASM_INT_H__ */ crac-criu-1.5.0/criu/arch/s390/include/asm/kerndat.h000066400000000000000000000002341471504326700217560ustar00rootroot00000000000000#ifndef __CR_ASM_KERNDAT_H__ #define __CR_ASM_KERNDAT_H__ #define kdat_compatible_cr() 0 #define kdat_can_map_vdso() 0 #endif /* __CR_ASM_KERNDAT_H__ */ crac-criu-1.5.0/criu/arch/s390/include/asm/parasite-syscall.h000066400000000000000000000001521471504326700236050ustar00rootroot00000000000000#ifndef 
__CR_ASM_PARASITE_SYSCALL_H__ #define __CR_ASM_PARASITE_SYSCALL_H__ struct parasite_ctl; #endif crac-criu-1.5.0/criu/arch/s390/include/asm/parasite.h000066400000000000000000000002741471504326700221420ustar00rootroot00000000000000#ifndef __ASM_PARASITE_H__ #define __ASM_PARASITE_H__ /* TLS is accessed through %a01, which is already processed */ static inline void arch_get_tls(tls_t *ptls) { (void)ptls; } #endif crac-criu-1.5.0/criu/arch/s390/include/asm/restore.h000066400000000000000000000013761471504326700220210ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORE_H__ #define __CR_ASM_RESTORE_H__ #include "asm/restorer.h" #include "images/core.pb-c.h" /* * Load stack to %r15, return address in %r14 and argument 1 into %r2 */ /* clang-format off */ #define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ task_args) \ asm volatile( \ "lgr %%r15,%0\n" \ "lgr %%r14,%1\n" \ "lgr %%r2,%2\n" \ "basr %%r14,%%r14\n" \ : \ : "d" (new_sp), \ "d"((unsigned long)restore_task_exec_start), \ "d" (task_args) \ : "2", "14", "memory") /* clang-format on */ /* There is nothing to do since TLS is accessed through %a01 */ #define core_get_tls(pcore, ptls) int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); #endif crac-criu-1.5.0/criu/arch/s390/include/asm/restorer.h000066400000000000000000000071231471504326700221770ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ #include #include #include "asm/types.h" #include "sigframe.h" /* * Clone trampoline - see glibc sysdeps/unix/sysv/linux/s390/s390-64/clone.S */ /* clang-format off */ #define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ thread_args, clone_restore_fn) \ asm volatile( \ "lgr %%r0,%6\n" /* Save thread_args in %r0 */ \ "lgr %%r1,%5\n" /* Save clone_restore_fn in %r1 */ \ "lgr %%r2,%2\n" /* Parameter 1: new_sp (child stack) */ \ "lgr %%r3,%1\n" /* Parameter 2: clone_flags */ \ "lgr %%r4,%3\n" /* Parameter 3: &parent_tid */ \ "lgr %%r5,%4\n" /* Parameter 4: 
&thread_args[i].pid */ \ "lghi %%r6,0\n" /* Parameter 5: tls = 0 */ \ "svc "__stringify(__NR_clone)"\n" \ "ltgr %0,%%r2\n" /* Set and check "ret" */ \ "jnz 0f\n" /* ret != 0: Continue caller */ \ "lgr %%r2,%%r0\n" /* Parameter 1: &thread_args */ \ "aghi %%r15,-160\n" /* Prepare stack frame */ \ "xc 0(8,%%r15),0(%%r15)\n" \ "basr %%r14,%%r1\n" /* Jump to clone_restore_fn() */ \ "j .+2\n" /* BUG(): Force PGM check */ \ "0:\n" /* Continue caller */ \ : "=d"(ret) \ : "d"(clone_flags), \ "a"(new_sp), \ "d"(&parent_tid), \ "d"(&thread_args[i].pid), \ "d"(clone_restore_fn), \ "d"(&thread_args[i]) \ : "0", "1", "2", "3", "4", "5", "6", "cc", "memory") #define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ clone_restore_fn) \ asm volatile( \ /* * clone3 only needs two arguments (r2, r3), this means * we can use r4 and r5 for args and thread function. * r4 and r5 are callee-saved and are not overwritten. * No need to put these values on the child stack. */ \ "lgr %%r4,%4\n" /* Save args in %r4 */ \ "lgr %%r5,%3\n" /* Save clone_restore_fn in %r5 */ \ "lgr %%r2,%1\n" /* Parameter 1: clone_args */ \ "lgr %%r3,%2\n" /* Parameter 2: size */ \ /* * On s390x a syscall is done sc . * That only works for syscalls < 255. clone3 is 435, * therefore it is necessary to load the syscall number * into r1 and do 'svc 0'. */ \ "lghi %%r1,"__stringify(__NR_clone3)"\n" \ "svc 0\n" \ "ltgr %0,%%r2\n" /* Set and check "ret" */ \ "jnz 0f\n" /* ret != 0: Continue caller */ \ "lgr %%r2,%%r4\n" /* Thread arguments taken from r4. */ \ "lgr %%r1,%%r5\n" /* Thread function taken from r5. 
*/ \ "aghi %%r15,-160\n" /* Prepare stack frame */ \ "xc 0(8,%%r15),0(%%r15)\n" \ "basr %%r14,%%r1\n" /* Jump to clone_restore_fn() */ \ "j .+2\n" /* BUG(): Force PGM check */ \ "0:\n" /* Continue caller */ \ : "=d"(ret) \ : "a"(&clone_args), \ "d"(size), \ "d"(clone_restore_fn), \ "d"(args) \ : "0", "1", "2", "3", "4", "5", "cc", "memory") /* clang-format on */ #define arch_map_vdso(map, compat) -1 int restore_gpregs(struct rt_sigframe *f, UserS390RegsEntry *r); int restore_nonsigframe_gpregs(UserS390RegsEntry *r); unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg); unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset); static inline void restore_tls(tls_t *ptls) { (void)ptls; } static inline void *alloc_compat_syscall_stack(void) { return NULL; } static inline void free_compat_syscall_stack(void *stack32) { } static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; } static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) { return -1; } #endif /*__CR_ASM_RESTORER_H__*/ crac-criu-1.5.0/criu/arch/s390/include/asm/thread_pointer.h000066400000000000000000000017751471504326700233500ustar00rootroot00000000000000/* __thread_pointer definition. Generic version. Copyright (C) 2021 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with the GNU C Library. If not, see . */ #ifndef _SYS_THREAD_POINTER_H #define _SYS_THREAD_POINTER_H static inline void *__criu_thread_pointer(void) { return __builtin_thread_pointer(); } #endif /* _SYS_THREAD_POINTER_H */ crac-criu-1.5.0/criu/arch/s390/include/asm/types.h000066400000000000000000000016311471504326700214740ustar00rootroot00000000000000#ifndef _UAPI_S390_TYPES_H #define _UAPI_S390_TYPES_H #include #include #include "images/core.pb-c.h" #include "page.h" #include "bitops.h" #include "asm/int.h" #include typedef UserS390RegsEntry UserRegsEntry; #define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__S390 #define core_is_compat(core) false #define CORE_THREAD_ARCH_INFO(core) core->ti_s390 #define TI_IP(core) ((core)->ti_s390->gpregs->psw_addr) static inline u64 encode_pointer(void *p) { return (u64)p; } static inline void *decode_pointer(u64 v) { return (void *)v; } /* * See also: * * arch/s390/include/uapi/asm/auxvec.h * * include/linux/auxvec.h */ #define AT_VECTOR_SIZE_BASE 20 #define AT_VECTOR_SIZE_ARCH 1 #define AT_VECTOR_SIZE (2 * (AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) typedef uint64_t auxv_t; typedef uint64_t tls_t; #endif /* _UAPI_S390_TYPES_H */ crac-criu-1.5.0/criu/arch/s390/include/asm/vdso.h000066400000000000000000000014511471504326700213030ustar00rootroot00000000000000#ifndef __CR_ASM_VDSO_H__ #define __CR_ASM_VDSO_H__ #include "asm/int.h" #include "asm-generic/vdso.h" /* * This is a minimal amount of symbols * we should support at the moment. 
*/ #define VDSO_SYMBOL_MAX 4 #define VDSO_SYMBOL_GTOD 0 /* * These definitions are used in pie/util-vdso.c to initialize the vdso symbol * name string table 'vdso_symbols' */ #define ARCH_VDSO_SYMBOLS_LIST \ const char *aarch_vdso_symbol1 = "__kernel_gettimeofday"; \ const char *aarch_vdso_symbol2 = "__kernel_clock_gettime"; \ const char *aarch_vdso_symbol3 = "__kernel_clock_getres"; \ const char *aarch_vdso_symbol4 = "__kernel_getcpu"; #define ARCH_VDSO_SYMBOLS aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4 #endif /* __CR_ASM_VDSO_H__ */ crac-criu-1.5.0/criu/arch/s390/restorer.c000066400000000000000000000013051471504326700177630ustar00rootroot00000000000000#include #include "restorer.h" #include "asm/restorer.h" #include #include #include "log.h" /* * All registers are restored by sigreturn - nothing to do here */ int restore_nonsigframe_gpregs(UserS390RegsEntry *r) { return 0; } /* * Call underlying ipc system call for shmat */ unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg) { unsigned long raddr; int ret; ret = sys_ipc(21 /*SHMAT */, shmid, /* first */ shmflg, /* second */ (unsigned long)&raddr, /* third */ shmaddr, /* ptr */ 0 /* fifth not used */); if (ret) raddr = (unsigned long)ret; return raddr; } crac-criu-1.5.0/criu/arch/s390/sigframe.c000066400000000000000000000006351471504326700177200ustar00rootroot00000000000000#include #include #include "asm/sigframe.h" #include "asm/types.h" #include "log.h" /* * Nothing to do since we don't have any pointers to adjust * in the signal frame. 
* * - sigframe : Pointer to local signal frame * - rsigframe: Pointer to remote signal frame of inferior */ int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } crac-criu-1.5.0/criu/arch/s390/vdso-pie.c000066400000000000000000000027051471504326700176510ustar00rootroot00000000000000#include #include "asm/types.h" #include #include #include "parasite-vdso.h" #include "log.h" #include "common/bug.h" #ifdef LOG_PREFIX #undef LOG_PREFIX #endif #define LOG_PREFIX "vdso: " /* * Trampoline instruction sequence */ typedef struct { u8 larl[6]; /* Load relative address of imm64 */ u8 lg[6]; /* Load %r1 with imm64 */ u8 br[2]; /* Branch to %r1 */ u64 addr; /* Jump address */ u32 guards; /* Guard bytes */ } __packed jmp_t; /* * Trampoline template: Use %r1 to jump */ jmp_t jmp = { /* larl %r1,e (addr) */ .larl = { 0xc0, 0x10, 0x00, 0x00, 0x00, 0x07 }, /* lg %r1,0(%r1) */ .lg = { 0xe3, 0x10, 0x10, 0x00, 0x00, 0x04 }, /* br %r1 */ .br = { 0x07, 0xf1 }, .guards = 0xcccccccc, }; /* * Insert trampoline code into old vdso entry points to * jump to new vdso functions. 
*/ int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *to, struct vdso_symtable *from, bool __always_unused compat_vdso) { unsigned int i; for (i = 0; i < ARRAY_SIZE(to->symbols); i++) { if (vdso_symbol_empty(&from->symbols[i])) continue; pr_debug("jmp: %s: %lx/%lx -> %lx/%lx (index %d)\n", from->symbols[i].name, base_from, from->symbols[i].offset, base_to, to->symbols[i].offset, i); jmp.addr = base_to + to->symbols[i].offset; memcpy((void *)(base_from + from->symbols[i].offset), &jmp, sizeof(jmp)); } return 0; } crac-criu-1.5.0/criu/arch/x86/000077500000000000000000000000001471504326700157025ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/x86/Makefile000066400000000000000000000005201471504326700173370ustar00rootroot00000000000000builtin-name := crtools.built-in.o asflags-y += -Wstrict-prototypes asflags-y += -nostdlib -fomit-frame-pointer asflags-y += -iquote $(obj)/include ldflags-y += -r -z noexecstack obj-y += cpu.o obj-y += crtools.o obj-y += kerndat.o obj-y += sigframe.o ifeq ($(CONFIG_COMPAT),y) obj-y += sigaction_compat.o endif crac-criu-1.5.0/criu/arch/x86/cpu.c000066400000000000000000000334741471504326700166500ustar00rootroot00000000000000#include #include #include #include #include #include #include "bitops.h" #include "asm/cpu.h" #include #include #include "common/compiler.h" #include "cr_options.h" #include "image.h" #include "util.h" #include "log.h" #include "cpu.h" #include "protobuf.h" #include "images/cpuinfo.pb-c.h" #undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_cpu_info; static int cpu_has_unsupported_features(void) { /* * Put any unsupported features here. */ return 0; } int cpu_init(void) { compel_cpu_copy_cpuinfo(&rt_cpu_info); BUILD_BUG_ON(sizeof(struct xsave_struct) != XSAVE_SIZE); BUILD_BUG_ON(sizeof(struct i387_fxsave_struct) != FXSAVE_SIZE); /* * Make sure that at least FPU is onboard * and fxsave is supported. 
*/ if (compel_cpu_has_feature(X86_FEATURE_FPU)) { if (!compel_cpu_has_feature(X86_FEATURE_FXSR)) { pr_err("missing support fxsave/restore insns\n"); return -1; } } pr_debug("fpu:%d fxsr:%d xsave:%d xsaveopt:%d xsavec:%d xgetbv1:%d xsaves:%d\n", !!compel_cpu_has_feature(X86_FEATURE_FPU), !!compel_cpu_has_feature(X86_FEATURE_FXSR), !!compel_cpu_has_feature(X86_FEATURE_OSXSAVE), !!compel_cpu_has_feature(X86_FEATURE_XSAVEOPT), !!compel_cpu_has_feature(X86_FEATURE_XSAVEC), !!compel_cpu_has_feature(X86_FEATURE_XGETBV1), !!compel_cpu_has_feature(X86_FEATURE_XSAVES)); return cpu_has_unsupported_features() ? -1 : 0; } int cpu_dump_cpuinfo(void) { CpuinfoEntry cpu_info = CPUINFO_ENTRY__INIT; CpuinfoX86Entry cpu_x86_info = CPUINFO_X86_ENTRY__INIT; CpuinfoX86Entry *cpu_x86_info_ptr = &cpu_x86_info; struct cr_img *img; img = open_image(CR_FD_CPUINFO, O_DUMP); if (!img) return -1; cpu_info.x86_entry = &cpu_x86_info_ptr; cpu_info.n_x86_entry = 1; cpu_x86_info.vendor_id = (rt_cpu_info.x86_vendor == X86_VENDOR_INTEL) ? 
CPUINFO_X86_ENTRY__VENDOR__INTEL : CPUINFO_X86_ENTRY__VENDOR__AMD; cpu_x86_info.cpu_family = rt_cpu_info.x86_family; cpu_x86_info.model = rt_cpu_info.x86_model; cpu_x86_info.stepping = rt_cpu_info.x86_mask; cpu_x86_info.capability_ver = 2; cpu_x86_info.n_capability = ARRAY_SIZE(rt_cpu_info.x86_capability); cpu_x86_info.capability = (void *)rt_cpu_info.x86_capability; cpu_x86_info.has_xfeatures_mask = true; cpu_x86_info.xfeatures_mask = rt_cpu_info.xfeatures_mask; cpu_x86_info.has_xsave_size = true; cpu_x86_info.xsave_size = rt_cpu_info.xsave_size; cpu_x86_info.has_xsave_size_max = true; cpu_x86_info.xsave_size_max = rt_cpu_info.xsave_size_max; if (rt_cpu_info.x86_model_id[0]) cpu_x86_info.model_id = rt_cpu_info.x86_model_id; if (pb_write_one(img, &cpu_info, PB_CPUINFO) < 0) { close_image(img); return -1; } close_image(img); return 0; } #define __ins_bit(__l, __v) (1u << ((__v)-32u * (__l))) // clang-format off static uint32_t x86_ins_capability_mask[NCAPINTS] = { [CPUID_1_EDX] = __ins_bit(CPUID_1_EDX, X86_FEATURE_FPU) | __ins_bit(CPUID_1_EDX, X86_FEATURE_TSC) | __ins_bit(CPUID_1_EDX, X86_FEATURE_CX8) | __ins_bit(CPUID_1_EDX, X86_FEATURE_SEP) | __ins_bit(CPUID_1_EDX, X86_FEATURE_CMOV) | __ins_bit(CPUID_1_EDX, X86_FEATURE_CLFLUSH) | __ins_bit(CPUID_1_EDX, X86_FEATURE_MMX) | __ins_bit(CPUID_1_EDX, X86_FEATURE_FXSR) | __ins_bit(CPUID_1_EDX, X86_FEATURE_XMM) | __ins_bit(CPUID_1_EDX, X86_FEATURE_XMM2), [CPUID_8000_0001_EDX] = __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_SYSCALL) | __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_MMXEXT) | __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_RDTSCP) | __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_3DNOWEXT) | __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_3DNOW), [CPUID_LNX_1] = __ins_bit(CPUID_LNX_1, X86_FEATURE_REP_GOOD) | __ins_bit(CPUID_LNX_1, X86_FEATURE_NOPL), [CPUID_1_ECX] = __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM3) | __ins_bit(CPUID_1_ECX, X86_FEATURE_PCLMULQDQ) | __ins_bit(CPUID_1_ECX, X86_FEATURE_MWAIT) | __ins_bit(CPUID_1_ECX, 
X86_FEATURE_SSSE3) | __ins_bit(CPUID_1_ECX, X86_FEATURE_CX16) | __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM4_1) | __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM4_2) | __ins_bit(CPUID_1_ECX, X86_FEATURE_MOVBE) | __ins_bit(CPUID_1_ECX, X86_FEATURE_POPCNT) | __ins_bit(CPUID_1_ECX, X86_FEATURE_AES) | __ins_bit(CPUID_1_ECX, X86_FEATURE_XSAVE) | __ins_bit(CPUID_1_ECX, X86_FEATURE_OSXSAVE) | __ins_bit(CPUID_1_ECX, X86_FEATURE_AVX) | __ins_bit(CPUID_1_ECX, X86_FEATURE_F16C) | __ins_bit(CPUID_1_ECX, X86_FEATURE_RDRAND), [CPUID_8000_0001_ECX] = __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_ABM) | __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_SSE4A) | __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_MISALIGNSSE) | __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_3DNOWPREFETCH) | __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_XOP) | __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_FMA4) | __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_TBM), [CPUID_7_0_EBX] = __ins_bit(CPUID_7_0_EBX, X86_FEATURE_FSGSBASE) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_BMI1) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_HLE) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX2) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_BMI2) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_ERMS) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_RTM) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_MPX) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512F) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512DQ) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_RDSEED) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_ADX) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_CLFLUSHOPT) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512PF) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512ER) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512CD) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_SHA_NI) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512BW) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512VL), [CPUID_D_1_EAX] = __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XSAVEOPT) | __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XSAVEC) | __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XGETBV1), [CPUID_7_0_ECX] = 
__ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512VBMI) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VBMI2) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_GFNI) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_VAES) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_VPCLMULQDQ) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VNNI) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_BITALG) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_TME) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VPOPCNTDQ) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_RDPID), [CPUID_8000_0008_EBX] = __ins_bit(CPUID_8000_0008_EBX, X86_FEATURE_CLZERO), [CPUID_7_0_EDX] = __ins_bit(CPUID_7_0_EDX, X86_FEATURE_AVX512_4VNNIW) | __ins_bit(CPUID_7_0_EDX, X86_FEATURE_AVX512_4FMAPS), }; // clang-format on #undef __ins_bit static int cpu_validate_ins_features(compel_cpuinfo_t *cpu_info) { size_t i; for (i = 0; i < ARRAY_SIZE(cpu_info->x86_capability); i++) { uint32_t s = cpu_info->x86_capability[i] & x86_ins_capability_mask[i]; uint32_t d = rt_cpu_info.x86_capability[i] & x86_ins_capability_mask[i]; /* * Destination might be more feature rich * but not the reverse. */ if (s & ~d) { pr_err("CPU instruction capabilities do not match run time\n"); return -1; } } return 0; } static int cpu_validate_features(compel_cpuinfo_t *cpu_info) { if (cpu_has_unsupported_features()) return -1; if (opts.cpu_cap & CPU_CAP_FPU) { uint64_t m; /* * If we're requested to check FPU only ignore * any other bit. It's up to a user if the * rest of mismatches won't cause problems. 
*/ #define __mismatch_fpu_bit(__bit) (test_bit(__bit, (void *)cpu_info->x86_capability) && !compel_cpu_has_feature(__bit)) if (__mismatch_fpu_bit(X86_FEATURE_FPU) || __mismatch_fpu_bit(X86_FEATURE_FXSR) || __mismatch_fpu_bit(X86_FEATURE_OSXSAVE) || __mismatch_fpu_bit(X86_FEATURE_XSAVES)) { pr_err("FPU feature required by image " "is not supported on host " "(fpu:%d fxsr:%d osxsave:%d xsaves:%d)\n", __mismatch_fpu_bit(X86_FEATURE_FPU), __mismatch_fpu_bit(X86_FEATURE_FXSR), __mismatch_fpu_bit(X86_FEATURE_OSXSAVE), __mismatch_fpu_bit(X86_FEATURE_XSAVES)); return -1; } #undef __mismatch_fpu_bit /* * Make sure the xsave features are compatible. Check that on * the destination there are all the features which were on the * source. */ if ((m = cpu_info->xfeatures_mask & ~rt_cpu_info.xfeatures_mask)) { pr_err("CPU xfeatures has unsupported bits (%#" PRIx64 ")\n", m); return -1; } /* * Make sure the xsave sizes are compatible. We already hit the * issue with libc where we've checkpointed the container on * old machine but restored on more modern one and libc fetched * new xsave frame size directly by xsave instruction with * greedy feature mask causing programs to misbehave. */ if (cpu_info->xsave_size != rt_cpu_info.xsave_size) { pr_err("CPU xsave size mismatch (%u/%u)\n", cpu_info->xsave_size, rt_cpu_info.xsave_size); return -1; } if (cpu_info->xsave_size_max != rt_cpu_info.xsave_size_max) { pr_err("CPU xsave max size mismatch (%u/%u)\n", cpu_info->xsave_size_max, rt_cpu_info.xsave_size_max); return -1; } } /* * Capability on instructions level only. */ if (opts.cpu_cap & CPU_CAP_INS) { if (cpu_validate_ins_features(cpu_info)) return -1; } /* * Strict capability mode. Everything must match. 
*/ if (opts.cpu_cap & CPU_CAP_CPU) { if (memcmp(cpu_info->x86_capability, rt_cpu_info.x86_capability, sizeof(cpu_info->x86_capability))) { pr_err("CPU capabilities do not match run time\n"); return -1; } } return 0; } static const struct { const uint32_t capability_ver; const uint32_t ncapints; } ncapints[] = { { .capability_ver = 1, .ncapints = NCAPINTS_V1 }, { .capability_ver = 2, .ncapints = NCAPINTS_V2 }, }; static compel_cpuinfo_t *img_to_cpuinfo(CpuinfoX86Entry *img_x86_entry) { compel_cpuinfo_t *cpu_info; size_t size, i; BUILD_BUG_ON(sizeof(img_x86_entry->capability[0]) != sizeof(cpu_info->x86_capability[0])); BUILD_BUG_ON(ARRAY_SIZE(rt_cpu_info.x86_capability) != NCAPINTS); if (img_x86_entry->vendor_id != CPUINFO_X86_ENTRY__VENDOR__INTEL && img_x86_entry->vendor_id != CPUINFO_X86_ENTRY__VENDOR__AMD) { pr_err("Image carries unknown vendor %u\n", (unsigned)img_x86_entry->vendor_id); return NULL; } for (i = 0; i < ARRAY_SIZE(ncapints); i++) { if (img_x86_entry->capability_ver == ncapints[i].capability_ver) { if (img_x86_entry->n_capability != ncapints[i].ncapints) { pr_err("Image carries %u words while %u expected\n", (unsigned)img_x86_entry->n_capability, (unsigned)ncapints[i].ncapints); return NULL; } break; } } if (i >= ARRAY_SIZE(ncapints)) { pr_err("Image carries unknown capability version %d\n", (unsigned)img_x86_entry->capability_ver); return NULL; } cpu_info = xzalloc(sizeof(*cpu_info)); if (!cpu_info) return NULL; /* * Copy caps from image and fill the left ones from * run-time information for easier compatibility testing. 
*/ size = sizeof(img_x86_entry->capability[0]) * img_x86_entry->n_capability; memcpy(cpu_info->x86_capability, img_x86_entry->capability, size); if (img_x86_entry->capability_ver == 1) { memcpy(&cpu_info->x86_capability[NCAPINTS_V1], &rt_cpu_info.x86_capability[NCAPINTS_V1], (NCAPINTS_V2 - NCAPINTS_V1) * sizeof(rt_cpu_info.x86_capability[0])); } if (img_x86_entry->vendor_id == CPUINFO_X86_ENTRY__VENDOR__INTEL) cpu_info->x86_vendor = X86_VENDOR_INTEL; else cpu_info->x86_vendor = X86_VENDOR_AMD; cpu_info->x86_family = img_x86_entry->cpu_family; cpu_info->x86_model = img_x86_entry->model; cpu_info->x86_mask = img_x86_entry->stepping; cpu_info->extended_cpuid_level = rt_cpu_info.extended_cpuid_level; cpu_info->cpuid_level = rt_cpu_info.cpuid_level; cpu_info->x86_power = rt_cpu_info.x86_power; memcpy(cpu_info->x86_vendor_id, rt_cpu_info.x86_model_id, sizeof(cpu_info->x86_vendor_id)); strncpy(cpu_info->x86_model_id, img_x86_entry->model_id, sizeof(cpu_info->x86_model_id) - 1); /* * For old images where no xfeatures_mask present we * simply fetch runtime cpu mask because later we will * do either instruction capability check, either strict * check for capabilities. */ if (!img_x86_entry->has_xfeatures_mask) { cpu_info->xfeatures_mask = rt_cpu_info.xfeatures_mask; } else cpu_info->xfeatures_mask = img_x86_entry->xfeatures_mask; /* * Same for other fields. 
*/ if (!img_x86_entry->has_xsave_size) cpu_info->xsave_size = rt_cpu_info.xsave_size; else cpu_info->xsave_size = img_x86_entry->xsave_size; if (!img_x86_entry->has_xsave_size_max) cpu_info->xsave_size_max = rt_cpu_info.xsave_size_max; else cpu_info->xsave_size_max = img_x86_entry->xsave_size_max; return cpu_info; } int cpu_validate_cpuinfo(void) { compel_cpuinfo_t *cpu_info = NULL; CpuinfoX86Entry *img_x86_entry; CpuinfoEntry *img_cpu_info; struct cr_img *img; int ret = -1; img = open_image(CR_FD_CPUINFO, O_RSTR); if (!img) return -1; if (pb_read_one(img, &img_cpu_info, PB_CPUINFO) < 0) goto err; if (img_cpu_info->n_x86_entry != 1) { pr_err("No x86 related cpuinfo in image, " "corruption (n_x86_entry = %zi)\n", img_cpu_info->n_x86_entry); goto err; } img_x86_entry = img_cpu_info->x86_entry[0]; if (img_x86_entry->vendor_id != CPUINFO_X86_ENTRY__VENDOR__INTEL && img_x86_entry->vendor_id != CPUINFO_X86_ENTRY__VENDOR__AMD) { pr_err("Unknown cpu vendor %d\n", img_x86_entry->vendor_id); goto err; } cpu_info = img_to_cpuinfo(img_x86_entry); if (cpu_info) ret = cpu_validate_features(cpu_info); err: xfree(cpu_info); close_image(img); return ret; } int cpuinfo_dump(void) { if (cpu_init()) return -1; if (cpu_dump_cpuinfo()) return -1; return 0; } int cpuinfo_check(void) { if (cpu_init()) return 1; /* * Force to check all caps if empty passed, * still allow to check instructions only * and etc. 
*/ if (opts.cpu_cap == CPU_CAP_NONE) opts.cpu_cap = CPU_CAP_ALL; if (cpu_validate_cpuinfo()) return 1; return 0; } crac-criu-1.5.0/criu/arch/x86/crtools.c000066400000000000000000000514351471504326700175430ustar00rootroot00000000000000#include "compel/asm/fpu.h" #include "compel/infect.h" #include "compel/plugins/std/syscall-codes.h" #include "cpu.h" #include "cr_options.h" #include "images/core.pb-c.h" #include "log.h" #include "protobuf.h" #include "types.h" #include "asm/compat.h" #undef LOG_PREFIX #define LOG_PREFIX "x86: " #define XSAVE_PB_NELEMS(__s, __obj, __member) (sizeof(__s) / sizeof(*(__obj)->__member)) int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; UserX86RegsEntry *gpregs = core->thread_info->gpregs; #define assign_reg(dst, src, e) \ do { \ dst->e = (__typeof__(dst->e))src.e; \ } while (0) #define assign_array(dst, src, e) memcpy(dst->e, &src.e, sizeof(src.e)) #define assign_xsave(feature, xsave, member, area) \ do { \ if (compel_fpu_has_feature(feature)) { \ uint32_t off = compel_fpu_feature_offset(feature); \ void *from = &area[off]; \ size_t size = pb_repeated_size(xsave, member); \ size_t xsize = (size_t)compel_fpu_feature_size(feature); \ if (xsize != size) { \ pr_err("%s reported %zu bytes (expecting %zu)\n", #feature, xsize, size); \ return -1; \ } \ memcpy(xsave->member, from, size); \ } \ } while (0) if (user_regs_native(regs)) { assign_reg(gpregs, regs->native, r15); assign_reg(gpregs, regs->native, r14); assign_reg(gpregs, regs->native, r13); assign_reg(gpregs, regs->native, r12); assign_reg(gpregs, regs->native, bp); assign_reg(gpregs, regs->native, bx); assign_reg(gpregs, regs->native, r11); assign_reg(gpregs, regs->native, r10); assign_reg(gpregs, regs->native, r9); assign_reg(gpregs, regs->native, r8); assign_reg(gpregs, regs->native, ax); assign_reg(gpregs, regs->native, cx); assign_reg(gpregs, regs->native, dx); assign_reg(gpregs, regs->native, si); assign_reg(gpregs, 
regs->native, di); assign_reg(gpregs, regs->native, orig_ax); assign_reg(gpregs, regs->native, ip); assign_reg(gpregs, regs->native, cs); assign_reg(gpregs, regs->native, flags); assign_reg(gpregs, regs->native, sp); assign_reg(gpregs, regs->native, ss); assign_reg(gpregs, regs->native, fs_base); assign_reg(gpregs, regs->native, gs_base); assign_reg(gpregs, regs->native, ds); assign_reg(gpregs, regs->native, es); assign_reg(gpregs, regs->native, fs); assign_reg(gpregs, regs->native, gs); gpregs->mode = USER_X86_REGS_MODE__NATIVE; } else { assign_reg(gpregs, regs->compat, bx); assign_reg(gpregs, regs->compat, cx); assign_reg(gpregs, regs->compat, dx); assign_reg(gpregs, regs->compat, si); assign_reg(gpregs, regs->compat, di); assign_reg(gpregs, regs->compat, bp); assign_reg(gpregs, regs->compat, ax); assign_reg(gpregs, regs->compat, ds); assign_reg(gpregs, regs->compat, es); assign_reg(gpregs, regs->compat, fs); assign_reg(gpregs, regs->compat, gs); assign_reg(gpregs, regs->compat, orig_ax); assign_reg(gpregs, regs->compat, ip); assign_reg(gpregs, regs->compat, cs); assign_reg(gpregs, regs->compat, flags); assign_reg(gpregs, regs->compat, sp); assign_reg(gpregs, regs->compat, ss); gpregs->mode = USER_X86_REGS_MODE__COMPAT; } gpregs->has_mode = true; if (!fpregs) return 0; assign_reg(core->thread_info->fpregs, fpregs->i387, cwd); assign_reg(core->thread_info->fpregs, fpregs->i387, swd); assign_reg(core->thread_info->fpregs, fpregs->i387, twd); assign_reg(core->thread_info->fpregs, fpregs->i387, fop); assign_reg(core->thread_info->fpregs, fpregs->i387, rip); assign_reg(core->thread_info->fpregs, fpregs->i387, rdp); assign_reg(core->thread_info->fpregs, fpregs->i387, mxcsr); assign_reg(core->thread_info->fpregs, fpregs->i387, mxcsr_mask); /* Make sure we have enough space */ BUG_ON(core->thread_info->fpregs->n_st_space != ARRAY_SIZE(fpregs->i387.st_space)); BUG_ON(core->thread_info->fpregs->n_xmm_space != ARRAY_SIZE(fpregs->i387.xmm_space)); 
assign_array(core->thread_info->fpregs, fpregs->i387, st_space); assign_array(core->thread_info->fpregs, fpregs->i387, xmm_space); if (compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) { UserX86XsaveEntry *xsave = core->thread_info->fpregs->xsave; uint8_t *extended_state_area = (void *)fpregs; /* * xcomp_bv is designated for compacted format but user * space never use it, thus we can simply ignore. */ assign_reg(xsave, fpregs->xsave_hdr, xstate_bv); assign_xsave(XFEATURE_YMM, xsave, ymmh_space, extended_state_area); assign_xsave(XFEATURE_BNDREGS, xsave, bndreg_state, extended_state_area); assign_xsave(XFEATURE_BNDCSR, xsave, bndcsr_state, extended_state_area); assign_xsave(XFEATURE_OPMASK, xsave, opmask_reg, extended_state_area); assign_xsave(XFEATURE_ZMM_Hi256, xsave, zmm_upper, extended_state_area); assign_xsave(XFEATURE_Hi16_ZMM, xsave, hi16_zmm, extended_state_area); assign_xsave(XFEATURE_PKRU, xsave, pkru, extended_state_area); } #undef assign_reg #undef assign_array #undef assign_xsave return 0; } static void alloc_tls(ThreadInfoX86 *ti, void **mempool) { int i; ti->tls = xptr_pull_s(mempool, GDT_ENTRY_TLS_NUM * sizeof(UserDescT *)); ti->n_tls = GDT_ENTRY_TLS_NUM; for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { ti->tls[i] = xptr_pull(mempool, UserDescT); user_desc_t__init(ti->tls[i]); } } static int alloc_xsave_extends(UserX86XsaveEntry *xsave) { if (compel_fpu_has_feature(XFEATURE_YMM)) { xsave->n_ymmh_space = XSAVE_PB_NELEMS(struct ymmh_struct, xsave, ymmh_space); xsave->ymmh_space = xzalloc(pb_repeated_size(xsave, ymmh_space)); if (!xsave->ymmh_space) goto err; } if (compel_fpu_has_feature(XFEATURE_BNDREGS)) { xsave->n_bndreg_state = XSAVE_PB_NELEMS(struct mpx_bndreg_state, xsave, bndreg_state); xsave->bndreg_state = xzalloc(pb_repeated_size(xsave, bndreg_state)); if (!xsave->bndreg_state) goto err; } if (compel_fpu_has_feature(XFEATURE_BNDCSR)) { xsave->n_bndcsr_state = XSAVE_PB_NELEMS(struct mpx_bndcsr_state, xsave, bndcsr_state); xsave->bndcsr_state = 
xzalloc(pb_repeated_size(xsave, bndcsr_state)); if (!xsave->bndcsr_state) goto err; } if (compel_fpu_has_feature(XFEATURE_OPMASK)) { xsave->n_opmask_reg = XSAVE_PB_NELEMS(struct avx_512_opmask_state, xsave, opmask_reg); xsave->opmask_reg = xzalloc(pb_repeated_size(xsave, opmask_reg)); if (!xsave->opmask_reg) goto err; } if (compel_fpu_has_feature(XFEATURE_ZMM_Hi256)) { xsave->n_zmm_upper = XSAVE_PB_NELEMS(struct avx_512_zmm_uppers_state, xsave, zmm_upper); xsave->zmm_upper = xzalloc(pb_repeated_size(xsave, zmm_upper)); if (!xsave->zmm_upper) goto err; } if (compel_fpu_has_feature(XFEATURE_Hi16_ZMM)) { xsave->n_hi16_zmm = XSAVE_PB_NELEMS(struct avx_512_hi16_state, xsave, hi16_zmm); xsave->hi16_zmm = xzalloc(pb_repeated_size(xsave, hi16_zmm)); if (!xsave->hi16_zmm) goto err; } if (compel_fpu_has_feature(XFEATURE_PKRU)) { xsave->n_pkru = XSAVE_PB_NELEMS(struct pkru_state, xsave, pkru); xsave->pkru = xzalloc(pb_repeated_size(xsave, pkru)); if (!xsave->pkru) goto err; } return 0; err: return -1; } int arch_alloc_thread_info(CoreEntry *core) { size_t sz; bool with_fpu, with_xsave = false; void *m; ThreadInfoX86 *ti = NULL; with_fpu = compel_cpu_has_feature(X86_FEATURE_FPU); sz = sizeof(ThreadInfoX86) + sizeof(UserX86RegsEntry) + GDT_ENTRY_TLS_NUM * sizeof(UserDescT) + GDT_ENTRY_TLS_NUM * sizeof(UserDescT *); if (with_fpu) { sz += sizeof(UserX86FpregsEntry); with_xsave = compel_cpu_has_feature(X86_FEATURE_OSXSAVE); if (with_xsave) sz += sizeof(UserX86XsaveEntry); } m = xmalloc(sz); if (!m) return -1; ti = core->thread_info = xptr_pull(&m, ThreadInfoX86); thread_info_x86__init(ti); ti->gpregs = xptr_pull(&m, UserX86RegsEntry); user_x86_regs_entry__init(ti->gpregs); alloc_tls(ti, &m); if (with_fpu) { UserX86FpregsEntry *fpregs; fpregs = ti->fpregs = xptr_pull(&m, UserX86FpregsEntry); user_x86_fpregs_entry__init(fpregs); /* These are numbers from kernel */ fpregs->n_st_space = 32; fpregs->n_xmm_space = 64; fpregs->st_space = xzalloc(pb_repeated_size(fpregs, st_space)); 
fpregs->xmm_space = xzalloc(pb_repeated_size(fpregs, xmm_space)); if (!fpregs->st_space || !fpregs->xmm_space) goto err; if (with_xsave) { UserX86XsaveEntry *xsave; xsave = fpregs->xsave = xptr_pull(&m, UserX86XsaveEntry); user_x86_xsave_entry__init(xsave); if (alloc_xsave_extends(xsave)) goto err; } } return 0; err: return -1; } void arch_free_thread_info(CoreEntry *core) { if (!core->thread_info) return; if (core->thread_info->fpregs->xsave) { xfree(core->thread_info->fpregs->xsave->ymmh_space); xfree(core->thread_info->fpregs->xsave->pkru); xfree(core->thread_info->fpregs->xsave->hi16_zmm); xfree(core->thread_info->fpregs->xsave->zmm_upper); xfree(core->thread_info->fpregs->xsave->opmask_reg); xfree(core->thread_info->fpregs->xsave->bndcsr_state); xfree(core->thread_info->fpregs->xsave->bndreg_state); } xfree(core->thread_info->fpregs->st_space); xfree(core->thread_info->fpregs->xmm_space); xfree(core->thread_info); } static bool valid_xsave_frame(CoreEntry *core) { UserX86XsaveEntry *xsave = core->thread_info->fpregs->xsave; struct xsave_struct *x = NULL; if (core->thread_info->fpregs->n_st_space < ARRAY_SIZE(x->i387.st_space)) { pr_err("Corruption in FPU st_space area " "(got %li but %li expected)\n", (long)core->thread_info->fpregs->n_st_space, (long)ARRAY_SIZE(x->i387.st_space)); return false; } if (core->thread_info->fpregs->n_xmm_space < ARRAY_SIZE(x->i387.xmm_space)) { pr_err("Corruption in FPU xmm_space area " "(got %li but %li expected)\n", (long)core->thread_info->fpregs->n_st_space, (long)ARRAY_SIZE(x->i387.xmm_space)); return false; } if (compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) { if (xsave) { size_t i; struct { const char *name; size_t expected; size_t obtained; void *ptr; } features[] = { { .name = __stringify_1(XFEATURE_YMM), .expected = XSAVE_PB_NELEMS(struct ymmh_struct, xsave, ymmh_space), .obtained = xsave->n_ymmh_space, .ptr = xsave->ymmh_space, }, { .name = __stringify_1(XFEATURE_BNDREGS), .expected = XSAVE_PB_NELEMS(struct 
mpx_bndreg_state, xsave, bndreg_state), .obtained = xsave->n_bndreg_state, .ptr = xsave->bndreg_state, }, { .name = __stringify_1(XFEATURE_BNDCSR), .expected = XSAVE_PB_NELEMS(struct mpx_bndcsr_state, xsave, bndcsr_state), .obtained = xsave->n_bndcsr_state, .ptr = xsave->bndcsr_state, }, { .name = __stringify_1(XFEATURE_OPMASK), .expected = XSAVE_PB_NELEMS(struct avx_512_opmask_state, xsave, opmask_reg), .obtained = xsave->n_opmask_reg, .ptr = xsave->opmask_reg, }, { .name = __stringify_1(XFEATURE_ZMM_Hi256), .expected = XSAVE_PB_NELEMS(struct avx_512_zmm_uppers_state, xsave, zmm_upper), .obtained = xsave->n_zmm_upper, .ptr = xsave->zmm_upper, }, { .name = __stringify_1(XFEATURE_Hi16_ZMM), .expected = XSAVE_PB_NELEMS(struct avx_512_hi16_state, xsave, hi16_zmm), .obtained = xsave->n_hi16_zmm, .ptr = xsave->hi16_zmm, }, { .name = __stringify_1(XFEATURE_PKRU), .expected = XSAVE_PB_NELEMS(struct pkru_state, xsave, pkru), .obtained = xsave->n_pkru, .ptr = xsave->pkru, }, }; for (i = 0; i < ARRAY_SIZE(features); i++) { if (!features[i].ptr) continue; if (features[i].expected > features[i].obtained) { pr_err("Corruption in %s area (expected %zu but %zu obtained)\n", features[i].name, features[i].expected, features[i].obtained); return false; } } } } else { /* * If the image has xsave area present then CPU we're restoring * on must have X86_FEATURE_OSXSAVE feature until explicitly * stated in options. 
*/ if (xsave) { if (opts.cpu_cap & CPU_CAP_FPU) { pr_err("FPU xsave area present, " "but host cpu doesn't support it\n"); return false; } else pr_warn_once("FPU is about to restore ignoring xsave state!\n"); } } return true; } static void show_rt_xsave_frame(struct xsave_struct *x) { struct fpx_sw_bytes *fpx = (void *)&x->i387.sw_reserved; struct xsave_hdr_struct *xsave_hdr = &x->xsave_hdr; struct i387_fxsave_struct *i387 = &x->i387; pr_debug("xsave runtime structure\n"); pr_debug("-----------------------\n"); pr_debug("cwd:%#x swd:%#x twd:%#x fop:%#x mxcsr:%#x mxcsr_mask:%#x\n", (int)i387->cwd, (int)i387->swd, (int)i387->twd, (int)i387->fop, (int)i387->mxcsr, (int)i387->mxcsr_mask); pr_debug("magic1:%#x extended_size:%u xstate_bv:%#lx xstate_size:%u\n", fpx->magic1, fpx->extended_size, (long)fpx->xstate_bv, fpx->xstate_size); pr_debug("xstate_bv: %#lx\n", (long)xsave_hdr->xstate_bv); pr_debug("-----------------------\n"); } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { fpu_state_t *fpu_state = core_is_compat(core) ? &sigframe->compat.fpu_state : &sigframe->native.fpu_state; struct xsave_struct *x = core_is_compat(core) ? (void *)&fpu_state->fpu_state_ia32.xsave : (void *)&fpu_state->fpu_state_64.xsave; /* * If no FPU information provided -- we're restoring * old image which has no FPU support, or the dump simply * has no FPU support at all. 
*/ if (!core->thread_info->fpregs) { fpu_state->has_fpu = false; return 0; } if (!valid_xsave_frame(core)) return -1; fpu_state->has_fpu = true; #define assign_reg(dst, src, e) \ do { \ dst.e = (__typeof__(dst.e))src->e; \ } while (0) #define assign_array(dst, src, e) memcpy(dst.e, (src)->e, sizeof(dst.e)) #define assign_xsave(feature, xsave, member, area) \ do { \ if (compel_fpu_has_feature(feature) && (xsave->xstate_bv & (1UL << feature))) { \ uint32_t off = compel_fpu_feature_offset(feature); \ void *to = &area[off]; \ void *from = xsave->member; \ size_t size = pb_repeated_size(xsave, member); \ size_t xsize = (size_t)compel_fpu_feature_size(feature); \ size_t xstate_size_next = off + xsize; \ if (xsize != size) { \ if (size) { \ pr_err("%s reported %zu bytes (expecting %zu)\n", #feature, xsize, size); \ return -1; \ } else { \ pr_debug("%s is not present in image, ignore\n", #feature); \ } \ } \ xstate_bv |= (1UL << feature); \ BUG_ON(xstate_size > xstate_size_next); \ xstate_size = xstate_size_next; \ memcpy(to, from, size); \ } \ } while (0) assign_reg(x->i387, core->thread_info->fpregs, cwd); assign_reg(x->i387, core->thread_info->fpregs, swd); assign_reg(x->i387, core->thread_info->fpregs, twd); assign_reg(x->i387, core->thread_info->fpregs, fop); assign_reg(x->i387, core->thread_info->fpregs, rip); assign_reg(x->i387, core->thread_info->fpregs, rdp); assign_reg(x->i387, core->thread_info->fpregs, mxcsr); assign_reg(x->i387, core->thread_info->fpregs, mxcsr_mask); assign_array(x->i387, core->thread_info->fpregs, st_space); assign_array(x->i387, core->thread_info->fpregs, xmm_space); if (core_is_compat(core)) compel_convert_from_fxsr(&fpu_state->fpu_state_ia32.fregs_state.i387_ia32, &fpu_state->fpu_state_ia32.xsave.i387); if (compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) { struct fpx_sw_bytes *fpx_sw = (void *)&x->i387.sw_reserved; size_t xstate_size = XSAVE_YMM_OFFSET; uint32_t xstate_bv = 0; void *magic2; xstate_bv = XFEATURE_MASK_FP | XFEATURE_MASK_SSE; 
/* * fpregs->xsave pointer might not present on image so we * simply clear out everything. */ if (core->thread_info->fpregs->xsave) { UserX86XsaveEntry *xsave = core->thread_info->fpregs->xsave; uint8_t *extended_state_area = (void *)x; /* * Note the order does matter here and bound * to the increasing offsets of XFEATURE_x * inside memory layout (xstate_size calculation). */ assign_xsave(XFEATURE_YMM, xsave, ymmh_space, extended_state_area); assign_xsave(XFEATURE_BNDREGS, xsave, bndreg_state, extended_state_area); assign_xsave(XFEATURE_BNDCSR, xsave, bndcsr_state, extended_state_area); assign_xsave(XFEATURE_OPMASK, xsave, opmask_reg, extended_state_area); assign_xsave(XFEATURE_ZMM_Hi256, xsave, zmm_upper, extended_state_area); assign_xsave(XFEATURE_Hi16_ZMM, xsave, hi16_zmm, extended_state_area); assign_xsave(XFEATURE_PKRU, xsave, pkru, extended_state_area); } x->xsave_hdr.xstate_bv = xstate_bv; fpx_sw->magic1 = FP_XSTATE_MAGIC1; fpx_sw->xstate_bv = xstate_bv; fpx_sw->xstate_size = xstate_size; fpx_sw->extended_size = xstate_size + FP_XSTATE_MAGIC2_SIZE; /* * This should be at the end of xsave frame. 
*/ magic2 = (void *)x + xstate_size; *(u32 *)magic2 = FP_XSTATE_MAGIC2; } show_rt_xsave_frame(x); #undef assign_reg #undef assign_array #undef assign_xsave return 0; } #define CPREG32(d) f->compat.uc.uc_mcontext.d = r->d static void restore_compat_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r) { CPREG32(gs); CPREG32(fs); CPREG32(es); CPREG32(ds); CPREG32(di); CPREG32(si); CPREG32(bp); CPREG32(sp); CPREG32(bx); CPREG32(dx); CPREG32(cx); CPREG32(ip); CPREG32(ax); CPREG32(cs); CPREG32(ss); CPREG32(flags); f->is_native = false; } #undef CPREG32 #define CPREG64(d, s) f->native.uc.uc_mcontext.d = r->s static void restore_native_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r) { CPREG64(rdi, di); CPREG64(rsi, si); CPREG64(rbp, bp); CPREG64(rsp, sp); CPREG64(rbx, bx); CPREG64(rdx, dx); CPREG64(rcx, cx); CPREG64(rip, ip); CPREG64(rax, ax); CPREG64(r8, r8); CPREG64(r9, r9); CPREG64(r10, r10); CPREG64(r11, r11); CPREG64(r12, r12); CPREG64(r13, r13); CPREG64(r14, r14); CPREG64(r15, r15); CPREG64(cs, cs); CPREG64(eflags, flags); f->is_native = true; } #undef CPREG64 int restore_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r) { switch (r->mode) { case USER_X86_REGS_MODE__NATIVE: restore_native_gpregs(f, r); break; case USER_X86_REGS_MODE__COMPAT: restore_compat_gpregs(f, r); break; default: pr_err("Can't prepare rt_sigframe: registers mode corrupted (%d)\n", r->mode); return -1; } return 0; } static int get_robust_list32(pid_t pid, uintptr_t head, uintptr_t len) { struct syscall_args32 s = { .nr = __NR32_get_robust_list, .arg0 = pid, .arg1 = (uint32_t)head, .arg2 = (uint32_t)len, }; return do_full_int80(&s); } static int set_robust_list32(uint32_t head, uint32_t len) { struct syscall_args32 s = { .nr = __NR32_set_robust_list, .arg0 = head, .arg1 = len, }; return do_full_int80(&s); } int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info) { void *mmap32; int ret = -1; mmap32 = alloc_compat_syscall_stack(); if (!mmap32) return -1; ret = 
get_robust_list32(pid, (uintptr_t)mmap32, (uintptr_t)mmap32 + 4); if (ret == -ENOSYS) { /* Check native get_task_futex_robust_list() for details. */ if (set_robust_list32(0, 0) == (uint32_t)-ENOSYS) { info->futex_rla = 0; info->futex_rla_len = 0; ret = 0; } } else if (ret == 0) { uint32_t *arg1 = (uint32_t *)mmap32; info->futex_rla = *arg1; info->futex_rla_len = *(arg1 + 1); ret = 0; } free_compat_syscall_stack(mmap32); return ret; } crac-criu-1.5.0/criu/arch/x86/include/000077500000000000000000000000001471504326700173255ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/x86/include/asm/000077500000000000000000000000001471504326700201055ustar00rootroot00000000000000crac-criu-1.5.0/criu/arch/x86/include/asm/compat.h000066400000000000000000000050451471504326700215450ustar00rootroot00000000000000#ifndef __CR_ASM_COMPAT_H__ #define __CR_ASM_COMPAT_H__ #ifdef CR_NOGLIBC #include #include #else #define sys_mmap mmap #define sys_munmap munmap #endif #include static inline void *alloc_compat_syscall_stack(void) { void *mem = (void *)sys_mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if ((uintptr_t)mem % PAGE_SIZE) { int err = (~(uint32_t)(uintptr_t)mem) + 1; pr_err("mmap() of compat syscall stack failed with %d\n", err); return 0; } return mem; } static inline void free_compat_syscall_stack(void *mem) { long int ret = sys_munmap(mem, PAGE_SIZE); if (ret) pr_err("munmap() of compat addr %p failed with %ld\n", mem, ret); } struct syscall_args32 { uint32_t nr, arg0, arg1, arg2, arg3, arg4, arg5; }; static inline uint32_t do_full_int80(struct syscall_args32 *args) { /* * Kernel older than v4.4 do not preserve r8-r15 registers when * invoking int80, so we need to preserve them. * * Additionally, %rbp is used as the 6th syscall argument, and we need * to preserve its value when returning from the syscall to avoid * upsetting GCC. However, we can't use %rbp in the GCC asm clobbers * due to a GCC limitation. 
Instead, we explicitly save %rbp on the * stack before invoking the syscall and restore its value afterward. * * Further, GCC may not adjust the %rsp pointer when allocating the * args and ret variables because 1) do_full_int80() is a leaf * function, and 2) the local variables (args and ret) are in the * 128-byte red-zone as defined in the x86_64 ABI. To use the stack * when preserving %rbp, we must either tell GCC to a) mark the * function as non-leaf, or b) move away from the red-zone when using * the stack. It seems that there is no easy way to do a), so we'll go * with b). * Note 1: Another workaround would have been to add %rsp in the list * of clobbers, but this was deprecated in GCC 9. * Note 2: This red-zone bug only manifests when compiling CRIU with * DEBUG=1. */ uint32_t ret; asm volatile("sub $128, %%rsp\n\t" "pushq %%rbp\n\t" "mov %7, %%ebp\n\t" "int $0x80\n\t" "popq %%rbp\n\t" "add $128, %%rsp\n\t" : "=a"(ret) : "a"(args->nr), "b"(args->arg0), "c"(args->arg1), "d"(args->arg2), "S"(args->arg3), "D"(args->arg4), "g"(args->arg5) : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"); return ret; } #ifndef CR_NOGLIBC #undef sys_mmap #undef sys_munmap #endif #endif crac-criu-1.5.0/criu/arch/x86/include/asm/dump.h000066400000000000000000000015771471504326700212350ustar00rootroot00000000000000#ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); static inline void core_put_tls(CoreEntry *core, tls_t tls) { ThreadInfoX86 *ti = core->thread_info; int i; for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { user_desc_t *from = &tls.desc[i]; UserDescT *to = ti->tls[i]; #define COPY_TLS(field) to->field = from->field COPY_TLS(entry_number); COPY_TLS(base_addr); COPY_TLS(limit); COPY_TLS(seg_32bit); to->contents_h = 
from->contents & 0x2; to->contents_l = from->contents & 0x1; COPY_TLS(read_exec_only); COPY_TLS(limit_in_pages); COPY_TLS(seg_not_present); COPY_TLS(usable); #undef COPY_TLS } } #endif crac-criu-1.5.0/criu/arch/x86/include/asm/int.h000066400000000000000000000001571471504326700210530ustar00rootroot00000000000000#ifndef __CR_ASM_INT_H__ #define __CR_ASM_INT_H__ #include "asm-generic/int.h" #endif /* __CR_ASM_INT_H__ */ crac-criu-1.5.0/criu/arch/x86/include/asm/kerndat.h000066400000000000000000000003331471504326700217050ustar00rootroot00000000000000#ifndef __CR_ASM_KERNDAT_H__ #define __CR_ASM_KERNDAT_H__ extern int kdat_compatible_cr(void); extern int kdat_can_map_vdso(void); extern int kdat_x86_has_ptrace_fpu_xsave_bug(void); #endif /* __CR_ASM_KERNDAT_H__ */ crac-criu-1.5.0/criu/arch/x86/include/asm/parasite-syscall.h000066400000000000000000000002021471504326700235300ustar00rootroot00000000000000#ifndef __CR_ASM_PARASITE_SYSCALL_H__ #define __CR_ASM_PARASITE_SYSCALL_H__ #include "asm/types.h" struct parasite_ctl; #endif crac-criu-1.5.0/criu/arch/x86/include/asm/parasite.h000066400000000000000000000003371471504326700220710ustar00rootroot00000000000000#ifndef __ASM_PARASITE_H__ #define __ASM_PARASITE_H__ /* * TLS is accessed through PTRACE_GET_THREAD_AREA, * see compel_arch_fetch_thread_area(). 
*/ static inline void arch_get_tls(tls_t *ptls) { (void)ptls; } #endif crac-criu-1.5.0/criu/arch/x86/include/asm/restore.h000066400000000000000000000026451471504326700217500ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORE_H__ #define __CR_ASM_RESTORE_H__ #include "asm/restorer.h" #include "images/core.pb-c.h" /* clang-format off */ #define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ task_args) \ asm volatile( \ "movq %0, %%rbx \n" \ "movq %1, %%rax \n" \ "movq %2, %%rdi \n" \ "movq %%rbx, %%rsp \n" \ "callq *%%rax \n" \ : \ : "g"(new_sp), \ "g"(restore_task_exec_start), \ "g"(task_args) \ : "rdi", "rsi", "rbx", "rax", "memory") /* clang-format on */ static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) { ThreadInfoX86 *ti = pcore->thread_info; size_t i; for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { user_desc_t *to = &ptls->desc[i]; UserDescT *from; /* * If proto image has lesser TLS entries, * mark them as not present (and thus skip restore). */ if (i >= ti->n_tls) { to->seg_not_present = 1; continue; } from = ti->tls[i]; #define COPY_TLS(field) to->field = from->field COPY_TLS(entry_number); COPY_TLS(base_addr); COPY_TLS(limit); COPY_TLS(seg_32bit); to->contents = ((u32)from->contents_h << 1) | from->contents_l; COPY_TLS(read_exec_only); COPY_TLS(limit_in_pages); COPY_TLS(seg_not_present); COPY_TLS(usable); #undef COPY_TLS } } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); #endif crac-criu-1.5.0/criu/arch/x86/include/asm/restorer.h000066400000000000000000000153641471504326700221340ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ #include "asm/types.h" #include #include #include "images/core.pb-c.h" #include #include #include "asm/compat.h" #ifdef CONFIG_COMPAT extern void restore_tls(tls_t *ptls); extern int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act); extern int set_compat_robust_list(uint32_t head_ptr, uint32_t len); #else /* CONFIG_COMPAT */ static 
inline void restore_tls(tls_t *ptls) { } static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; } static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) { return -1; } #endif /* !CONFIG_COMPAT */ /* * Documentation copied from glibc sysdeps/unix/sysv/linux/x86_64/clone.S * The kernel expects: * rax: system call number * rdi: flags * rsi: child_stack * rdx: TID field in parent * r10: TID field in child * r8: thread pointer * * int clone(unsigned long clone_flags, unsigned long newsp, * int *parent_tidptr, int *child_tidptr, * unsigned long tls); */ /* clang-format off */ #define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ thread_args, clone_restore_fn) \ asm volatile( \ "clone_emul: \n" \ "movq %2, %%rsi \n" \ "subq $16, %%rsi \n" \ "movq %6, %%rdi \n" \ "movq %%rdi, 8(%%rsi) \n" \ "movq %5, %%rdi \n" \ "movq %%rdi, 0(%%rsi) \n" \ "movq %1, %%rdi \n" \ "movq %3, %%rdx \n" \ "movq %4, %%r10 \n" \ "movl $"__stringify(__NR_clone)", %%eax \n" \ "syscall \n" \ \ "testq %%rax,%%rax \n" \ "jz thread_run \n" \ \ "movq %%rax, %0 \n" \ "jmp clone_end \n" \ \ "thread_run: \n" \ "movl $"__stringify(__NR_gettid)", %%eax \n" \ "syscall \n" \ "cmpq %%rax, %7 \n" \ "je cont \n" \ "mov $2, %%rdi \n" \ "movl $"__stringify(__NR_exit)", %%eax \n" \ "syscall \n" \ \ "cont: \n" \ "xorq %%rbp, %%rbp \n" \ "movq 0(%%rsp), %%rax \n" \ "movq 8(%%rsp), %%rdi \n" \ "callq *%%rax \n" \ \ "clone_end: \n" \ : "=r"(ret) \ : "g"(clone_flags), \ "g"(new_sp), \ "g"(&parent_tid), \ "g"(&thread_args[i].pid), \ "g"(clone_restore_fn), \ "g"(&thread_args[i]), \ "g"(thread_args[i].pid) \ : "rax", "rcx", "rdi", "rsi", "rdx", "r10", "r11", "memory") /* int clone3(struct clone_args *args, size_t size) */ #define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ clone_restore_fn) \ asm volatile( \ "clone3_emul: \n" \ /* * Prepare stack pointer for child process. 
The kernel does * stack + stack_size before passing the stack pointer to the * child process. As we have to put the function and the * arguments for the new process on that stack we have handle * the kernel's implicit stack + stack_size. */ \ "movq (%3), %%rsi /* new stack pointer */ \n" \ /* Move the stack_size to %rax to use later as the offset */ \ "movq %4, %%rax \n" \ /* 16 bytes are needed on the stack for function and args */ \ "subq $16, (%%rsi, %%rax) \n" \ "movq %6, %%rdi /* thread args */ \n" \ "movq %%rdi, 8(%%rsi, %%rax) \n" \ "movq %5, %%rdi /* thread function */ \n" \ "movq %%rdi, 0(%%rsi, %%rax) \n" \ /* * The stack address has been modified for the two * elements above (child function, child arguments). * This modified stack needs to be stored back into the * clone_args structure. */ \ "movq (%%rsi), %3 \n" \ /* * Do the actual clone3() syscall. First argument (%rdi) is * the clone_args structure, second argument is the size * of clone_args. */ \ "movq %1, %%rdi /* clone_args */ \n" \ "movq %2, %%rsi /* size */ \n" \ "movl $"__stringify(__NR_clone3)", %%eax \n" \ "syscall \n" \ /* * If clone3() was successful and if we are in the child * '0' is returned. Jump to the child function handler. */ \ "testq %%rax,%%rax \n" \ "jz thread3_run \n" \ /* Return the PID to the parent process. */ \ "movq %%rax, %0 \n" \ "jmp clone3_end \n" \ \ "thread3_run: /* Child process */ \n" \ /* Clear the frame pointer */ \ "xorq %%rbp, %%rbp \n" \ /* Pop the child function from the stack */ \ "popq %%rax \n" \ /* Pop the child function arguments from the stack */ \ "popq %%rdi \n" \ /* Run the child function */ \ "callq *%%rax \n" \ /* * If the child function is expected to return, this * would be the place to handle the return code. In CRIU's * case the child function is expected to not return * and do exit() itself. */ \ \ "clone3_end: \n" \ : "=r"(ret) \ /* * This uses the "r" modifier for all parameters * as clang complained if using "g". 
*/ \ : "r"(&clone_args), \ "r"(size), \ "r"(&clone_args.stack), \ "r"(clone_args.stack_size), \ "r"(clone_restore_fn), \ "r"(args) \ : "rax", "rcx", "rdi", "rsi", "rdx", "r10", "r11", "memory") #define ARCH_FAIL_CORE_RESTORE \ asm volatile( \ "movq %0, %%rsp \n" \ "movq 0, %%rax \n" \ "jmp *%%rax \n" \ : \ : "r"(ret) \ : "memory") /* clang-format on */ static inline void __setup_sas_compat(struct ucontext_ia32 *uc, ThreadSasEntry *sas) { uc->uc_stack.ss_sp = (compat_uptr_t)(sas)->ss_sp; uc->uc_stack.ss_flags = (int)(sas)->ss_flags; uc->uc_stack.ss_size = (compat_size_t)(sas)->ss_size; } static inline void __setup_sas(struct rt_sigframe *sigframe, ThreadSasEntry *sas) { if (sigframe->is_native) { struct rt_ucontext *uc = &sigframe->native.uc; uc->uc_stack.ss_sp = (void *)decode_pointer((sas)->ss_sp); uc->uc_stack.ss_flags = (int)(sas)->ss_flags; uc->uc_stack.ss_size = (size_t)(sas)->ss_size; } else { __setup_sas_compat(&sigframe->compat.uc, sas); } } static inline void _setup_sas(struct rt_sigframe *sigframe, ThreadSasEntry *sas) { if (sas) __setup_sas(sigframe, sas); } #define setup_sas _setup_sas int restore_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r); int restore_nonsigframe_gpregs(UserX86RegsEntry *r); int ptrace_set_breakpoint(pid_t pid, void *addr); int ptrace_flush_breakpoints(pid_t pid); extern int arch_map_vdso(unsigned long map_at, bool compatible); #endif crac-criu-1.5.0/criu/arch/x86/include/asm/syscall32.h000066400000000000000000000020551471504326700220770ustar00rootroot00000000000000#ifndef __CR_SYSCALL32_H__ #define __CR_SYSCALL32_H__ extern long sys_socket(int domain, int type, int protocol); extern long sys_connect(int sockfd, struct sockaddr *addr, int addrlen); extern long sys_sendto(int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len); extern long sys_recvfrom(int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len); extern long sys_sendmsg(int sockfd, const 
struct msghdr *msg, int flags); extern long sys_recvmsg(int sockfd, struct msghdr *msg, int flags); extern long sys_shutdown(int sockfd, int how); extern long sys_bind(int sockfd, const struct sockaddr *addr, int addrlen); extern long sys_setsockopt(int sockfd, int level, int optname, const void *optval, unsigned int optlen); extern long sys_getsockopt(int sockfd, int level, int optname, const void *optval, unsigned int *optlen); extern long sys_shmat(int shmid, void *shmaddr, int shmflag); extern long sys_pread(unsigned int fd, char *ubuf, u32 count, u64 pos); #endif /* __CR_SYSCALL32_H__ */ crac-criu-1.5.0/criu/arch/x86/include/asm/thread_pointer.h000066400000000000000000000023101471504326700232610ustar00rootroot00000000000000/* __thread_pointer definition. x86 version. Copyright (C) 2021 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library. If not, see . 
*/ #ifndef _SYS_THREAD_POINTER_H #define _SYS_THREAD_POINTER_H static inline void *__criu_thread_pointer(void) { #if __GNUC_PREREQ(11, 1) return __builtin_thread_pointer(); #else void *__result; #ifdef __x86_64__ __asm__("mov %%fs:0, %0" : "=r"(__result)); #else __asm__("mov %%gs:0, %0" : "=r"(__result)); #endif return __result; #endif /* !GCC 11 */ } #endif /* _SYS_THREAD_POINTER_H */crac-criu-1.5.0/criu/arch/x86/include/asm/types.h000066400000000000000000000015561471504326700214310ustar00rootroot00000000000000#ifndef __CR_ASM_TYPES_H__ #define __CR_ASM_TYPES_H__ #include #include #include "page.h" #include "bitops.h" #include "asm/int.h" #include #include "images/core.pb-c.h" static inline int core_is_compat(CoreEntry *c) { switch (c->thread_info->gpregs->mode) { case USER_X86_REGS_MODE__NATIVE: return 0; case USER_X86_REGS_MODE__COMPAT: return 1; default: return -1; } } #define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__X86_64 #define CORE_THREAD_ARCH_INFO(core) core->thread_info #define TI_IP(core) ((core)->thread_info->gpregs->ip) typedef UserX86RegsEntry UserRegsEntry; static inline u64 encode_pointer(void *p) { return (u64)(long)p; } static inline void *decode_pointer(u64 v) { return (void *)(long)v; } #define AT_VECTOR_SIZE 44 typedef uint64_t auxv_t; #endif /* __CR_ASM_TYPES_H__ */ crac-criu-1.5.0/criu/arch/x86/include/asm/vdso.h000066400000000000000000000043711471504326700212360ustar00rootroot00000000000000#ifndef __CR_ASM_VDSO_H__ #define __CR_ASM_VDSO_H__ #include "asm/int.h" #include "asm-generic/vdso.h" /* This definition is used in pie/util-vdso.c to initialize the vdso symbol * name string table 'vdso_symbols' */ /* * This is a minimal amount of symbols * we should support at the moment. 
*/ #define VDSO_SYMBOL_MAX 6 #define VDSO_SYMBOL_GTOD 2 /* * XXX: we don't patch __kernel_vsyscall as it's too small: * * byte *before* *after* * 0x0 push %ecx mov $[rt-vdso],%eax * 0x1 push %edx ^ * 0x2 push %ebp ^ * 0x3 mov %esp,%ebp ^ * 0x5 sysenter jmp *%eax * 0x7 int $0x80 int3 * 0x9 pop %ebp int3 * 0xa pop %edx int3 * 0xb pop %ecx pop %ecx * 0xc ret ret * * As restarting a syscall is quite likely after restore, * the patched version quitly crashes. * vsyscall will be patched again when addressing: * https://github.com/checkpoint-restore/criu/issues/512 */ #define ARCH_VDSO_SYMBOLS_LIST \ const char *aarch_vdso_symbol1 = "__vdso_clock_gettime"; \ const char *aarch_vdso_symbol2 = "__vdso_getcpu"; \ const char *aarch_vdso_symbol3 = "__vdso_gettimeofday"; \ const char *aarch_vdso_symbol4 = "__vdso_time"; \ const char *aarch_vdso_symbol5 = "__kernel_sigreturn"; \ const char *aarch_vdso_symbol6 = "__kernel_rt_sigreturn"; #define ARCH_VDSO_SYMBOLS \ aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4, aarch_vdso_symbol5, \ aarch_vdso_symbol6 /* "__kernel_vsyscall", */ #ifndef ARCH_MAP_VDSO_32 #define ARCH_MAP_VDSO_32 0x2002 #endif #ifndef ARCH_MAP_VDSO_64 #define ARCH_MAP_VDSO_64 0x2003 #endif #if defined(CONFIG_COMPAT) && !defined(__ASSEMBLY__) struct vdso_symtable; extern int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t); extern int vdso_fill_symtable_compat(uintptr_t mem, size_t size, struct vdso_symtable *t); static inline int __vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t, bool compat_vdso) { if (compat_vdso) return vdso_fill_symtable_compat(mem, size, t); else return vdso_fill_symtable(mem, size, t); } #endif #endif /* __CR_ASM_VDSO_H__ */ crac-criu-1.5.0/criu/arch/x86/kerndat.c000066400000000000000000000134341471504326700175030ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "compel/asm/fpu.h" #include 
"compel/plugins/std/syscall-codes.h" #include "cpu.h" #include "kerndat.h" #include "log.h" #include "types.h" #include "asm/compat.h" #include "asm/dump.h" int kdat_can_map_vdso(void) { pid_t child; int stat; /* * Running under fork so if vdso_64 is disabled - don't create * it for criu accidentally. */ child = fork(); if (child < 0) { pr_perror("%s(): failed to fork()", __func__); return -1; } if (child == 0) { int ret; ret = syscall(SYS_arch_prctl, ARCH_MAP_VDSO_32, 0); if (ret == 0) exit(1); /* * Mapping vDSO while have not unmap it yet: * this is restricted by API if ARCH_MAP_VDSO_* is supported. */ if (ret == -1 && errno == EEXIST) exit(1); exit(0); } if (waitpid(child, &stat, 0) != child) { pr_err("Failed to wait for arch_prctl() test\n"); kill(child, SIGKILL); return -1; } if (!WIFEXITED(stat)) return -1; return WEXITSTATUS(stat); } #ifdef CONFIG_COMPAT void *mmap_ia32(void *addr, size_t len, int prot, int flags, int fildes, off_t off) { struct syscall_args32 s; s.nr = __NR32_mmap2; s.arg0 = (uint32_t)(uintptr_t)addr; s.arg1 = (uint32_t)len; s.arg2 = prot; s.arg3 = flags; s.arg4 = fildes; s.arg5 = (uint32_t)off; return (void *)(uintptr_t)do_full_int80(&s); } /* * The idea of the test: * From kernel's top-down allocator we assume here that * 1. A = mmap(0, ...); munmap(A); * 2. B = mmap(0, ...); * results in A == B. * ...but if we have 32-bit mmap() bug, then A will have only lower * 4 bytes of 64-bit address allocated with mmap(). * That means, that the next mmap() will return B != A * (as munmap(A) hasn't really unmapped A mapping). * * As mapping with lower 4 bytes of A may really exist, we run * this test under fork(). * * Another approach to test bug's presence would be to parse * /proc/self/maps before and after 32-bit mmap(), but that would * be soo slow. 
*/ static void mmap_bug_test(void) { void *map1, *map2; int err; map1 = mmap_ia32(0, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); /* 32-bit error, not sign-extended - can't use IS_ERR_VALUE() here */ err = (uintptr_t)map1 % PAGE_SIZE; if (err) { pr_err("ia32 mmap() failed: %d\n", err); exit(1); } if (munmap(map1, PAGE_SIZE)) { pr_perror("Failed to unmap() 32-bit mapping"); exit(1); } map2 = mmap_ia32(0, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); err = (uintptr_t)map2 % PAGE_SIZE; if (err) { pr_err("ia32 mmap() failed: %d\n", err); exit(1); } if (map1 != map2) exit(1); exit(0); } /* * Pre v4.12 kernels have a bug: for a process started as 64-bit * 32-bit mmap() may return 8 byte pointer. * Which is fatal for us: after 32-bit C/R a task will map 64-bit * addresses, cut upper 4 bytes and try to use lower 4 bytes. * This is a check if the bug was fixed in the kernel. */ static int has_32bit_mmap_bug(void) { pid_t child = fork(); int stat; if (child < 0) { pr_perror("%s(): failed to fork()", __func__); return -1; } if (child == 0) mmap_bug_test(); if (waitpid(child, &stat, 0) != child) { pr_err("Failed to wait for mmap test\n"); kill(child, SIGKILL); return -1; } if (!WIFEXITED(stat) || WEXITSTATUS(stat) != 0) return 1; return 0; } int kdat_compatible_cr(void) { if (!kdat.can_map_vdso) return 0; if (has_32bit_mmap_bug()) return 0; return 1; } #else /* !CONFIG_COMPAT */ int kdat_compatible_cr(void) { return 0; } #endif static int kdat_x86_has_ptrace_fpu_xsave_bug_child(void *arg) { if (ptrace(PTRACE_TRACEME, 0, 0, 0)) { int err = errno; pr_pwarn("%d: ptrace(PTRACE_TRACEME) failed", getpid()); _exit(err == EPERM ? 2 : 1); } if (kill(getpid(), SIGSTOP)) pr_perror("%d: failed to kill myself", getpid()); pr_err("Continue after SIGSTOP.. 
Urr what?\n"); _exit(1); } /* * Pre v4.14 kernels have a bug on Skylake CPUs: * copyout_from_xsaves() creates fpu state for * ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) * without MXCSR and MXCSR_FLAGS if there is SSE/YMM state, but no FP state. * That is xfeatures had either/both XFEATURE_MASK_{SSE,YMM} set, but not * XFEATURE_MASK_FP. * But we *really* need to C/R MXCSR & MXCSR_FLAGS if SSE/YMM active, * as mxcsr store part of the state. */ int kdat_x86_has_ptrace_fpu_xsave_bug(void) { user_fpregs_struct_t xsave = {}; struct iovec iov; char stack[PAGE_SIZE]; int flags = CLONE_VM | CLONE_FILES | CLONE_UNTRACED | SIGCHLD; int ret = -1; pid_t child; int stat; /* OSXSAVE can't be changed during boot. */ if (!compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) return 0; child = clone(kdat_x86_has_ptrace_fpu_xsave_bug_child, stack + ARRAY_SIZE(stack), flags, 0); if (child < 0) { pr_perror("%s(): failed to clone()", __func__); return -1; } if (waitpid(child, &stat, WUNTRACED) != child) { /* * waitpid() may end with ECHILD if SIGCHLD == SIG_IGN, * and the child has stopped already. */ pr_perror("Failed to wait for %s() test", __func__); goto out_kill; } if (!WIFSTOPPED(stat)) { pr_warn("Born child is unstoppable! (might be dead)\n"); if (WIFEXITED(stat) && WEXITSTATUS(stat) == 2) { ret = 0; } goto out_kill; } iov.iov_base = &xsave; iov.iov_len = sizeof(xsave); if (ptrace(PTRACE_GETREGSET, child, (unsigned)NT_X86_XSTATE, &iov) < 0) { pr_perror("Can't obtain FPU registers for %d", child); goto out_kill; } /* * MXCSR should be never 0x0: e.g., it should contain either: * R+/R-/RZ/RN to determine rounding model. 
*/ ret = !xsave.i387.mxcsr; out_kill: if (kill(child, SIGKILL)) pr_pwarn("Failed to kill my own child"); if (waitpid(child, &stat, 0) < 0) pr_pwarn("Failed wait for a dead child"); return ret; } crac-criu-1.5.0/criu/arch/x86/restorer.c000066400000000000000000000047351471504326700177240ustar00rootroot00000000000000#include #include #include "types.h" #include "restorer.h" #include "asm/compat.h" #include "asm/restorer.h" #include #include #include #include #include "log.h" #include "cpu.h" int arch_map_vdso(unsigned long map_at, bool compatible) { int vdso_type = compatible ? ARCH_MAP_VDSO_32 : ARCH_MAP_VDSO_64; pr_debug("Mapping %s vDSO at %lx\n", compatible ? "compatible" : "native", map_at); return sys_arch_prctl(vdso_type, map_at); } int restore_nonsigframe_gpregs(UserX86RegsEntry *r) { long ret; unsigned long fsgs_base; fsgs_base = r->fs_base; ret = sys_arch_prctl(ARCH_SET_FS, fsgs_base); if (ret) { pr_info("SET_FS fail %ld\n", ret); return -1; } fsgs_base = r->gs_base; ret = sys_arch_prctl(ARCH_SET_GS, fsgs_base); if (ret) { pr_info("SET_GS fail %ld\n", ret); return -1; } return 0; } #ifdef CONFIG_COMPAT int set_compat_robust_list(uint32_t head_ptr, uint32_t len) { struct syscall_args32 s = { .nr = __NR32_set_robust_list, .arg0 = head_ptr, .arg1 = len, }; return do_full_int80(&s); } static int prepare_stack32(void **stack32) { if (*stack32) return 0; *stack32 = alloc_compat_syscall_stack(); if (!*stack32) { pr_err("Failed to allocate stack for 32-bit TLS restore\n"); return -1; } return 0; } void restore_tls(tls_t *ptls) { /* * We need here compatible stack, because 32-bit syscalls get * 4-byte pointer and _usally_ restorer is also under 4Gb, but * it can be upper and then pointers are messed up. * (we lose high 4 bytes and... BANG!) * Nothing serious, but syscall will return -EFAULT - or if we're * lucky and lower 4 bytes points on some writeable VMA - corruption). 
*/ void *stack32 = NULL; unsigned i; for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { user_desc_t *desc = &ptls->desc[i]; int ret; if (desc->seg_not_present) continue; if (prepare_stack32(&stack32) < 0) return; memcpy(stack32, desc, sizeof(user_desc_t)); asm volatile(" mov %1,%%eax \n" " mov %2,%%ebx \n" " int $0x80 \n" " mov %%eax,%0 \n" : "=g"(ret) : "r"(__NR32_set_thread_area), "r"((uint32_t)(uintptr_t)stack32) : "eax", "ebx", "r8", "r9", "r10", "r11", "memory"); if (ret) pr_err("Failed to restore TLS descriptor %u in GDT: %d\n", desc->entry_number, ret); } if (stack32) free_compat_syscall_stack(stack32); } #endif crac-criu-1.5.0/criu/arch/x86/restorer_unmap.S000066400000000000000000000004161471504326700210740ustar00rootroot00000000000000#include "common/asm/linkage.h" #include "compel/plugins/std/syscall-codes.h" .text ENTRY(__export_unmap_compat) .code32 mov bootstrap_start, %ebx mov bootstrap_len, %ecx sub vdso_rt_size, %ecx movl $__NR32_munmap, %eax int $0x80 int $0x03 /* Guard */ .code64 crac-criu-1.5.0/criu/arch/x86/sigaction_compat.c000066400000000000000000000026521471504326700213760ustar00rootroot00000000000000#include "log.h" #include "asm/restorer.h" #include #include "asm/compat.h" #include #ifdef CR_NOGLIBC #include #endif #include "cpu.h" asm(" .pushsection .text \n" " .global restore_rt_sigaction \n" " .code32 \n" "restore_rt_sigaction: \n" " mov %edx, %esi \n" " mov $0, %edx \n" " movl $" __stringify(__NR32_rt_sigaction) ",%eax \n" " int $0x80 \n" " ret \n" " .popsection \n" " .code64"); extern char restore_rt_sigaction; /* * Call raw rt_sigaction syscall through int80 - so the ABI kernel choses * to deliver this signal would be i386. 
*/ int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act) { struct syscall_args32 arg = {}; unsigned long act_stack = (unsigned long)stack32; /* To make sure the 32-bit stack was allocated in caller */ if (act_stack >= (uint32_t)-1) { pr_err("compat rt_sigaction without 32-bit stack\n"); return -1; } /* * To be sure, that sigaction pointer lies under 4G, * coping it on the bottom of the stack. */ memcpy(stack32, act, sizeof(rt_sigaction_t_compat)); arg.nr = __NR32_rt_sigaction; arg.arg0 = sig; arg.arg1 = (uint32_t)act_stack; /* act */ arg.arg2 = 0; /* oldact */ arg.arg3 = (uint32_t)sizeof(act->rt_sa_mask); /* sigsetsize */ return do_full_int80(&arg); } crac-criu-1.5.0/criu/arch/x86/sigaction_compat_pie.c000077700000000000000000000000001471504326700257102sigaction_compat.custar00rootroot00000000000000crac-criu-1.5.0/criu/arch/x86/sigframe.c000066400000000000000000000020211471504326700176360ustar00rootroot00000000000000#include #include #include "asm/sigframe.h" #include "asm/types.h" #include "log.h" int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { /* * Use local sigframe to check native/compat type, * but set address for rsigframe. */ fpu_state_t *fpu_state = (sigframe->is_native) ? 
&rsigframe->native.fpu_state : &rsigframe->compat.fpu_state; if (sigframe->is_native) { unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_64.xsave; if ((addr % 64ul)) { pr_err("Unaligned address passed: %lx (native %d)\n", addr, sigframe->is_native); return -1; } sigframe->native.uc.uc_mcontext.fpstate = (uint64_t)addr; } else { unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_ia32.xsave; sigframe->compat.uc.uc_mcontext.fpstate = (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; if ((addr % 64ul)) { pr_err("Unaligned address passed: %lx (native %d)\n", addr, sigframe->is_native); return -1; } } return 0; } crac-criu-1.5.0/criu/arch/x86/sys-exec-tbl.c000066400000000000000000000017341471504326700203720ustar00rootroot00000000000000 static struct syscall_exec_desc sc_exec_table_64[] = { #include "sys-exec-tbl-64.c" {}, /* terminator */ }; #ifdef CONFIG_COMPAT static struct syscall_exec_desc sc_exec_table_32[] = { #include "sys-exec-tbl-32.c" {}, /* terminator */ }; #endif struct syscall_exec_desc; static inline struct syscall_exec_desc *find_syscall_table(char *name, struct syscall_exec_desc *tbl) { int i; for (i = 0; tbl[i].name != NULL; i++) if (!strcmp(tbl[i].name, name)) return &tbl[i]; return NULL; } #define ARCH_HAS_FIND_SYSCALL /* overwrite default to search in two tables above */ #ifdef CONFIG_COMPAT struct syscall_exec_desc *find_syscall(char *name, struct parasite_ctl *ctl) { if (compel_mode_native(ctl)) return find_syscall_table(name, sc_exec_table_64); else return find_syscall_table(name, sc_exec_table_32); } #else struct syscall_exec_desc *find_syscall(char *name, __always_unused struct parasite_ctl *ctl) { return find_syscall_table(name, sc_exec_table_64); } #endif crac-criu-1.5.0/criu/arch/x86/vdso-pie.c000066400000000000000000000026641471504326700176040ustar00rootroot00000000000000#include #include "asm/types.h" #include #include #include "parasite-vdso.h" #include "log.h" #include "common/bug.h" #ifdef 
LOG_PREFIX #undef LOG_PREFIX #endif #define LOG_PREFIX "vdso: " static void insert_trampoline32(uintptr_t from, uintptr_t to) { struct { u8 movl; u32 imm32; u16 jmp_eax; u32 guards; } __packed jmp = { .movl = 0xb8, .imm32 = (uint32_t)to, .jmp_eax = 0xe0ff, .guards = 0xcccccccc, }; memcpy((void *)from, &jmp, sizeof(jmp)); } static void insert_trampoline64(uintptr_t from, uintptr_t to) { struct { u16 movabs; u64 imm64; u16 jmp_rax; u32 guards; } __packed jmp = { .movabs = 0xb848, .imm64 = to, .jmp_rax = 0xe0ff, .guards = 0xcccccccc, }; memcpy((void *)from, &jmp, sizeof(jmp)); } int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *sto, struct vdso_symtable *sfrom, bool compat_vdso) { unsigned int i; for (i = 0; i < ARRAY_SIZE(sto->symbols); i++) { uintptr_t from, to; if (vdso_symbol_empty(&sfrom->symbols[i])) continue; pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n", base_from, sfrom->symbols[i].offset, base_to, sto->symbols[i].offset, i); from = base_from + sfrom->symbols[i].offset; to = base_to + sto->symbols[i].offset; if (!compat_vdso) insert_trampoline64(from, to); else insert_trampoline32(from, to); } return 0; } crac-criu-1.5.0/criu/autofs.c000066400000000000000000000624251471504326700160160ustar00rootroot00000000000000#include #include #include #include #include #include "int.h" #include "fdinfo.h" #include "autofs.h" #include "rst-malloc.h" #include "mount.h" #include "pstree.h" #include "namespaces.h" #include "protobuf.h" #include "pipes.h" #include "crtools.h" #include "util.h" #include "images/autofs.pb-c.h" #define AUTOFS_OPT_UNKNOWN INT_MIN #define AUTOFS_MODE_DIRECT 0 #define AUTOFS_MODE_INDIRECT 1 #define AUTOFS_MODE_OFFSET 2 #define AUTOFS_CATATONIC_FD -1 static int autofs_mnt_open(const char *mnt_path, dev_t devid); struct autofs_pipe_s { struct list_head list; unsigned long inode; }; struct list_head autofs_pipes = LIST_HEAD_INIT(autofs_pipes); bool is_autofs_pipe(unsigned long inode) { struct autofs_pipe_s 
*p; list_for_each_entry(p, &autofs_pipes, list) { if (p->inode == inode) return true; } return false; } static int autofs_gather_pipe(unsigned long inode) { struct autofs_pipe_s *pipe; pipe = xmalloc(sizeof(*pipe)); if (!pipe) return -1; pipe->inode = inode; list_add_tail(&pipe->list, &autofs_pipes); return 0; } int autofs_parse(struct mount_info *pm) { long pipe_ino = AUTOFS_OPT_UNKNOWN; char **opts; int nr_opts, i, ret; split(pm->options, ',', &opts, &nr_opts); if (!opts) return -1; for (i = 0; i < nr_opts; i++) { if (!strncmp(opts[i], "pipe_ino=", strlen("pipe_ino="))) if (xatol(opts[i] + strlen("pipe_ino="), &pipe_ino)) { pr_err("pipe_ino (%s) mount option parse failed\n", opts[i] + strlen("pipe_ino=")); ret = -1; goto free; } } /* * We must inform user about bug if pipe_ino is greater than UINT32_MAX, * because it means that something changed in Linux Kernel virtual fs * inode numbers generation mechanism. What we have at the moment: * 1. struct inode i_ino field (include/linux/fs.h in Linux kernel) * has unsigned long type. * 2. get_next_ino() function (fs/inode.c), that used for generating inode * numbers on virtual filesystems (pipefs, debugfs for instance) * has unsigned int as return type. * So, it means that ATM it is safe to keep uint32 type for pipe_id field * in pipe-data.proto. 
*/ if (pipe_ino > UINT32_MAX) { pr_err("overflow: pipe_ino > UINT32_MAX\n"); ret = -1; goto free; } if (pipe_ino == AUTOFS_OPT_UNKNOWN) { pr_warn("Failed to find pipe_ino option (old kernel?)\n"); ret = 0; goto free; } ret = autofs_gather_pipe(pipe_ino); free: for (i = 0; i < nr_opts; i++) xfree(opts[i]); xfree(opts); return ret; } static int autofs_check_fd_stat(struct stat *stat, int prgp, int fd, long ino, int *mode) { struct fdinfo_common fdinfo; if (!S_ISFIFO(stat->st_mode)) return 0; if (stat->st_ino != ino) return 0; if (parse_fdinfo_pid(prgp, fd, FD_TYPES__UND, &fdinfo)) return -1; *mode = fdinfo.flags & O_WRONLY; return 1; } static int autofs_kernel_pipe_alive(int pgrp, int fd, int ino) { struct stat buf; char *path; int ret, fd_mode; path = xsprintf("/proc/%d/fd/%d", pgrp, fd); if (!path) return -1; if (stat(path, &buf) < 0) { if (errno == ENOENT) { xfree(path); return 0; } pr_perror("Failed to stat %s", path); xfree(path); return -1; } xfree(path); ret = autofs_check_fd_stat(&buf, pgrp, fd, ino, &fd_mode); if (ret <= 0) return ret; return O_WRONLY == fd_mode; } static int autofs_find_pipe_read_end(int pgrp, long ino, int *read_fd) { DIR *dir; struct dirent *de; int ret = -1; dir = opendir_proc(pgrp, "fd"); if (dir == NULL) return -1; *read_fd = -1; while ((de = readdir(dir))) { struct stat buf; int found, mode, fd; if (dir_dots(de)) continue; if (fstatat(dirfd(dir), de->d_name, &buf, 0) < 0) { pr_perror("Failed to fstatat"); goto out; } ret = xatoi(de->d_name, &fd); if (ret) goto out; found = autofs_check_fd_stat(&buf, pgrp, fd, ino, &mode); if (found < 0) goto out; if (found && (mode == O_RDONLY)) { *read_fd = fd; break; } } ret = 0; out: closedir(dir); close_pid_proc(); return ret; } static int autofs_find_read_fd(int pgrp, long pipe_ino) { int read_fd, fd; /* We need to find read end and make sure, that it's empty */ if (autofs_find_pipe_read_end(pgrp, pipe_ino, &read_fd) < 0) { pr_err("Failed to find read pipe fd (ino %ld) " "in process %d\n", 
pipe_ino, pgrp); return -1; } if (read_fd == -1) { pr_err("Master %d doesn't have a read end of the pipe with " "inode %ld opened\n", pgrp, pipe_ino); pr_err("Abandoned mount or control was delegated to child?\n"); return -ENOENT; } /* Let's check, that read end is empty */ fd = open_proc(pgrp, "fd/%d", read_fd); if (fd < 0) return -1; if (fd_has_data(fd)) { pr_err("Process %d autofs pipe fd %d is not empty.\n", pgrp, read_fd); pr_err("Try again later.\n"); return -1; } close(fd); return read_fd; } static int parse_options(char *options, AutofsEntry *entry, long *pipe_ino) { char **opts; int nr_opts, i; int parse_error = 0; entry->fd = AUTOFS_OPT_UNKNOWN; entry->timeout = AUTOFS_OPT_UNKNOWN; entry->minproto = AUTOFS_OPT_UNKNOWN; entry->maxproto = AUTOFS_OPT_UNKNOWN; entry->mode = AUTOFS_OPT_UNKNOWN; entry->pgrp = AUTOFS_OPT_UNKNOWN; entry->uid = AUTOFS_OPT_UNKNOWN; entry->gid = AUTOFS_OPT_UNKNOWN; *pipe_ino = AUTOFS_OPT_UNKNOWN; split(options, ',', &opts, &nr_opts); if (!opts) return -1; for (i = 0; i < nr_opts; i++) { char *opt = opts[i]; int err = 0; if (!strncmp(opt, "fd=", strlen("fd="))) err = xatoi(opt + strlen("fd="), &entry->fd); else if (!strncmp(opt, "pipe_ino=", strlen("pipe_ino="))) err = xatol(opt + strlen("pipe_ino="), pipe_ino); else if (!strncmp(opt, "pgrp=", strlen("pgrp="))) err = xatoi(opt + strlen("pgrp="), &entry->pgrp); else if (!strncmp(opt, "timeout=", strlen("timeout="))) err = xatoi(opt + strlen("timeout="), &entry->timeout); else if (!strncmp(opt, "minproto=", strlen("minproto="))) err = xatoi(opt + strlen("minproto="), &entry->minproto); else if (!strncmp(opt, "maxproto=", strlen("maxproto="))) err = xatoi(opt + strlen("maxproto="), &entry->maxproto); else if (!strcmp(opt, "indirect")) entry->mode = AUTOFS_MODE_INDIRECT; else if (!strcmp(opt, "offset")) entry->mode = AUTOFS_MODE_OFFSET; else if (!strcmp(opt, "direct")) entry->mode = AUTOFS_MODE_DIRECT; else if (!strncmp(opt, "uid=", strlen("uid="))) err = xatoi(opt + strlen("uid="), 
&entry->uid); else if (!strncmp(opt, "gid=", strlen("gid="))) err = xatoi(opt + strlen("gid="), &entry->gid); if (err) { parse_error = 1; break; } } for (i = 0; i < nr_opts; i++) xfree(opts[i]); xfree(opts); if (parse_error) return -1; if (entry->fd == AUTOFS_OPT_UNKNOWN) { pr_err("Failed to find fd option\n"); return -1; } if (entry->pgrp == AUTOFS_OPT_UNKNOWN) { pr_err("Failed to find pgrp option\n"); return -1; } if (entry->timeout == AUTOFS_OPT_UNKNOWN) { pr_err("Failed to find timeout option\n"); return -1; } if (entry->minproto == AUTOFS_OPT_UNKNOWN) { pr_err("Failed to find minproto option\n"); return -1; } if (entry->maxproto == AUTOFS_OPT_UNKNOWN) { pr_err("Failed to find maxproto option\n"); return -1; } if (entry->mode == AUTOFS_OPT_UNKNOWN) { pr_err("Failed to find mode (direct,indirect,offset) option\n"); return -1; } if (*pipe_ino == AUTOFS_OPT_UNKNOWN) { pr_err("Failed to find pipe_ino option (old kernel?)\n"); return -1; } return 0; } static int autofs_revisit_options(struct mount_info *pm) { FILE *f; char *buf; int ret = -ENOMEM; buf = xmalloc(1024); if (!buf) { return -ENOMEM; } f = fopen_proc(getpid(), "mountinfo"); if (!f) goto free_str; while (fgets(buf, 1024, f)) { int mnt_id = -1; char *str = buf; char *token; /* Removing '/n' */ str[strlen(str) - 1] = '\0'; while ((token = strsep(&str, " ")) != NULL) { if (mnt_id == -1) { ret = xatoi(token, &mnt_id); if (ret) goto close_proc; if (mnt_id != pm->mnt_id) break; } else if (strstr(token, "pipe_ino=")) { ret = 0; free(pm->options); pm->options = xstrdup(token); if (!pm->options) pr_err("failed to duplicate string\n"); else ret = 0; goto close_proc; } } } pr_err("failed to find autofs mount with mnt_id %d\n", pm->mnt_id); ret = -ENOENT; close_proc: fclose(f); free_str: free(buf); return ret; } /* * To access the mount point we have to set proper mount namespace. * But, unfortunately, we have to set proper pid namespace as well, * because otherwise autofs driver won't find the autofs master. 
*/ static int access_autofs_mount(struct mount_info *pm) { const char *mnt_path = service_mountpoint(pm) + 1; dev_t dev_id = pm->s_dev; int new_pid_ns = -1, old_pid_ns = -1; int old_mnt_ns, old_cwd_fd; int autofs_mnt; int err = -1; int pid, status; /* * To be able to set proper pid namespace, we must open fd before * switching to the mount namespace. * The same applies to pid namespace fd to restore back. */ new_pid_ns = open_proc(pm->nsid->ns_pid, "ns/pid"); if (new_pid_ns < 0) return -1; old_pid_ns = open_proc(PROC_SELF, "ns/pid"); if (old_pid_ns < 0) goto close_new_pid_ns; if (switch_mnt_ns(pm->nsid->ns_pid, &old_mnt_ns, &old_cwd_fd)) { pr_err("failed to switch to mount namespace\n"); goto close_old_pid_ns; } err = restore_ns(new_pid_ns, &pid_ns_desc); new_pid_ns = -1; if (err) { pr_err("failed to restore pid namespace\n"); goto restore_mnt_ns; } autofs_mnt = autofs_mnt_open(mnt_path, dev_id); if (autofs_mnt < 0) goto restore_pid_ns; pid = fork(); switch (pid) { case -1: pr_err("failed to fork\n"); goto close_autofs_mnt; case 0: /* We don't care about results, all we need is to "touch" */ /* coverity[check_return] */ openat(autofs_mnt, mnt_path, O_RDONLY | O_NONBLOCK | O_DIRECTORY); _exit(0); } /* Here we also don't care about results */ waitpid(pid, &status, 0); err = autofs_revisit_options(pm); close_autofs_mnt: close(autofs_mnt); restore_pid_ns: if (restore_ns(old_pid_ns, &pid_ns_desc)) { pr_err("failed to restore pid namespace\n"); err = -1; } old_pid_ns = -1; restore_mnt_ns: if (restore_mnt_ns(old_mnt_ns, &old_cwd_fd)) { pr_err("failed to restore mount namespace\n"); err = -1; } close_old_pid_ns: if (old_pid_ns >= 0) close(old_pid_ns); close_new_pid_ns: if (new_pid_ns >= 0) close(new_pid_ns); return err; } static int autofs_create_entry(struct mount_info *pm, AutofsEntry *entry) { long pipe_ino; if (parse_options(pm->options, entry, &pipe_ino)) return -1; if (entry->uid != AUTOFS_OPT_UNKNOWN) entry->has_uid = true; if (entry->gid != AUTOFS_OPT_UNKNOWN) 
entry->has_gid = true; if (entry->fd != AUTOFS_CATATONIC_FD) { int found, read_fd, virt_pgrp; read_fd = autofs_find_read_fd(entry->pgrp, pipe_ino); if (read_fd < 0) { if (read_fd != -ENOENT) return -1; /* Ok, our read end doesn't exist. * There can be a case, when mount looks normal, but * it's a "hidden" or "abandoned" catatonic mount in * reality. * This can happen if: * 1) autofs master process has exited without switching * the mount to catatonic mode (or was killed). * 2) mount point was unmounted, but not propagated to * nested mount namespace with private mounts. * We can try handle these cases by accessing the mount * point. If it's catatonic, it will update it's * options, then we can read them again and dump it. */ if (access_autofs_mount(pm)) { pr_err("failed to access autofs %s\n", service_mountpoint(pm) + 1); return -1; } if (parse_options(pm->options, entry, &pipe_ino)) return -1; if (entry->fd == AUTOFS_CATATONIC_FD) return 0; pr_err("Autofs %d is alive, but unreachable.\n", pm->mnt_id); return -1; } /* Let' check whether write end is still open */ found = autofs_kernel_pipe_alive(entry->pgrp, entry->fd, pipe_ino); if (found < 0) { pr_err("Failed to check fd %d in process %d\n", entry->fd, entry->pgrp); return -1; } /* Write end is absent. we need to carry read end to restore. 
*/ if (!found) { entry->has_read_fd = true; entry->read_fd = read_fd; } /* We need to get virtual pgrp to restore mount */ virt_pgrp = pid_to_virt(entry->pgrp); if (!virt_pgrp) { pr_err("failed to find pstree item with pid %d\n", entry->pgrp); pr_err("Non-catatonic mount without master?\n"); return -1; } entry->pgrp = virt_pgrp; } return 0; } static int autofs_dump_entry(struct mount_info *pm, AutofsEntry *entry) { struct cr_img *img; int ret = -1; img = open_image(CR_FD_AUTOFS, O_DUMP, pm->s_dev); if (img) { ret = pb_write_one(img, entry, PB_AUTOFS); close_image(img); } return ret; } int autofs_dump(struct mount_info *pm) { AutofsEntry *entry; int err; entry = xmalloc(sizeof(*entry)); if (!entry) return -1; autofs_entry__init(entry); err = autofs_create_entry(pm, entry); if (err) goto free_entry; err = autofs_dump_entry(pm, entry); free_entry: free(entry); return err < 0 ? err : 0; } typedef struct autofs_info_s { struct pipe_info pi; AutofsEntry *entry; char *mnt_path; dev_t mnt_dev; struct mount_info *mi; struct pprep_head ph; } autofs_info_t; static int dup_pipe_info(struct pipe_info *pi, int flags, struct file_desc_ops *ops) { struct pipe_info *new; PipeEntry *pe; new = shmalloc(sizeof(*new)); if (!new) return -1; pe = shmalloc(sizeof(*pe)); if (!pe) return -1; pe->id = pi->pe->id; pe->pipe_id = pi->pe->pipe_id; pe->fown = pi->pe->fown; pe->flags = flags; if (collect_one_pipe_ops(new, &pe->base, ops) < 0) { pr_err("Failed to add pipe info for write end\n"); return -1; } return 0; } static int autofs_dup_pipe(struct pstree_item *task, struct fdinfo_list_entry *ple, int new_fd) { struct pipe_info *pi = container_of(ple->desc, struct pipe_info, d); unsigned flags = O_WRONLY; new_fd = find_unused_fd(task, new_fd); if (dup_pipe_info(pi, flags, pi->d.ops) < 0) { pr_err("Failed to dup pipe entry ID %#x PIPE_ID %#x\n", pi->pe->id, pi->pe->pipe_id); return -1; } if (dup_fle(task, ple, new_fd, flags) < 0) { pr_err("Failed to add fd %d to process %d\n", new_fd, 
vpid(task)); return -1; } pr_info("autofs: added pipe fd %d, flags %#x to %d\n", new_fd, flags, vpid(task)); return new_fd; } static int autofs_ioctl(const char *path, int fd, int cmd, const void *param) { int err; err = ioctl(fd, cmd, param); if (err) pr_perror("%s ioctl failed", path); return err; } static int autofs_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) { char *path = "/dev/" AUTOFS_DEVICE_NAME; int fd, err; fd = open(path, O_RDONLY); if (fd == -1) { pr_perror("failed to open %s", path); return -1; } err = autofs_ioctl(path, fd, cmd, param); close(fd); return err; } static int autofs_mnt_make_catatonic(const char *mnt_path, int mnt_fd) { pr_info("%s: set %s catatonic\n", __func__, mnt_path); return autofs_ioctl(mnt_path, mnt_fd, AUTOFS_IOC_CATATONIC, NULL); } static int autofs_mnt_set_timeout(time_t timeout, const char *mnt_path, int mnt_fd) { pr_info("%s: set timeout %ld for %s\n", __func__, timeout, mnt_path); return autofs_ioctl(mnt_path, mnt_fd, AUTOFS_IOC_SETTIMEOUT, &timeout); } static int autofs_mnt_set_pipefd(const autofs_info_t *i, int mnt_fd) { struct autofs_dev_ioctl param; /* Restore pipe and pgrp only for non-catatonic mounts */ if (i->entry->fd == AUTOFS_CATATONIC_FD) return 0; pr_info("%s: set pipe fd %d (pgrp %d) for mount %s\n", __func__, i->entry->fd, getpgrp(), i->mnt_path); init_autofs_dev_ioctl(¶m); param.ioctlfd = mnt_fd; param.setpipefd.pipefd = i->entry->fd; return autofs_dev_ioctl(AUTOFS_DEV_IOCTL_SETPIPEFD, ¶m); } static int autofs_mnt_close(const char *mnt_path, int mnt_fd) { struct autofs_dev_ioctl param; pr_info("%s: closing fd %d for mount %s\n", __func__, mnt_fd, mnt_path); init_autofs_dev_ioctl(¶m); param.ioctlfd = mnt_fd; return autofs_dev_ioctl(AUTOFS_DEV_IOCTL_CLOSEMOUNT, ¶m); } static int autofs_mnt_open(const char *mnt_path, dev_t devid) { struct autofs_dev_ioctl *param; int err; size_t size, fd; pr_info("%s: open mount %s\n", __func__, mnt_path); size = sizeof(*param) + strlen(mnt_path) + 1; param = 
xmalloc(size); if (!param) return -1; init_autofs_dev_ioctl(param); param->size = size; strcpy(param->path, mnt_path); param->openmount.devid = devid; err = autofs_dev_ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT, param); fd = param->ioctlfd; free(param); if (err < 0) { pr_err("Failed to get %s fd (devid: %ld)\n", mnt_path, (long)devid); return -1; } return fd; } static int autofs_create_dentries(const struct mount_info *mi, char *mnt_path) { struct mount_info *c; list_for_each_entry(c, &mi->children, siblings) { char *path, *rel_path; rel_path = get_relative_path(c->ns_mountpoint, mi->ns_mountpoint); if (!rel_path) { pr_err("Can't get path %s relative to %s\n", c->ns_mountpoint, mi->ns_mountpoint); return -1; } /* Skip children-overmount */ if (*rel_path == '\0') continue; path = xsprintf("%s/%s", mnt_path, rel_path); if (!path) return -1; if (mkdir(path, 0555) < 0) { pr_perror("Failed to create autofs dentry %s", path); free(path); return -1; } free(path); } return 0; } static int autofs_populate_mount(const struct mount_info *mi, const AutofsEntry *entry) { if (entry->mode != AUTOFS_MODE_INDIRECT) return 0; return autofs_create_dentries(mi, service_mountpoint(mi)); } static int autofs_post_mount(const char *mnt_path, dev_t mnt_dev, time_t timeout) { int mnt_fd; pr_info("%s: set timeout for %s and make it catatonic\n", __func__, mnt_path); mnt_fd = autofs_mnt_open(mnt_path, mnt_dev); if (mnt_fd < 0) { pr_err("Failed to open %s\n", mnt_path); return -1; } if (autofs_mnt_set_timeout(timeout, mnt_path, mnt_fd)) { pr_err("Failed to set timeout %ld for %s\n", timeout, mnt_path); return -1; } if (autofs_mnt_make_catatonic(mnt_path, mnt_fd)) { pr_err("Failed to set %s catatonic\n", mnt_path); return -1; } if (autofs_mnt_close(mnt_path, mnt_fd) < 0) { pr_err("Failed to close %s\n", mnt_path); return -1; } return 0; } /* Here to fixup Autofs mount */ static int autofs_post_open(struct file_desc *d, int fd) { struct pipe_info *pi = container_of(d, struct pipe_info, d); autofs_info_t 
*i = container_of(pi, autofs_info_t, pi); int mnt_fd; pr_info("%s: restoring %s\n", __func__, i->mnt_path); mnt_fd = autofs_mnt_open(i->mnt_path, i->mnt_dev); if (mnt_fd < 0) { pr_err("Failed to open %s\n", i->mnt_path); return -1; } if (autofs_mnt_set_pipefd(i, mnt_fd)) { pr_err("Failed to set %s owner\n", i->mnt_path); return -1; } if (autofs_mnt_close(i->mnt_path, mnt_fd) < 0) { pr_err("Failed to close %s\n", i->mnt_path); return -1; } pr_info("autofs mount %s owner restored: pgrp=%d, fd=%d\n", i->mnt_path, getpgrp(), i->entry->fd); if (i->entry->has_read_fd) { pr_info("%s: pid %d, closing write end %d\n", __func__, getpid(), i->entry->fd); close(i->entry->fd); } pr_info("%s: pid %d, closing artificial pipe end %d\n", __func__, getpid(), fd); close(fd); return 0; } static autofs_info_t *autofs_create_info(const struct mount_info *mi, const struct file_desc *desc, const autofs_info_t *info) { autofs_info_t *i; i = shmalloc(sizeof(*i)); if (!i) return NULL; i->mnt_path = shmalloc(strlen(mi->ns_mountpoint) + 1); if (!i->mnt_path) return NULL; /* Here we copy autofs dev_id and entry from private data to shared. * See autofs_mount(). */ i->entry = shmalloc(sizeof(*info->entry)); if (!i->entry) return NULL; memcpy(i->entry, info->entry, sizeof(*info->entry)); i->mnt_dev = info->mnt_dev; /* We need mountpoint to be able to open mount in autofs_post_open() * callback. And this have to be internal path, because process cwd * will be changed already. That's why ns_mountpoint is used. 
*/ strcpy(i->mnt_path, mi->ns_mountpoint); return i; } static struct fdinfo_list_entry *autofs_pipe_le(struct pstree_item *master, AutofsEntry *entry) { struct fdinfo_list_entry *ple; int pipe_fd = entry->fd; if (entry->has_read_fd) pipe_fd = entry->read_fd; ple = find_used_fd(master, pipe_fd); if (!ple) { pr_err("Failed to find pipe fd %d in process %d\n", pipe_fd, vpid(master)); return NULL; } if (ple->fe->type != FD_TYPES__PIPE) { pr_err("Fd %d in process %d is not a pipe: %d\n", pipe_fd, vpid(master), ple->fe->type); return NULL; } return ple; } static int autofs_open_pipefd(struct file_desc *d, int *new_fd) { struct fdinfo_list_entry *fle = file_master(d); int ret; if (fle->stage < FLE_OPEN) { ret = open_pipe(d, new_fd); if (ret != 0) return ret; set_fds_event(fle->pid); return 1; } return autofs_post_open(d, fle->fe->fd); } static int autofs_create_pipe(struct pstree_item *task, autofs_info_t *i, struct fdinfo_list_entry *ple) { struct pipe_info *pi = container_of(ple->desc, struct pipe_info, d); int fd = -1; FdinfoEntry *fe; unsigned flags = O_RDONLY; struct file_desc_ops *ops; PipeEntry *pe; fd = find_unused_fd(task, fd); ops = shmalloc(sizeof(*ops)); if (!ops) return -1; memcpy(ops, pi->d.ops, sizeof(*ops)); ops->open = autofs_open_pipefd; ops->type = FD_TYPES__AUTOFS_PIPE; pe = shmalloc(sizeof(*pe)); if (!pe) return -1; pe->id = pi->pe->id; pe->pipe_id = pi->pe->pipe_id; pe->fown = pi->pe->fown; pe->flags = flags; if (collect_one_pipe_ops(&i->pi, &pe->base, ops) < 0) { pr_err("Failed to add pipe info for write end\n"); return -1; } fe = dup_fdinfo(ple->fe, fd, flags); if (!fe) return -1; fe->type = FD_TYPES__AUTOFS_PIPE; pr_info("autofs: adding pipe fd %d, flags %#x to %d (with post_open)\n", fe->fd, fe->flags, vpid(task)); return collect_fd(vpid(task), fe, rsti(task), false); } static int autofs_add_mount_info(struct pprep_head *ph) { autofs_info_t *ai = container_of(ph, autofs_info_t, ph); struct mount_info *mi = ai->mi; autofs_info_t *info = 
mi->private; AutofsEntry *entry = info->entry; autofs_info_t *i; struct pstree_item *master; struct fdinfo_list_entry *ple; if (entry->fd == -1) /* Catatonic mounts have no owner. Keep them with init. */ master = pstree_item_by_virt(getpid()); else master = pstree_item_by_virt(entry->pgrp); BUG_ON(!master); ple = autofs_pipe_le(master, entry); if (!ple) return -1; if (entry->has_read_fd) { /* Original pipe write end was closed. * We need create one to be able to fixup AutoFS mount. */ entry->fd = autofs_dup_pipe(master, ple, entry->fd); if (entry->fd < 0) { pr_err("Failed to find free fd in process %d\n", vpid(master)); return -1; } } i = autofs_create_info(mi, ple->desc, info); if (!i) return -1; /* Another pipe descriptor is needed to call post_open callback */ if (autofs_create_pipe(master, i, ple)) return -1; mi->private = i; return 0; } static int autofs_restore_entry(struct mount_info *mi, AutofsEntry **entry) { struct cr_img *img; int ret; img = open_image(CR_FD_AUTOFS, O_RSTR, mi->s_dev); if (!img) return -1; if (empty_image(img)) { close_image(img); return -1; } ret = pb_read_one_eof(img, entry, PB_AUTOFS); close_image(img); if (ret < 0) return -1; return 0; } int autofs_mount(struct mount_info *mi, const char *source, const char *filesystemtype, unsigned long mountflags) { AutofsEntry *entry; autofs_info_t *info; char *opts, *mode; int control_pipe[2], ret = -1; struct stat buf; if (autofs_restore_entry(mi, &entry) < 0) return -1; if (pipe(control_pipe) < 0) { pr_perror("Can't create pipe"); return -1; } mode = "direct"; if (entry->mode == AUTOFS_MODE_INDIRECT) mode = "indirect"; if (entry->mode == AUTOFS_MODE_OFFSET) mode = "offset"; opts = xsprintf("fd=%d,pgrp=%d,minproto=%d,maxproto=%d,%s", control_pipe[1], getpgrp(), entry->minproto, entry->maxproto, mode); if (opts && entry->has_uid) opts = xstrcat(opts, ",uid=%d", entry->uid); if (opts && entry->has_gid) opts = xstrcat(opts, ",gid=%d", entry->gid); if (!opts) { pr_err("Failed to create options 
string\n"); goto close_pipe; } pr_info("autofs: mounting to %s with options: \"%s\"\n", service_mountpoint(mi), opts); if (mount(source, service_mountpoint(mi), filesystemtype, mountflags, opts) < 0) { pr_perror("Failed to mount autofs to %s", service_mountpoint(mi)); goto free_opts; } info = xmalloc(sizeof(*info)); if (!info) goto umount; info->entry = entry; /* We need autofs dev_id to be able to open direct mount point. * But we can't call stat in autofs_add_mount_info(), because autofs * mount can be overmounted. Thus we have to call it here. But shared * data is not ready yet. So, let's put in on mi->private and copy to * shared data in autofs_add_mount_info(). */ if (stat(service_mountpoint(mi), &buf) < 0) { pr_perror("Failed to stat %s", service_mountpoint(mi)); goto free_info; } info->mnt_dev = buf.st_dev; /* We need to create dentries for nested mounts */ ret = autofs_populate_mount(mi, entry); if (ret < 0) goto free_info; /* In case of catatonic mounts all we need as the function call below */ ret = autofs_post_mount(service_mountpoint(mi), buf.st_dev, entry->timeout); if (ret < 0) goto free_info; /* Otherwise we have to add shared object creation callback */ if (entry->fd != AUTOFS_CATATONIC_FD) { info->ph.actor = autofs_add_mount_info; add_post_prepare_cb(&info->ph); } info->mi = mi; mi->private = info; free_opts: free(opts); close_pipe: close(control_pipe[1]); close(control_pipe[0]); return ret; free_info: free(info); umount: if (umount(service_mountpoint(mi)) < 0) pr_perror("Failed to umount %s", service_mountpoint(mi)); goto close_pipe; } crac-criu-1.5.0/criu/bfd.c000066400000000000000000000127661471504326700152530ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "int.h" #include "log.h" #include "common/bug.h" #include "bfd.h" #include "common/list.h" #include "util.h" #include "xmalloc.h" #include "page.h" #undef LOG_PREFIX #define LOG_PREFIX "bfd: " /* * Kernel doesn't produce more than 
one page of * date per one read call on proc files. */ #define BUFSIZE (PAGE_SIZE) struct bfd_buf { char *mem; struct list_head l; }; static LIST_HEAD(bufs); #define BUFBATCH (16) static int buf_get(struct xbuf *xb) { struct bfd_buf *b; if (list_empty(&bufs)) { void *mem; int i; mem = mmap(NULL, BUFBATCH * BUFSIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (mem == MAP_FAILED) { pr_perror("No buf"); return -1; } for (i = 0; i < BUFBATCH; i++) { b = xmalloc(sizeof(*b)); if (!b) { if (i == 0) { pr_err("No buffer for bfd\n"); return -1; } pr_warn("BFD buffers partial refil!\n"); break; } b->mem = mem + i * BUFSIZE; list_add_tail(&b->l, &bufs); } } b = list_first_entry(&bufs, struct bfd_buf, l); list_del_init(&b->l); xb->mem = b->mem; xb->data = xb->mem; xb->sz = 0; xb->buf = b; return 0; } static void buf_put(struct xbuf *xb) { /* * Don't unmap buffer back, it will get reused * by next bfdopen call */ list_add(&xb->buf->l, &bufs); xb->buf = NULL; xb->mem = NULL; xb->data = NULL; } static int bfdopen(struct bfd *f, bool writable) { if (buf_get(&f->b)) { close_safe(&f->fd); return -1; } f->writable = writable; return 0; } int bfdopenr(struct bfd *f) { return bfdopen(f, false); } int bfdopenw(struct bfd *f) { return bfdopen(f, true); } static int bflush(struct bfd *bfd); static bool flush_failed = false; int bfd_flush_images(void) { return flush_failed ? -1 : 0; } void bclose(struct bfd *f) { if (bfd_buffered(f)) { if (f->writable && bflush(f) < 0) { /* * This is to propagate error up. It's * hardly possible by returning and * checking it, but setting a static * flag, failing further bfdopen-s and * checking one at the end would work. 
*/ flush_failed = true; pr_perror("Error flushing image"); } buf_put(&f->b); } close_safe(&f->fd); } static int brefill(struct bfd *f) { int ret; struct xbuf *b = &f->b; memmove(b->mem, b->data, b->sz); b->data = b->mem; ret = read_all(f->fd, b->mem + b->sz, BUFSIZE - b->sz); if (ret < 0) { pr_perror("Error reading file"); return -1; } if (ret == 0) return 0; b->sz += ret; return 1; } static char *strnchr(char *str, unsigned int len, char c) { while (len > 0 && *str != c) { str++; len--; } return len == 0 ? NULL : str; } char *breadline(struct bfd *f) { return breadchr(f, '\n'); } char *breadchr(struct bfd *f, char c) { struct xbuf *b = &f->b; bool refilled = false; char *n; unsigned int ss = 0; again: n = strnchr(b->data + ss, b->sz - ss, c); if (n) { char *ret; ret = b->data; b->data = n + 1; /* skip the \n found */ *n = '\0'; b->sz -= (b->data - ret); return ret; } if (refilled) { if (!b->sz) return NULL; if (b->sz == BUFSIZE) { pr_err("The bfd buffer is too small\n"); return ERR_PTR(-EIO); } /* * Last bytes may lack the \n at the * end, need to report this as full * line anyway */ b->data[b->sz] = '\0'; /* * The b->data still points to old data, * but we say that no bytes left there * so next call to breadline will not * "find" these bytes again. */ b->sz = 0; return b->data; } /* * small optimization -- we've scanned b->sz * symbols already, no need to re-scan them after * the buffer refill. 
*/ ss = b->sz; /* no full line in the buffer -- refill one */ if (brefill(f) < 0) return ERR_PTR(-EIO); refilled = true; goto again; } static int bflush(struct bfd *bfd) { struct xbuf *b = &bfd->b; int ret; if (!b->sz) return 0; ret = write_all(bfd->fd, b->data, b->sz); if (ret != b->sz) return -1; b->sz = 0; return 0; } static int __bwrite(struct bfd *bfd, const void *buf, int size) { struct xbuf *b = &bfd->b; if (b->sz + size > BUFSIZE) { int ret; ret = bflush(bfd); if (ret < 0) return ret; } if (size > BUFSIZE) return write_all(bfd->fd, buf, size); memcpy(b->data + b->sz, buf, size); b->sz += size; return size; } int bwrite(struct bfd *bfd, const void *buf, int size) { if (!bfd_buffered(bfd)) return write_all(bfd->fd, buf, size); return __bwrite(bfd, buf, size); } int bwritev(struct bfd *bfd, const struct iovec *iov, int cnt) { int i, written = 0; if (!bfd_buffered(bfd)) { /* * FIXME writev() should be called again if writev() writes * less bytes than requested. */ return writev(bfd->fd, iov, cnt); } for (i = 0; i < cnt; i++) { int ret; ret = __bwrite(bfd, (const void *)iov[i].iov_base, iov[i].iov_len); if (ret < 0) return ret; written += ret; if (ret < iov[i].iov_len) break; } return written; } int bread(struct bfd *bfd, void *buf, int size) { struct xbuf *b = &bfd->b; int more = 1, filled = 0; if (!bfd_buffered(bfd)) return read_all(bfd->fd, buf, size); while (more > 0) { int chunk; chunk = size - filled; if (chunk > b->sz) chunk = b->sz; if (chunk) { memcpy(buf + filled, b->data, chunk); b->data += chunk; b->sz -= chunk; filled += chunk; } if (filled < size) more = brefill(bfd); else { BUG_ON(filled > size); more = 0; } } return more < 0 ? 
more : filled; } crac-criu-1.5.0/criu/bitmap.c000066400000000000000000000024561471504326700157670ustar00rootroot00000000000000#include "common/bitsperlong.h" #define BIT_WORD(nr) ((nr) / BITS_PER_LONG) #define BITMAP_FIRST_WORD_MASK(start) (~0ul << ((start) % BITS_PER_LONG)) #define BITMAP_LAST_WORD_MASK(nbits) (((nbits) % BITS_PER_LONG) ? (1ul << ((nbits) % BITS_PER_LONG)) - 1 : ~0ul) #define small_const_nbits(nbits) (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) void bitmap_set(unsigned long *map, int start, int nr) { unsigned long *p = map + BIT_WORD(start); const int size = start + nr; int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); while (nr - bits_to_set >= 0) { *p |= mask_to_set; nr -= bits_to_set; bits_to_set = BITS_PER_LONG; mask_to_set = ~0UL; p++; } if (nr) { mask_to_set &= BITMAP_LAST_WORD_MASK(size); *p |= mask_to_set; } } void bitmap_clear(unsigned long *map, int start, int nr) { unsigned long *p = map + BIT_WORD(start); const int size = start + nr; int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); while (nr - bits_to_clear >= 0) { *p &= ~mask_to_clear; nr -= bits_to_clear; bits_to_clear = BITS_PER_LONG; mask_to_clear = ~0UL; p++; } if (nr) { mask_to_clear &= BITMAP_LAST_WORD_MASK(size); *p &= ~mask_to_clear; } } crac-criu-1.5.0/criu/bpfmap.c000066400000000000000000000236561471504326700157650ustar00rootroot00000000000000#include #include #include "common/compiler.h" #include "imgset.h" #include "bpfmap.h" #include "fdinfo.h" #include "image.h" #include "util.h" #include "log.h" #include "protobuf.h" #ifndef LIBBPF_OPTS #define LIBBPF_OPTS DECLARE_LIBBPF_OPTS #define LEGACY_LIBBPF /* Using libbpf < 0.7 */ #endif int is_bpfmap_link(char *link) { return is_anon_link_type(link, "bpf-map"); } static void pr_info_bpfmap(char *action, BpfmapFileEntry *bpf) { pr_info("%sbpfmap: id %#08x map_id %#08x map_type 
%d flags %" PRIx32 "\n", action, bpf->id, bpf->map_id, bpf->map_type, bpf->map_flags); } struct bpfmap_data_rst *bpfmap_data_hash_table[BPFMAP_DATA_TABLE_SIZE]; static int bpfmap_data_read(struct cr_img *img, struct bpfmap_data_rst *r) { unsigned long bytes = r->bde->keys_bytes + r->bde->values_bytes; if (!bytes) return 0; r->data = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); if (r->data == MAP_FAILED) { pr_perror("Can't map mem for bpfmap buffers"); return -1; } return read_img_buf(img, r->data, bytes); } int do_collect_bpfmap_data(struct bpfmap_data_rst *r, ProtobufCMessage *msg, struct cr_img *img, struct bpfmap_data_rst **bpf_hash_table) { int ret; int table_index; r->bde = pb_msg(msg, BpfmapDataEntry); ret = bpfmap_data_read(img, r); if (ret < 0) return ret; table_index = r->bde->map_id & BPFMAP_DATA_HASH_MASK; r->next = bpf_hash_table[table_index]; bpf_hash_table[table_index] = r; pr_info("Collected bpfmap data for %#x\n", r->bde->map_id); return 0; } int restore_bpfmap_data(int map_fd, uint32_t map_id, struct bpfmap_data_rst **bpf_hash_table) { struct bpfmap_data_rst *map_data; BpfmapDataEntry *bde; void *keys = NULL; void *values = NULL; unsigned int count; LIBBPF_OPTS(bpf_map_batch_opts, opts); for (map_data = bpf_hash_table[map_id & BPFMAP_DATA_HASH_MASK]; map_data != NULL; map_data = map_data->next) { if (map_data->bde->map_id == map_id) break; } if (!map_data || map_data->bde->count == 0) { pr_info("No data for BPF map %#x\n", map_id); return 0; } bde = map_data->bde; count = bde->count; keys = mmap(NULL, bde->keys_bytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); if (keys == MAP_FAILED) { pr_perror("Can't map memory for BPF map keys"); goto err; } memcpy(keys, map_data->data, bde->keys_bytes); values = mmap(NULL, bde->values_bytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); if (values == MAP_FAILED) { pr_perror("Can't map memory for BPF map values"); goto err; } memcpy(values, 
map_data->data + bde->keys_bytes, bde->values_bytes); if (bpf_map_update_batch(map_fd, keys, values, &count, &opts)) { pr_perror("Can't load key-value pairs to BPF map"); goto err; } munmap(keys, bde->keys_bytes); munmap(values, bde->values_bytes); return 0; err: munmap(keys, bde->keys_bytes); munmap(values, bde->values_bytes); return -1; } static int collect_bpfmap_data(void *obj, ProtobufCMessage *msg, struct cr_img *img) { return do_collect_bpfmap_data(obj, msg, img, bpfmap_data_hash_table); } struct collect_image_info bpfmap_data_cinfo = { .fd_type = CR_FD_BPFMAP_DATA, .pb_type = PB_BPFMAP_DATA, .priv_size = sizeof(struct bpfmap_data_rst), .collect = collect_bpfmap_data, }; int dump_one_bpfmap_data(BpfmapFileEntry *bpf, int lfd, const struct fd_parms *p) { /* * Linux kernel patch notes for bpf_map_*_batch(): * * in_batch/out_batch are opaque values use to communicate between * user/kernel space, in_batch/out_batch must be of key_size length. * To start iterating from the beginning in_batch must be null, * count is the # of key/value elements to retrieve. Note that the 'keys' * buffer must be a buffer of key_size * count size and the 'values' buffer * must be value_size * count, where value_size must be aligned to 8 bytes * by userspace if it's dealing with percpu maps. 'count' will contain the * number of keys/values successfully retrieved. Note that 'count' is an * input/output variable and it can contain a lower value after a call. * * If there's no more entries to retrieve, ENOENT will be returned. If error * is ENOENT, count might be > 0 in case it copied some values but there were * no more entries to retrieve. * * Note that if the return code is an error and not -EFAULT, * count indicates the number of elements successfully processed. 
*/ struct cr_img *img; uint32_t key_size, value_size, max_entries, count; void *keys = NULL, *values = NULL; void *in_batch = NULL, *out_batch = NULL; BpfmapDataEntry bde = BPFMAP_DATA_ENTRY__INIT; LIBBPF_OPTS(bpf_map_batch_opts, opts); int ret; key_size = bpf->key_size; value_size = bpf->value_size; max_entries = bpf->max_entries; count = max_entries; keys = mmap(NULL, key_size * max_entries, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); if (keys == MAP_FAILED) { pr_perror("Can't map memory for BPF map keys"); goto err; } values = mmap(NULL, value_size * max_entries, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); if (values == MAP_FAILED) { pr_perror("Can't map memory for BPF map values"); goto err; } out_batch = mmap(NULL, key_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); if (out_batch == MAP_FAILED) { pr_perror("Can't map memory for BPF map out_batch"); goto err; } ret = bpf_map_lookup_batch(lfd, in_batch, out_batch, keys, values, &count, &opts); if (ret && errno != ENOENT) { pr_perror("Can't perform a batch lookup on BPF map"); goto err; } img = img_from_set(glob_imgset, CR_FD_BPFMAP_DATA); bde.map_id = bpf->map_id; bde.keys_bytes = (key_size * count); bde.values_bytes = (value_size * count); bde.count = count; if (pb_write_one(img, &bde, PB_BPFMAP_DATA)) goto err; if (write(img_raw_fd(img), keys, key_size * count) != (key_size * count)) { pr_perror("Can't write BPF map's keys"); goto err; } if (write(img_raw_fd(img), values, value_size * count) != (value_size * count)) { pr_perror("Can't write BPF map's values"); goto err; } munmap(keys, key_size * max_entries); munmap(values, value_size * max_entries); munmap(out_batch, key_size); return 0; err: munmap(keys, key_size * max_entries); munmap(values, value_size * max_entries); munmap(out_batch, key_size); return -1; } static int dump_one_bpfmap(int lfd, u32 id, const struct fd_parms *p) { BpfmapFileEntry bpf = BPFMAP_FILE_ENTRY__INIT; FileEntry fe = FILE_ENTRY__INIT; 
int ret; /* If we are using a bigger struct than the kernel knows of, * ensure all the unknown bits are 0 - i.e. new user-space * does not rely on any unknown kernel feature extensions. * https://github.com/torvalds/linux/blob/a1994480/kernel/bpf/syscall.c#L70 */ struct bpf_map_info map_info = {}; uint32_t info_len = sizeof(struct bpf_map_info); if (parse_fdinfo(lfd, FD_TYPES__BPFMAP, &bpf)) return -1; ret = bpf_obj_get_info_by_fd(lfd, &map_info, &info_len); if (ret) { pr_perror("Could not get BPF map info"); return -1; } switch (bpf.map_type) { case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_ARRAY: bpf.id = id; bpf.flags = p->flags; bpf.fown = (FownEntry *)&p->fown; bpf.map_name = xstrdup(map_info.name); bpf.ifindex = map_info.ifindex; fe.type = FD_TYPES__BPFMAP; fe.id = bpf.id; fe.bpf = &bpf; pr_info_bpfmap("Dumping ", &bpf); if (pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE)) return -1; pr_info_bpfmap("Dumping data for ", &bpf); ret = dump_one_bpfmap_data(&bpf, lfd, p); break; default: pr_err("CRIU does not currently support dumping BPF map type %u!\n", bpf.map_type); ret = -1; } return ret; } const struct fdtype_ops bpfmap_dump_ops = { .type = FD_TYPES__BPFMAP, .dump = dump_one_bpfmap, }; static int bpfmap_open(struct file_desc *d, int *new_fd) { struct bpfmap_file_info *info; BpfmapFileEntry *bpfe; int bpfmap_fd; #ifdef LEGACY_LIBBPF struct bpf_create_map_attr xattr; #else LIBBPF_OPTS(bpf_map_create_opts, bpfmap_opts); #endif info = container_of(d, struct bpfmap_file_info, d); bpfe = info->bpfe; pr_info_bpfmap("Creating and opening ", bpfe); #ifdef LEGACY_LIBBPF xattr.name = xstrdup(bpfe->map_name); xattr.map_type = bpfe->map_type; xattr.map_flags = bpfe->map_flags; xattr.key_size = bpfe->key_size; xattr.value_size = bpfe->value_size; xattr.max_entries = bpfe->max_entries; xattr.numa_node = 0; xattr.btf_fd = 0; xattr.btf_key_type_id = 0; xattr.btf_value_type_id = 0; xattr.map_ifindex = bpfe->ifindex; xattr.inner_map_fd = 0; bpfmap_fd = 
bpf_create_map_xattr(&xattr); #else bpfmap_opts.map_flags = bpfe->map_flags; bpfmap_opts.map_ifindex = bpfe->ifindex; if (bpfe->has_map_extra) bpfmap_opts.map_extra = bpfe->map_extra; bpfmap_fd = bpf_map_create(bpfe->map_type, bpfe->map_name, bpfe->key_size, bpfe->value_size, bpfe->max_entries, &bpfmap_opts); #endif if (bpfmap_fd < 0) { pr_perror("Can't create bpfmap %#08x", bpfe->id); return -1; } if (bpfe->has_map_extra && bpfe->map_extra) pr_warn("bpfmap map_extra has non-zero value. This will not be restored.\n"); if (restore_bpfmap_data(bpfmap_fd, bpfe->map_id, bpfmap_data_hash_table)) return -1; if (bpfe->frozen) { if (bpf_map_freeze(bpfmap_fd)) { pr_perror("Can't freeze bpfmap %#08x", bpfe->id); goto err_close; } } if (rst_file_params(bpfmap_fd, bpfe->fown, bpfe->flags)) { pr_perror("Can't restore params on bpfmap %#08x", bpfe->id); goto err_close; } *new_fd = bpfmap_fd; return 0; err_close: close(bpfmap_fd); return -1; } static struct file_desc_ops bpfmap_desc_ops = { .type = FD_TYPES__BPFMAP, .open = bpfmap_open, }; static int collect_one_bpfmap(void *obj, ProtobufCMessage *msg, struct cr_img *i) { struct bpfmap_file_info *info = obj; info->bpfe = pb_msg(msg, BpfmapFileEntry); pr_info_bpfmap("Collected ", info->bpfe); return file_desc_add(&info->d, info->bpfe->id, &bpfmap_desc_ops); } struct collect_image_info bpfmap_cinfo = { .fd_type = CR_FD_BPFMAP_FILE, .pb_type = PB_BPFMAP_FILE, .priv_size = sizeof(struct bpfmap_file_info), .collect = collect_one_bpfmap, }; crac-criu-1.5.0/criu/cgroup-props.c000066400000000000000000000306261471504326700171530ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "int.h" #include "common/config.h" #include "common/compiler.h" #include "cgroup-props.h" #include "cr_options.h" #include "xmalloc.h" #include "string.h" #include "util.h" #include "common/list.h" #include "log.h" #include "common/bug.h" #undef LOG_PREFIX #define LOG_PREFIX "cg-prop: " enum { CGP_MERGE, 
CGP_REPLACE, }; static const char *____criu_global_props____[] = { "cgroup.clone_children", "notify_on_release", "cgroup.procs", "tasks", }; /* cgroup2 global properties */ // clang-format off static const char *____criu_global_props_v2____[] = { "cgroup.subtree_control", "cgroup.max.descendants", "cgroup.max.depth", "cgroup.freeze", "cgroup.type", }; // clang-format on cgp_t cgp_global = { .name = "____criu_global_props____", .nr_props = ARRAY_SIZE(____criu_global_props____), .props = ____criu_global_props____, }; cgp_t cgp_global_v2 = { .name = "____criu_global_props_v2____", .nr_props = ARRAY_SIZE(____criu_global_props_v2____), .props = ____criu_global_props_v2____, }; typedef struct { struct list_head list; cgp_t cgp; } cgp_list_entry_t; static LIST_HEAD(cgp_list); static void cgp_free(cgp_list_entry_t *p) { size_t i; if (p) { for (i = 0; i < p->cgp.nr_props; i++) xfree((void *)p->cgp.props[i]); xfree((void *)p->cgp.name); xfree((void *)p->cgp.props); xfree(p); } } static int cgp_merge_props(cgp_list_entry_t *d, cgp_list_entry_t *s) { size_t nr_props, i, j; nr_props = d->cgp.nr_props + s->cgp.nr_props; if (xrealloc_safe(&d->cgp.props, nr_props * sizeof(char *))) return -ENOMEM; /* * FIXME: Check for duplicates in propties? */ for (i = d->cgp.nr_props, j = 0; i < nr_props; i++, j++) { d->cgp.props[i] = xstrdup(s->cgp.props[j]); if (!d->cgp.props[i]) return -ENOMEM; d->cgp.nr_props++; } return 0; } static int cgp_handle_props(cgp_list_entry_t **p, int strategy) { cgp_list_entry_t *s = *p; cgp_list_entry_t *t; list_for_each_entry(t, &cgp_list, list) { if (strcmp(t->cgp.name, s->cgp.name)) continue; pr_debug("%s \"%s\" controller properties\n", strategy == CGP_MERGE ? "Merging" : "Replacing", s->cgp.name); if (strategy == CGP_MERGE) { int ret; ret = cgp_merge_props(t, s); cgp_free(s); *p = NULL; return ret; } else if (strategy == CGP_REPLACE) { /* * Simply drop out previous instance. 
*/ list_del(&t->list); cgp_free(t); break; } else BUG(); } /* * New controller, simply add it. */ list_add(&s->list, &cgp_list); *p = NULL; return 0; } static char *skip_spaces(char **stream, size_t *len) { if (stream && *len) { char *p = *stream; while (p && *len && *p == ' ') p++, (*len)--; if (p != *stream) *stream = p; return p; } return NULL; } static bool eat_symbol(char **stream, size_t *len, char sym, bool skip_ws) { char *p = skip_ws ? skip_spaces(stream, len) : (stream ? *stream : NULL); if (!p || *p != sym || !*len) return false; (*stream) = p + 1; (*len)--; return true; } static bool eat_symbols(char **stream, size_t *len, char *syms, size_t n_syms, bool skip_ws) { char *p = skip_ws ? skip_spaces(stream, len) : (stream ? *stream : NULL); size_t i; if (p && *len) { char *stream_orig = *stream; size_t len_orig = *len; for (i = 0; i < n_syms; i++) { if (!eat_symbol(stream, len, syms[i], false)) { *stream = stream_orig; *len = len_orig; goto nomatch; } } return true; } nomatch: return false; } static bool eat_word(char **stream, size_t *len, char *word, size_t word_len, bool skip_ws) { char *p = skip_ws ? skip_spaces(stream, len) : (stream ? *stream : NULL); if (p && *len >= word_len) { if (!strncmp(p, word, word_len)) { (*stream) += word_len; (*len) -= word_len; return true; } } return false; } static char *get_quoted(char **stream, size_t *len, bool skip_ws) { char *p = skip_ws ? skip_spaces(stream, len) : (stream ? *stream : NULL); char *from = p + 1; char *dst; if (!p || *p != '\"') return NULL; for (p = from, (*len)--; (*len); p++, (*len)--) { if (*p == '\"') { if (p == from) break; dst = xmalloc(p - from + 1); if (!dst) break; memcpy(dst, from, p - from); dst[p - from] = '\0'; (*stream) = p + 1; (*len)--; return dst; } } return NULL; } static int cgp_parse_stream(char *stream, size_t len) { cgp_list_entry_t *cgp_entry = NULL; int strategy; int ret = 0; char *p; /* * We expect the following format here * (very simplified YAML!) 
* * "cpu": * - "strategy": "replace" * - "properties": ["cpu.shares", "cpu.cfs_period_us"] * "memory": * - "strategy": "merge" * - "properties": ["memory.limit_in_bytes", "memory.memsw.limit_in_bytes"] * * and etc. */ while (len) { /* * Controller name. */ p = get_quoted(&stream, &len, false); if (!p) { pr_err("Expecting controller name\n"); goto err_parse; } pr_info("Parsing controller \"%s\"\n", p); cgp_entry = xzalloc(sizeof(*cgp_entry)); if (cgp_entry) { INIT_LIST_HEAD(&cgp_entry->list); cgp_entry->cgp.name = p; } else { pr_err("Can't allocate memory for controller %s\n", p); xfree(p); return -ENOMEM; } if (!eat_symbols(&stream, &len, ":\n - ", 5, true)) { pr_err("Expected \':\\n - \' sequence controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } if (!eat_word(&stream, &len, "\"strategy\":", 11, true)) { pr_err("Expected \'strategy:\' keyword in controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } p = get_quoted(&stream, &len, true); if (!p) { pr_err("Expected strategy in controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; }; if (!strcmp(p, "merge")) { strategy = CGP_MERGE; } else if (!strcmp(p, "replace")) { strategy = CGP_REPLACE; } else { pr_err("Unknown strategy \"%s\" in controller's %s stream\n", p, cgp_entry->cgp.name); xfree(p); goto err_parse; } pr_info("\tStrategy \"%s\"\n", p); xfree(p); if (!eat_symbols(&stream, &len, "\n - ", 4, true)) { pr_err("Expected \':\\n - \' sequence controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } if (!eat_word(&stream, &len, "\"properties\":", 13, true)) { pr_err("Expected \"properties:\" keyword in controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } if (!eat_symbol(&stream, &len, '[', true)) { pr_err("Expected \'[\' sequence controller's %s properties stream\n", cgp_entry->cgp.name); goto err_parse; } while ((p = get_quoted(&stream, &len, true))) { if (!p) { pr_err("Expected property name for controller %s\n", cgp_entry->cgp.name); goto err_parse; } 
if (xrealloc_safe(&cgp_entry->cgp.props, (cgp_entry->cgp.nr_props + 1) * sizeof(char *))) { pr_err("Can't allocate property for controller %s\n", cgp_entry->cgp.name); xfree(p); goto err_parse; } cgp_entry->cgp.props[cgp_entry->cgp.nr_props++] = p; pr_info("\tProperty \"%s\"\n", p); if (!eat_symbol(&stream, &len, ',', true)) { if (stream[0] == ']') { stream++, len--; break; } pr_err("Expected ']' in controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } } if (cgp_entry->cgp.nr_props == 0 && !eat_symbol(&stream, &len, ']', true)) { pr_err("Expected ']' in empty property list for %s\n", cgp_entry->cgp.name); goto err_parse; } if (!eat_symbol(&stream, &len, '\n', true) && len) { pr_err("Expected \'\\n\' symbol in controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } if (cgp_handle_props(&cgp_entry, strategy)) goto err_parse; cgp_entry = NULL; } ret = 0; out: return ret; err_parse: cgp_free(cgp_entry); ret = -EINVAL; goto out; } static int cgp_parse_file(char *path) { void *mem = MAP_FAILED; int fd = -1, ret = -1; struct stat st; fd = open(path, O_RDONLY); if (fd < 0) { pr_perror("Can't open file %s", path); goto err; } if (fstat(fd, &st)) { pr_perror("Can't stat file %s", path); goto err; } mem = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE, fd, 0); if (mem == MAP_FAILED) { pr_perror("Can't mmap file %s", path); goto err; } if (cgp_parse_stream(mem, st.st_size)) { pr_err("Failed to parse file `%s'\n", path); goto err; } ret = 0; err: if (mem != MAP_FAILED) munmap(mem, st.st_size); close_safe(&fd); return ret; } static int cgp_parse_builtins(void) { static const char predefined_stream[] = "\"cpu\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "\"cpu.shares\", " "\"cpu.cfs_period_us\", " "\"cpu.cfs_quota_us\", " "\"cpu.rt_period_us\", " "\"cpu.rt_runtime_us\" " "]\n" /* limit_in_bytes and memsw.limit_in_bytes must be set in this order */ "\"memory\":\n" " - \"strategy\": \"replace\"\n" " - 
\"properties\": " "[ " "\"memory.limit_in_bytes\", " "\"memory.memsw.limit_in_bytes\", " "\"memory.swappiness\", " "\"memory.soft_limit_in_bytes\", " "\"memory.move_charge_at_immigrate\", " "\"memory.oom_control\", " "\"memory.use_hierarchy\", " "\"memory.kmem.limit_in_bytes\", " "\"memory.kmem.tcp.limit_in_bytes\" " "]\n" /* * cpuset.cpus and cpuset.mems must be set before the process moves * into its cgroup; they are "initialized" below to whatever the root * values are in copy_special_cg_props so as not to cause ENOSPC when * values are restored via this code. */ "\"cpuset\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "\"cpuset.cpus\", " "\"cpuset.mems\", " "\"cpuset.memory_migrate\", " "\"cpuset.cpu_exclusive\", " "\"cpuset.mem_exclusive\", " "\"cpuset.mem_hardwall\", " "\"cpuset.memory_spread_page\", " "\"cpuset.memory_spread_slab\", " "\"cpuset.sched_load_balance\", " "\"cpuset.sched_relax_domain_level\" " "]\n" "\"blkio\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "\"blkio.weight\" " "]\n" "\"freezer\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "]\n" "\"perf_event\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "]\n" "\"net_cls\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "\"net_cls.classid\" " "]\n" "\"net_prio\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "\"net_prio.ifpriomap\" " "]\n" "\"pids\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "\"pids.max\" " "]\n" "\"devices\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "\"devices.list\" " "]\n"; return cgp_parse_stream((void *)predefined_stream, strlen(predefined_stream)); } int cgp_init(char *stream, size_t len, char *path) { int ret; ret = cgp_parse_builtins(); if (ret) goto err; if (stream && len) { ret = cgp_parse_stream(stream, len); if (ret) goto err; } if (path) ret = cgp_parse_file(path); err: return ret; } static char **dump_controllers; 
static size_t nr_dump_controllers; bool cgp_add_dump_controller(const char *name) { if (xrealloc_safe(&dump_controllers, (nr_dump_controllers + 1) * sizeof(char *))) { pr_err("Can't add controller \"%s\" to mark\n", name); return false; } dump_controllers[nr_dump_controllers] = xstrdup(name); if (!dump_controllers[nr_dump_controllers]) return false; pr_debug("Mark controller \"%s\" to dump\n", name); nr_dump_controllers++; return true; } bool cgp_should_skip_controller(const char *name) { size_t i; /* * Dump all by default. */ if (!nr_dump_controllers) return false; for (i = 0; i < nr_dump_controllers; i++) { if (!strcmp(name, dump_controllers[i])) return false; } return true; } const cgp_t *cgp_get_props(const char *name) { cgp_list_entry_t *p; list_for_each_entry(p, &cgp_list, list) { if (!strcmp(p->cgp.name, name)) return &p->cgp; } return NULL; } void cgp_fini(void) { cgp_list_entry_t *p, *t; size_t i; list_for_each_entry_safe(p, t, &cgp_list, list) cgp_free(p); INIT_LIST_HEAD(&cgp_list); for (i = 0; i < nr_dump_controllers; i++) xfree(dump_controllers[i]); xfree(dump_controllers); nr_dump_controllers = 0; } crac-criu-1.5.0/criu/cgroup.c000066400000000000000000001460051471504326700160110ustar00rootroot00000000000000#define LOG_PREFIX "cg: " #include #include #include #include #include #include #include #include #include #include #include "common/list.h" #include "xmalloc.h" #include "cgroup.h" #include "cgroup-props.h" #include "cr_options.h" #include "pstree.h" #include "criu-log.h" #include "util.h" #include "imgset.h" #include "util-pie.h" #include "namespaces.h" #include "seize.h" #include "string.h" #include "protobuf.h" #include "images/core.pb-c.h" #include "images/cgroup.pb-c.h" #include "kerndat.h" #include "linux/mount.h" #include "syscall.h" /* * This structure describes set of controller groups * a task lives in. The cg_ctl entries are stored in * the @ctls list sorted by the .name field and then * by the .path field. 
*/ struct cg_set { u32 id; struct list_head l; unsigned int n_ctls; struct list_head ctls; }; static LIST_HEAD(cg_sets); static unsigned int n_sets; static CgSetEntry **rst_sets; static unsigned int n_controllers; static CgControllerEntry **controllers; static char *cg_yard; static struct cg_set *root_cgset; /* Set root item lives in */ static struct cg_set *criu_cgset; /* Set criu process lives in */ static u32 cg_set_ids = 1; static LIST_HEAD(cgroups); static unsigned int n_cgroups; static pid_t cgroupd_pid; static CgSetEntry *find_rst_set_by_id(u32 id) { int i; for (i = 0; i < n_sets; i++) if (rst_sets[i]->id == id) return rst_sets[i]; return NULL; } #define CGCMP_MATCH 1 /* check for exact match */ #define CGCMP_ISSUB 2 /* check set is subset of ctls */ static bool cg_set_compare(struct cg_set *set, struct list_head *ctls, int what) { struct list_head *l1 = &set->ctls, *l2 = ctls; while (1) { struct cg_ctl *c1 = NULL, *c2 = NULL; if (l1->next != &set->ctls) c1 = list_first_entry(l1, struct cg_ctl, l); if (l2->next != ctls) c2 = list_first_entry(l2, struct cg_ctl, l); if (!c1 || !c2) /* Nowhere to move next */ return !c1 && !c2; /* Both lists scanned -- match */ if (strcmp(c1->name, c2->name)) return false; switch (what) { case CGCMP_MATCH: /* must have the same cgns prefix to be considered equal */ if (c1->cgns_prefix != c2->cgns_prefix) return false; if (strcmp(c1->path, c2->path)) return false; break; case CGCMP_ISSUB: if (!strstartswith(c1->path, c2->path)) return false; break; } l1 = l1->next; l2 = l2->next; } } static int collect_cgroups(struct list_head *ctls); static struct cg_set *get_cg_set(struct list_head *ctls, unsigned int n_ctls, bool collect) { struct cg_set *cs; list_for_each_entry(cs, &cg_sets, l) if (cg_set_compare(cs, ctls, CGCMP_MATCH)) { pr_debug(" `- Existing css %d found\n", cs->id); put_ctls(ctls); return cs; } pr_debug(" `- New css ID %d\n", cg_set_ids); cs = xmalloc(sizeof(*cs)); if (cs) { cs->id = cg_set_ids++; 
INIT_LIST_HEAD(&cs->ctls); list_splice_init(ctls, &cs->ctls); cs->n_ctls = n_ctls; list_add_tail(&cs->l, &cg_sets); n_sets++; if (!pr_quelled(LOG_DEBUG)) { struct cg_ctl *ctl; list_for_each_entry(ctl, &cs->ctls, l) pr_debug(" `- [%s] -> [%s] [%u]\n", ctl->name, ctl->path, ctl->cgns_prefix); } if (collect && collect_cgroups(&cs->ctls)) { list_del(&cs->l); n_sets--; put_ctls(&cs->ctls); xfree(cs); return NULL; } } return cs; } struct cg_controller *new_controller(const char *name) { struct cg_controller *nc = xmalloc(sizeof(*nc)); if (!nc) return NULL; nc->controllers = xmalloc(sizeof(char *)); if (!nc->controllers) { xfree(nc); return NULL; } nc->controllers[0] = xstrdup(name); if (!nc->controllers[0]) { xfree(nc->controllers); xfree(nc); return NULL; } nc->n_controllers = 1; nc->n_heads = 0; nc->is_threaded = false; INIT_LIST_HEAD(&nc->heads); return nc; } int parse_cg_info(void) { if (collect_controllers(&cgroups, &n_cgroups) < 0) return -1; return 0; } /* Check that co-mounted controllers from /proc/cgroups (e.g. cpu and cpuacct) * are contained in a comma separated string (e.g. from /proc/self/cgroup or * mount options). 
*/ static bool cgroup_contains(char **controllers, unsigned int n_controllers, char *name, u64 *mask) { unsigned int i; bool all_match = true; /* Check whether this cgroup2 or not.*/ if (n_controllers == 1 && controllers[0][0] == 0) { bool match = name[0] == 0; if (mask && match) *mask &= ~(1ULL); return match; } for (i = 0; i < n_controllers; i++) { bool found = false; const char *loc = name; do { loc = strstr(loc, controllers[i]); if (loc) { loc += strlen(controllers[i]); switch (*loc) { case '\0': case ',': found = true; if (mask) *mask &= ~(1ULL << i); break; } } } while (loc); all_match &= found; } return all_match && n_controllers > 0; } /* This is for use in add_cgroup() as additional arguments for the ftw() * callback */ static struct cg_controller *current_controller; static unsigned int path_pref_len; #define EXACT_MATCH 0 #define PARENT_MATCH 1 #define NO_MATCH 2 static int find_dir(const char *path, struct list_head *dirs, struct cgroup_dir **rdir) { struct cgroup_dir *d; list_for_each_entry(d, dirs, siblings) { if (strcmp(d->path, path) == 0) { *rdir = d; return EXACT_MATCH; } if (strstartswith(path, d->path)) { int ret = find_dir(path, &d->children, rdir); if (ret == NO_MATCH) { *rdir = d; return PARENT_MATCH; } return ret; } } return NO_MATCH; } /* * Strips trailing '\n' from the string */ static inline char *strip(char *str) { char *e; e = strchr(str, '\0'); if (e != str && *(e - 1) == '\n') *(e - 1) = '\0'; return str; } /* * Currently this function only supports properties that have a string value * under 1024 chars. 
*/ static int read_cgroup_prop(struct cgroup_prop *property, const char *fullpath) { char buf[1024]; int fd, ret; struct stat sb; fd = open(fullpath, O_RDONLY); if (fd == -1) { property->value = NULL; pr_perror("Failed opening %s", fullpath); return -1; } if (fstat(fd, &sb) < 0) { pr_perror("failed statting cgroup prop %s", fullpath); close(fd); return -1; } property->mode = sb.st_mode; property->uid = sb.st_uid; property->gid = sb.st_gid; /* skip dumping the value of these, since it doesn't make sense (we * just want to restore the perms) */ if (!strcmp(property->name, "cgroup.procs") || !strcmp(property->name, "tasks")) { ret = 0; /* libprotobuf segfaults if we leave a null pointer in a * string, so let's not do that */ property->value = xstrdup(""); if (!property->value) ret = -1; close(fd); return ret; } ret = read(fd, buf, sizeof(buf) - 1); if (ret == -1) { pr_perror("Failed scanning %s", fullpath); close(fd); return -1; } close(fd); buf[ret] = 0; if (strtoll(buf, NULL, 10) == LLONG_MAX) strcpy(buf, "-1"); property->value = xstrdup(strip(buf)); if (!property->value) return -1; return 0; } static struct cgroup_prop *create_cgroup_prop(const char *name) { struct cgroup_prop *property; property = xmalloc(sizeof(*property)); if (!property) return NULL; property->name = xstrdup(name); if (!property->name) { xfree(property); return NULL; } property->value = NULL; return property; } static void free_cgroup_prop(struct cgroup_prop *prop) { xfree(prop->name); xfree(prop->value); xfree(prop); } static void free_all_cgroup_props(struct cgroup_dir *ncd) { struct cgroup_prop *prop, *t; list_for_each_entry_safe(prop, t, &ncd->properties, list) { list_del(&prop->list); free_cgroup_prop(prop); } INIT_LIST_HEAD(&ncd->properties); ncd->n_properties = 0; } static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const cgp_t *cgp, struct cg_controller *controller) { int j; char buf[PATH_MAX]; struct cgroup_prop *prop; for (j = 0; cgp && j < cgp->nr_props; j++) { 
if (snprintf(buf, PATH_MAX, "%s/%s", fpath, cgp->props[j]) >= PATH_MAX) { pr_err("snprintf output was truncated\n"); return -1; } if (access(buf, F_OK) < 0 && errno == ENOENT) { pr_info("Couldn't open %s. This cgroup property may not exist on this kernel\n", buf); continue; } prop = create_cgroup_prop(cgp->props[j]); if (!prop) { free_all_cgroup_props(ncd); return -1; } if (read_cgroup_prop(prop, buf) < 0) { free_cgroup_prop(prop); free_all_cgroup_props(ncd); return -1; } if (!strcmp("memory.oom_control", cgp->props[j])) { char *new; int disable; if (sscanf(prop->value, "oom_kill_disable %d\n", &disable) != 1) { pr_err("couldn't scan oom state from %s\n", prop->value); free_cgroup_prop(prop); free_all_cgroup_props(ncd); return -1; } if (asprintf(&new, "%d", disable) < 0) { pr_err("couldn't allocate new oom value\n"); free_cgroup_prop(prop); free_all_cgroup_props(ncd); return -1; } xfree(prop->value); prop->value = new; } /* * Set the is_threaded flag if cgroup.type's value is threaded, * ignore all other values. 
*/ if (!strcmp("cgroup.type", prop->name) && !strcmp("threaded", prop->value)) controller->is_threaded = true; pr_info("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name); list_add_tail(&prop->list, &ncd->properties); ncd->n_properties++; } return 0; } static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd, struct cg_controller *controller) { int i; for (i = 0; i < controller->n_controllers; ++i) { const cgp_t *cgp = cgp_get_props(controller->controllers[i]); if (dump_cg_props_array(fpath, ncd, cgp, controller) < 0) { pr_err("dumping known properties failed\n"); return -1; } } /* cgroup v2 */ if (controller->controllers[0][0] == 0) { if (dump_cg_props_array(fpath, ncd, &cgp_global_v2, controller) < 0) { pr_err("dumping global properties v2 failed\n"); return -1; } } else { if (dump_cg_props_array(fpath, ncd, &cgp_global, controller) < 0) { pr_err("dumping global properties failed\n"); return -1; } } return 0; } static int add_cgroup(const char *fpath, const struct stat *sb, int typeflag) { struct cgroup_dir *ncd = NULL, *match; int exit_code = -1; if (typeflag == FTW_D) { int mtype; pr_info("adding cgroup %s\n", fpath); ncd = xmalloc(sizeof(*ncd)); if (!ncd) goto out; ncd->mode = sb->st_mode; ncd->uid = sb->st_uid; ncd->gid = sb->st_gid; /* chop off the first "/proc/self/fd/N" str */ if (fpath[path_pref_len] == '\0') ncd->path = xstrdup("/"); else ncd->path = xstrdup(fpath + path_pref_len); if (!ncd->path) goto out; mtype = find_dir(ncd->path, ¤t_controller->heads, &match); switch (mtype) { /* ignore co-mounted cgroups and already dumped cgroups */ case EXACT_MATCH: exit_code = 0; goto out; case PARENT_MATCH: list_add_tail(&ncd->siblings, &match->children); match->n_children++; break; case NO_MATCH: list_add_tail(&ncd->siblings, ¤t_controller->heads); current_controller->n_heads++; break; default: BUG(); } INIT_LIST_HEAD(&ncd->children); ncd->n_children = 0; INIT_LIST_HEAD(&ncd->properties); ncd->n_properties = 0; if 
(add_cgroup_properties(fpath, ncd, current_controller) < 0) { list_del(&ncd->siblings); if (mtype == PARENT_MATCH) match->n_children--; else if (mtype == NO_MATCH) current_controller->n_heads--; goto out; } } return 0; out: if (ncd) xfree(ncd->path); xfree(ncd); return exit_code; } static int add_freezer_state(struct cg_controller *controller) { struct cgroup_dir *it; /* There is one more case, that cgroup namespaces might * generate "multiple" heads if nothing is actually in the * root freezer cgroup, e.g. --freeze-cgroup=/lxc/foo and all * tasks in either /lxc/foo/a or /lxc/foo/b. * * In this case */ list_for_each_entry(it, &controller->heads, siblings) { struct cgroup_dir *cg_head; struct cgroup_prop *prop; cg_head = list_first_entry(&controller->heads, struct cgroup_dir, siblings); prop = create_cgroup_prop("freezer.state"); if (!prop) return -1; prop->value = xstrdup(get_real_freezer_state()); if (!prop->value) { free_cgroup_prop(prop); return -1; } list_add_tail(&prop->list, &cg_head->properties); cg_head->n_properties++; } return 0; } static const char namestr[] = "name="; static int __new_open_cgroupfs(struct cg_ctl *cc) { const char *fstype = cc->name[0] == 0 ? 
"cgroup2" : "cgroup"; int fsfd, fd; char *name; fsfd = sys_fsopen(fstype, 0); if (fsfd < 0) { pr_perror("Unable to open the cgroup file system"); return -1; } if (strstartswith(cc->name, namestr)) { if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "name", cc->name + strlen(namestr), 0)) { pr_perror("Unable to configure the cgroup (%s) file system", cc->name); goto err; } } else if (cc->name[0] != 0) { /* cgroup v1 */ char *saveptr = NULL, *buf = strdupa(cc->name); name = strtok_r(buf, ",", &saveptr); while (name) { if (sys_fsconfig(fsfd, FSCONFIG_SET_FLAG, name, NULL, 0)) { pr_perror("Unable to configure the cgroup (%s) file system", name); goto err; } name = strtok_r(NULL, ",", &saveptr); } } if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { pr_perror("Unable to create the cgroup (%s) file system", cc->name); goto err; } fd = sys_fsmount(fsfd, 0, 0); if (fd < 0) pr_perror("Unable to mount the cgroup (%s) file system", cc->name); close(fsfd); return fd; err: close(fsfd); return -1; } static int open_cgroupfs(struct cg_ctl *cc) { const char *fstype = cc->name[0] == 0 ? "cgroup2" : "cgroup"; char prefix[] = ".criu.cgmounts.XXXXXX"; char mopts[1024]; int fd; if (kdat.has_fsopen) return __new_open_cgroupfs(cc); if (strstartswith(cc->name, namestr)) snprintf(mopts, sizeof(mopts), "none,%s", cc->name); else snprintf(mopts, sizeof(mopts), "%s", cc->name); if (mkdtemp(prefix) == NULL) { pr_perror("can't make dir for cg mounts"); return -1; } if (mount("none", prefix, fstype, 0, mopts[0] ? mopts : NULL) < 0) { pr_perror("Unable to mount %s %s", fstype, mopts); rmdir(prefix); return -1; } fd = open_detach_mount(prefix); if (fd < 0) return -1; return fd; } static int collect_cgroups(struct list_head *ctls) { struct cg_ctl *cc; int ret = 0; int fd = -1; list_for_each_entry(cc, ctls, l) { char path[PATH_MAX], *root; struct cg_controller *cg; struct cg_root_opt *o; current_controller = NULL; /* We should get all the "real" (i.e. 
not name=systemd type) * controller from parse_cgroups(), so find that controller if * it exists. */ list_for_each_entry(cg, &cgroups, l) { if (cgroup_contains(cg->controllers, cg->n_controllers, cc->name, NULL)) { current_controller = cg; break; } } if (!current_controller) { /* only allow "fake" controllers to be created this way */ if (!strstartswith(cc->name, namestr)) { pr_err("controller %s not found\n", cc->name); return -1; } else { struct cg_controller *nc; nc = new_controller(cc->name); if (!nc) return -1; list_add_tail(&nc->l, &cg->l); n_cgroups++; current_controller = nc; } } if (!opts.manage_cgroups) continue; if (opts.cgroup_yard) { char dir_path[PATH_MAX]; int off; off = snprintf(dir_path, PATH_MAX, "%s/", opts.cgroup_yard); if (strstartswith(cc->name, namestr)) snprintf(dir_path + off, PATH_MAX - off, "%s", cc->name + strlen(namestr)); else if (cc->name[0] == 0) snprintf(dir_path + off, PATH_MAX - off, "unified"); else snprintf(dir_path + off, PATH_MAX - off, "%s", cc->name); fd = open(dir_path, O_RDONLY | O_DIRECTORY, 0); if (fd < 0) { pr_perror("couldn't open %s", dir_path); return -1; } } else { fd = open_cgroupfs(cc); if (fd < 0) return -1; } path_pref_len = snprintf(path, PATH_MAX, "/proc/self/fd/%d", fd); root = cc->path; if (opts.new_global_cg_root) root = opts.new_global_cg_root; list_for_each_entry(o, &opts.new_cgroup_roots, node) { if (!strcmp(cc->name, o->controller)) root = o->newroot; } snprintf(path + path_pref_len, PATH_MAX - path_pref_len, "%s", root); ret = ftw(path, add_cgroup, 4); if (ret < 0) pr_perror("failed walking %s for empty cgroups", path); close_safe(&fd); if (ret < 0) return ret; if (opts.freeze_cgroup && !strcmp(cc->name, "freezer") && add_freezer_state(current_controller)) return -1; } return 0; } int dump_thread_cgroup(const struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args, int id) { int pid, tid; LIST_HEAD(ctls); unsigned int n_ctls = 0; struct cg_set *cs; if (opts.unprivileged) return 0; 
if (item) pid = item->pid->real; else pid = getpid(); if (id < 0) tid = pid; else tid = item->threads[id].real; pr_info("Dumping cgroups for thread %d\n", tid); if (parse_thread_cgroup(pid, tid, args, &ctls, &n_ctls)) return -1; cs = get_cg_set(&ctls, n_ctls, item); if (!cs) return -1; if (!item) { BUG_ON(criu_cgset); criu_cgset = cs; pr_info("Set %d is criu one\n", cs->id); } else { if (item == root_item) { if (!root_cgset) { root_cgset = cs; pr_info("Set %d is root one\n", cs->id); } } else { struct cg_ctl *root, *stray; BUG_ON(!root_cgset); pr_info("Set %d is a stray\n", cs->id); /* Copy the cgns prefix from the root cgset for each * controller. This is ok because we know that there is * only one cgroup namespace. */ list_for_each_entry(root, &root_cgset->ctls, l) { list_for_each_entry(stray, &cs->ctls, l) { if (strcmp(root->name, stray->name)) continue; if (strlen(stray->path) < root->cgns_prefix) { pr_err("cg %s shorter than path prefix %d?\n", stray->path, root->cgns_prefix); return -1; } stray->cgns_prefix = root->cgns_prefix; } } } } *cg_id = cs->id; return 0; } static int dump_cg_dir_props(struct list_head *props, size_t n_props, CgroupPropEntry ***ents) { struct cgroup_prop *prop_cur; CgroupPropEntry *cpe; void *m; int i = 0; m = xmalloc(n_props * (sizeof(CgroupPropEntry *) + sizeof(CgroupPropEntry))); *ents = m; if (!m) return -1; cpe = m + n_props * sizeof(CgroupPropEntry *); list_for_each_entry(prop_cur, props, list) { cgroup_prop_entry__init(cpe); cpe->perms = xmalloc(sizeof(*cpe->perms)); if (!cpe->perms) goto error; cgroup_perms__init(cpe->perms); cpe->name = xstrdup(prop_cur->name); cpe->value = xstrdup(prop_cur->value); if (!cpe->name || !cpe->value) goto error; cpe->perms->mode = prop_cur->mode; cpe->perms->uid = prop_cur->uid; cpe->perms->gid = prop_cur->gid; (*ents)[i++] = cpe++; } return 0; error: while (i >= 0) { xfree(cpe->name); xfree(cpe->value); --cpe; --i; } xfree(*ents); return -1; } static int dump_cg_dirs(struct list_head *dirs, 
/*
 * Serialize all collected cg_sets (except criu's own one) into the
 * CgroupEntry image.  Each xmalloc below is a single fused allocation:
 * an array of entry pointers followed by the entries themselves, with
 * the entry cursor (se/ce) pointing just past the pointer array.
 *
 * NOTE(review): se/ce are computed from m before the !m check; no
 * dereference happens when m is NULL, but the arithmetic itself relies
 * on the check bailing out first.
 *
 * Returns 0 on success, -1 on allocation failure (partially built sets
 * are released via free_sets()).
 */
static int dump_sets(CgroupEntry *cg)
{
	struct cg_set *set;
	struct cg_ctl *ctl;
	unsigned s, c;
	void *m;
	CgSetEntry *se;
	CgMemberEntry *ce;

	/* n_sets counts criu's own set too; it is not dumped. */
	pr_info("Dumping %d sets\n", n_sets - 1);

	cg->n_sets = n_sets - 1;
	m = xmalloc(cg->n_sets * (sizeof(CgSetEntry *) + sizeof(CgSetEntry)));
	cg->sets = m;
	se = m + cg->n_sets * sizeof(CgSetEntry *);
	if (!m)
		return -1;

	s = 0;
	list_for_each_entry(set, &cg_sets, l) {
		/* Skip the set criu itself lives in. */
		if (set == criu_cgset)
			continue;

		/*
		 * Now encode them onto the image entry
		 */

		cg_set_entry__init(se);
		se->id = set->id;

		se->n_ctls = set->n_ctls;
		m = xmalloc(se->n_ctls * (sizeof(CgMemberEntry *) + sizeof(CgMemberEntry)));
		se->ctls = m;
		ce = m + se->n_ctls * sizeof(CgMemberEntry *);
		if (!m) {
			/* Release the ctls arrays of the sets built so far. */
			free_sets(cg, s);
			return -1;
		}

		c = 0;
		list_for_each_entry(ctl, &set->ctls, l) {
			pr_info(" `- Dumping %s of %s\n", ctl->name, ctl->path);
			cg_member_entry__init(ce);
			/* Strings are borrowed from the in-memory cg_ctl, not copied. */
			ce->name = ctl->name;
			ce->path = ctl->path;
			if (ctl->cgns_prefix > 0) {
				ce->has_cgns_prefix = true;
				ce->cgns_prefix = ctl->cgns_prefix;
			}
			se->ctls[c++] = ce++;
		}

		cg->sets[s++] = se++;
	}
	return 0;
}
Nothing to dump.\n"); return 0; } if (dump_sets(&cg)) return -1; if (dump_controllers(&cg)) { goto err; } pr_info("Writing CG image\n"); ret = pb_write_one(img_from_set(glob_imgset, CR_FD_CGROUP), &cg, PB_CGROUP); err: free_sets(&cg, cg.n_sets); xfree(cg.controllers); return ret; } static int ctrl_dir_and_opt(CgControllerEntry *ctl, char *dir, int ds, char *opt, int os) { int i, doff = 0, ooff = 0; bool none_opt = false; for (i = 0; i < ctl->n_cnames; i++) { char *n; n = ctl->cnames[i]; if (strstartswith(n, "name=")) { n += 5; if (opt && !none_opt) { ooff += snprintf(opt + ooff, os - ooff, "none,"); none_opt = true; } } if (n[0] == 0) doff += snprintf(dir + doff, ds - doff, "unified,"); else doff += snprintf(dir + doff, ds - doff, "%s,", n); if (opt) ooff += snprintf(opt + ooff, os - ooff, "%s,", ctl->cnames[i]); } /* Chop the trailing ','-s */ dir[--doff] = '\0'; if (opt) opt[ooff - 1] = '\0'; return doff; } /* Some properties cannot be restored after the cgroup has children or tasks in * it. We restore these properties as soon as the cgroup is created. 
*/ static const char *special_props[] = { "cpuset.cpus", "cpuset.mems", "devices.list", "memory.kmem.limit_in_bytes", "memory.swappiness", "memory.oom_control", "memory.use_hierarchy", "cgroup.type", NULL, }; bool is_special_property(const char *prop) { size_t i = 0; for (i = 0; special_props[i]; i++) if (strcmp(prop, special_props[i]) == 0) return true; return false; } static int userns_move(void *arg, int fd, pid_t pid) { char pidbuf[32]; int cg, len, err; len = snprintf(pidbuf, sizeof(pidbuf), "%d", pid); if (len >= sizeof(pidbuf)) { pr_err("pid printing failed: %d\n", pid); return -1; } cg = get_service_fd(CGROUP_YARD); err = fd = openat(cg, arg, O_WRONLY); if (fd >= 0) { err = write(fd, pidbuf, len); close(fd); } if (err < 0) { pr_perror("Can't move %s into %s (%d/%d)", pidbuf, (char *)arg, err, fd); return -1; } return 0; } static int prepare_cgns(CgSetEntry *se) { int i; bool do_unshare = false; for (i = 0; i < se->n_ctls; i++) { char aux[PATH_MAX]; int j, aux_off; CgMemberEntry *ce = se->ctls[i]; CgControllerEntry *ctrl = NULL; for (j = 0; j < n_controllers; j++) { CgControllerEntry *cur = controllers[j]; if (cgroup_contains(cur->cnames, cur->n_cnames, ce->name, NULL)) { ctrl = cur; break; } } if (!ctrl) { pr_err("No cg_controller_entry found for %s/%s\n", ce->name, ce->path); return -1; } aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0); /* We need to do an unshare() here as unshare() pins the root * of the cgroup namespace to whatever the current cgroups are. * For example, consider a task in a cgroup (according to the * host): * * /unsprefix/insidecontainer * * If the task first moved itself into /unsprefix, then did unshare(), * when the task examines its own /proc/self/cgroup file it will see /, * but to the host it is really in /unsprefix. Then if it further enters * /insidecontainer here, the full host path will be * /unsprefix/insidecontianer. 
/*
 * Attach the calling task to every cgroup recorded in set entry @se by
 * writing its pid into the matching <yard>/<controller>/<path>/cgroup.procs
 * file (done in the user namespace via userns_call/userns_move).
 *
 * Returns 0 on success, -1 if a controller is missing from the image or
 * the move itself fails.
 */
static int move_in_cgroup(CgSetEntry *se)
{
	int i;

	pr_info("Move into %d\n", se->id);
	for (i = 0; i < se->n_ctls; i++) {
		char aux[PATH_MAX];
		int fd = -1, err, j, aux_off;
		CgMemberEntry *ce = se->ctls[i];
		CgControllerEntry *ctrl = NULL;

		/* Find the controller entry this member belongs to. */
		for (j = 0; j < n_controllers; j++) {
			CgControllerEntry *cur = controllers[j];
			if (cgroup_contains(cur->cnames, cur->n_cnames, ce->name, NULL)) {
				ctrl = cur;
				break;
			}
		}

		if (!ctrl) {
			pr_err("No cg_controller_entry found for %s/%s\n", ce->name, ce->path);
			return -1;
		}

		aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0);

		/* Note that unshare(CLONE_NEWCGROUP) doesn't change the view
		 * of previously mounted cgroupfses; since we're restoring via
		 * a dirfd pointing to the cg yard set up by when criu was in
		 * the root cgns, we still want to use the full path here when
		 * we move into the cgroup.
		 */
		snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/cgroup.procs", ce->path);
		pr_debug(" `-> %s\n", aux);
		err = userns_call(userns_move, 0, aux, strlen(aux) + 1, -1);
		if (err < 0) {
			/* fd is always -1 here; printed only for diagnostics. */
			pr_perror("Can't move into %s (%d/%d)", aux, err, fd);
			return -1;
		}
	}

	return 0;
}
pr_perror("Unable to remove %s", cg_yard); } xfree(cg_yard); cg_yard = NULL; } static int add_subtree_control_prop_prefix(char *input, char *output, char prefix) { char *current, *next; size_t len, off = 0; current = input; do { next = strchrnul(current, ' '); len = next - current; output[off] = prefix; off++; memcpy(output + off, current, len); off += len; output[off] = ' '; off++; current = next + 1; } while (*next != '\0'); return off; } static int restore_cgroup_subtree_control(const CgroupPropEntry *cg_prop_entry_p, int fd) { char buf[1024]; char line[1024]; int ret, off = 0; ret = read(fd, buf, sizeof(buf) - 1); if (ret < 0) { pr_perror("read from cgroup.subtree_control"); return ret; } /* Remove the trailing newline */ buf[ret] = '\0'; /* Remove all current subsys in subtree_control */ if (buf[0] != '\0') off = add_subtree_control_prop_prefix(buf, line, '-'); /* Add subsys need to be restored in subtree_control */ if (cg_prop_entry_p->value[0] != '\0') off += add_subtree_control_prop_prefix(cg_prop_entry_p->value, line + off, '+'); /* Remove the trailing space */ if (off != 0) { off--; line[off] = '\0'; } if (write(fd, line, off) != off) { pr_perror("write to cgroup.subtree_control"); return -1; } return 0; } /* * Note: The path string can be modified in this function, * the length of path string should be at least PATH_MAX. 
/*
 * Write one saved cgroup property back through the cgroup yard.
 *
 * @path is scratch (at least PATH_MAX): the property name is appended at
 * @off and the buffer is modified in place.  @split_lines issues one
 * write(2) per line (the kernel only parses the first line of a write);
 * @skip_fails downgrades individual write failures to warnings.
 *
 * Some properties get special treatment: cgroup.procs/tasks are skipped,
 * cgroup.subtree_control is diffed against the current value, and
 * cgroup.type is only written when it is "threaded".
 *
 * Returns 0 on success/skip, -1 on error.
 */
static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *path, int off, bool split_lines, bool skip_fails)
{
	int cg, fd, exit_code = -1, flag;
	CgroupPerms *perms = cg_prop_entry_p->perms;
	int is_subtree_control = !strcmp(cg_prop_entry_p->name, "cgroup.subtree_control");

	if (opts.manage_cgroups == CG_MODE_IGNORE)
		return 0;

	if (!cg_prop_entry_p->value) {
		pr_err("cg_prop_entry->value was empty when should have had a value\n");
		return -1;
	}

	if (snprintf(path + off, PATH_MAX - off, "/%s", cg_prop_entry_p->name) >= PATH_MAX) {
		pr_err("snprintf output was truncated for %s\n", cg_prop_entry_p->name);
		return -1;
	}

	pr_info("Restoring cgroup property value [%s] to [%s]\n", cg_prop_entry_p->value, path);

	/* subtree_control is read first to compute the diff, hence O_RDWR. */
	if (is_subtree_control)
		flag = O_RDWR;
	else
		flag = O_WRONLY;

	cg = get_service_fd(CGROUP_YARD);
	fd = openat(cg, path, flag);
	if (fd < 0) {
		pr_perror("bad cgroup path: %s", path);
		return -1;
	}

	/* Restore the file's ownership/mode before writing its value. */
	if (perms && cr_fchperm(fd, perms->uid, perms->gid, perms->mode) < 0)
		goto out;

	/* skip these two since restoring their values doesn't make sense */
	if (!strcmp(cg_prop_entry_p->name, "cgroup.procs") || !strcmp(cg_prop_entry_p->name, "tasks")) {
		exit_code = 0;
		goto out;
	}

	if (is_subtree_control) {
		exit_code = restore_cgroup_subtree_control(cg_prop_entry_p, fd);
		goto out;
	}

	/* skip restoring cgroup.type if its value is not "threaded" */
	if (!strcmp(cg_prop_entry_p->name, "cgroup.type") && strcmp(cg_prop_entry_p->value, "threaded")) {
		exit_code = 0;
		goto out;
	}

	if (split_lines) {
		char *line = cg_prop_entry_p->value;
		char *next_line;
		size_t len;

		/* One write(2) per line; the kernel ignores anything past '\n'. */
		do {
			next_line = strchrnul(line, '\n');
			len = next_line - line;

			if (write(fd, line, len) != len) {
				pr_perror("Failed writing %s to %s", line, path);
				if (!skip_fails)
					goto out;
			}
			line = next_line + 1;
		} while (*next_line != '\0');
	} else {
		size_t len = strlen(cg_prop_entry_p->value);
		int ret;

		ret = write(fd, cg_prop_entry_p->value, len);
		/* memory.kmem.limit_in_bytes has been deprecated. Look at
		 * 58056f77502f3 ("memcg, kmem: further deprecate
		 * kmem.limit_in_bytes") for more details.
		 */
		if (ret == -1 && errno == EOPNOTSUPP && !strcmp(cg_prop_entry_p->name, "memory.kmem.limit_in_bytes"))
			ret = len;
		if (ret != len) {
			pr_perror("Failed writing %s to %s", cg_prop_entry_p->value, path);
			if (!skip_fails)
				goto out;
		}
	}

	exit_code = 0;
out:
	if (close(fd) != 0)
		pr_perror("Failed closing %s", path);
	return exit_code;
}
*/ static int filter_ifpriomap(char *out, char *line) { char *next_line, *space; bool written = false; size_t len; if (*line == '\0') return 0; do { next_line = strchrnul(line, '\n'); len = next_line - line; space = strchr(line, ' '); if (!space) { pr_err("Invalid value for ifpriomap: `%s'\n", line); return -1; } if (!strtol(space, NULL, 10)) goto next; /* Copying with last \n or \0 */ strncpy(out, line, len + 1); out += len + 1; written = true; next: line = next_line + 1; } while (*next_line != '\0'); if (written) *(out - 1) = '\0'; return 0; } static int restore_cgroup_ifpriomap(CgroupPropEntry *cpe, char *path, int off) { CgroupPropEntry priomap = *cpe; int ret = -1; priomap.value = xmalloc(strlen(cpe->value) + 1); priomap.value[0] = '\0'; if (filter_ifpriomap(priomap.value, cpe->value)) goto out; if (strlen(priomap.value)) ret = restore_cgroup_prop(&priomap, path, off, true, true); else ret = 0; out: xfree(priomap.value); return ret; } static int prepare_cgroup_dir_properties(char *path, int off, CgroupDirEntry **ents, unsigned int n_ents) { unsigned int i, j; for (i = 0; i < n_ents; i++) { CgroupDirEntry *e = ents[i]; size_t off2 = off; if (strcmp(e->dir_name, "") == 0) goto skip; /* skip root cgroups */ off2 += sprintf(path + off, "/%s", e->dir_name); for (j = 0; j < e->n_properties; ++j) { CgroupPropEntry *p = e->properties[j]; if (!strcmp(p->name, "freezer.state")) { add_freezer_state_for_restore(p, path, off2); continue; /* skip restore now */ } /* Skip restoring special cpuset props now. * They were restored earlier, and can cause * the restore to fail if some other task has * entered the cgroup. */ if (is_special_property(p->name)) continue; /* * The kernel can't handle it in one write() * Number of network interfaces on host may differ. 
*/ if (strcmp(p->name, "net_prio.ifpriomap") == 0) { if (restore_cgroup_ifpriomap(p, path, off2)) return -1; continue; } if (restore_cgroup_prop(p, path, off2, false, false) < 0) return -1; } skip: if (prepare_cgroup_dir_properties(path, off2, e->children, e->n_children) < 0) return -1; } return 0; } int prepare_cgroup_properties(void) { char cname_path[PATH_MAX]; unsigned int i, off; for (i = 0; i < n_controllers; i++) { CgControllerEntry *c = controllers[i]; if (c->n_cnames < 1) { pr_err("Each CgControllerEntry should have at least 1 cname\n"); return -1; } off = ctrl_dir_and_opt(c, cname_path, sizeof(cname_path), NULL, 0); if (prepare_cgroup_dir_properties(cname_path, off, c->dirs, c->n_dirs) < 0) return -1; } return 0; } /* * The devices cgroup must be restored in a special way: * only the contents of devices.list can be read, and it is a whitelist * of all the devices the cgroup is allowed to create. To re-create * this whitelist, we firstly deny everything via devices.deny, * and then write the list back into devices.allow. * * Further, we must have a write() call for each line, because the kernel * only parses the first line of any write(). */ static int restore_devices_list(char *paux, size_t off, CgroupPropEntry *pr) { CgroupPropEntry dev_allow = *pr; CgroupPropEntry dev_deny = *pr; int ret; dev_allow.name = "devices.allow"; dev_deny.name = "devices.deny"; dev_deny.value = "a"; ret = restore_cgroup_prop(&dev_deny, paux, off, false, false); /* * An empty string here means nothing is allowed, * and the kernel disallows writing an "" to devices.allow, * so let's just keep going. */ if (!strcmp(dev_allow.value, "")) return 0; if (ret < 0) return -1; return restore_cgroup_prop(&dev_allow, paux, off, true, false); } static int restore_special_property(char *paux, size_t off, CgroupPropEntry *pr) { /* * XXX: we can drop this hack and make memory.swappiness and * memory.oom_control regular properties when we drop support for * kernels < 3.16. See 3dae7fec5. 
/*
 * Recursively recreate the dumped cgroup directory tree under the yard.
 *
 * For each entry: if the directory is missing it is created (unless the
 * restore mode forbids that), its perms are applied and its special
 * properties are written immediately; if it already exists, behavior
 * depends on opts.manage_cgroups (STRICT aborts, SOFT/NONE drop the
 * saved properties so they are not re-applied later).
 *
 * @paux is a scratch path buffer; @off is the current write offset into
 * it.  Returns 0 on success, -1 on error.
 */
static int prepare_cgroup_dirs(char **controllers, int n_controllers, char *paux, size_t off, CgroupDirEntry **ents, size_t n_ents)
{
	size_t i, j;
	CgroupDirEntry *e;
	int cg = get_service_fd(CGROUP_YARD);

	for (i = 0; i < n_ents; i++) {
		size_t off2 = off;
		e = ents[i];

		off2 += sprintf(paux + off, "/%s", e->dir_name);

		if (faccessat(cg, paux, F_OK, 0) < 0) {
			if (errno != ENOENT) {
				pr_perror("Failed accessing cgroup dir %s", paux);
				return -1;
			}

			/* These modes require the dirs to pre-exist. */
			if (opts.manage_cgroups & (CG_MODE_NONE | CG_MODE_PROPS)) {
				pr_err("Cgroup dir %s doesn't exist\n", paux);
				return -1;
			}

			if (mkdirpat(cg, paux, 0755)) {
				pr_perror("Can't make cgroup dir %s", paux);
				return -1;
			}
			pr_info("Created cgroup dir %s\n", paux);

			if (prepare_dir_perms(cg, paux, e->dir_perms) < 0)
				return -1;

			/*
			 * NOTE(review): this loop invokes restore_special_props()
			 * with identical arguments n_controllers times (j is
			 * unused in the body) — the repeats look redundant;
			 * confirm against upstream history before simplifying.
			 */
			for (j = 0; j < n_controllers; j++) {
				if (restore_special_props(paux, off2, e) < 0) {
					pr_err("Restoring special cpuset props failed!\n");
					return -1;
				}
			}
		} else {
			pr_info("Determined cgroup dir %s already exist\n", paux);

			if (opts.manage_cgroups & CG_MODE_STRICT) {
				pr_err("Abort restore of existing cgroups\n");
				return -1;
			}

			if (opts.manage_cgroups & (CG_MODE_SOFT | CG_MODE_NONE)) {
				pr_info("Skip restoring properties on cgroup dir %s\n", paux);
				/* Drop saved props so later passes won't write them. */
				if (e->n_properties > 0) {
					xfree(e->properties);
					e->properties = NULL;
					e->n_properties = 0;
				}
			}

			if (!(opts.manage_cgroups & CG_MODE_NONE) && prepare_dir_perms(cg, paux, e->dir_perms) < 0)
				return -1;
		}

		/* Recurse into child cgroups with the extended path. */
		if (prepare_cgroup_dirs(controllers, n_controllers, paux, off2, e->children, e->n_children) < 0)
			return -1;
	}

	return 0;
}
*/ static int prepare_cgroup_sfd(CgroupEntry *ce) { int off, i, ret; char paux[PATH_MAX]; if (!opts.manage_cgroups) return 0; pr_info("Preparing cgroups yard (cgroups restore mode %#x)\n", opts.manage_cgroups); if (opts.cgroup_yard) { off = sprintf(paux, "%s", opts.cgroup_yard); cg_yard = xstrdup(paux); if (!cg_yard) return -1; } else { off = sprintf(paux, ".criu.cgyard.XXXXXX"); if (mkdtemp(paux) == NULL) { pr_perror("Can't make temp cgyard dir"); return -1; } cg_yard = xstrdup(paux); if (!cg_yard) { rmdir(paux); return -1; } if (make_yard(cg_yard)) return -1; } pr_debug("Opening %s as cg yard\n", cg_yard); i = open(cg_yard, O_DIRECTORY); if (i < 0) { pr_perror("Can't open cgyard"); return -1; } ret = install_service_fd(CGROUP_YARD, i); if (ret < 0) return -1; paux[off++] = '/'; for (i = 0; i < ce->n_controllers; i++) { int ctl_off = off, yard_off; char opt[128], *yard; CgControllerEntry *ctrl = ce->controllers[i]; if (ctrl->n_cnames < 1) { pr_err("Each cg_controller_entry must have at least 1 controller\n"); return -1; } ctl_off += ctrl_dir_and_opt(ctrl, paux + ctl_off, sizeof(paux) - ctl_off, opt, sizeof(opt)); /* Create controller if not yet present */ if (access(paux, F_OK)) { char *fstype = "cgroup"; if (ctrl->cnames[0][0] == 0) fstype = "cgroup2"; pr_debug("\tMaking controller dir %s (%s)\n", paux, opt); if (mkdir(paux, 0700)) { pr_perror("\tCan't make controller dir %s", paux); return -1; } if (mount("none", paux, fstype, 0, opt) < 0) { pr_perror("\tCan't mount controller dir %s", paux); return -1; } } /* * Finally handle all cgroups for this controller. */ yard = paux + strlen(cg_yard) + 1; yard_off = ctl_off - (strlen(cg_yard) + 1); if (opts.manage_cgroups && prepare_cgroup_dirs(ctrl->cnames, ctrl->n_cnames, yard, yard_off, ctrl->dirs, ctrl->n_dirs)) return -1; } return 0; } /* * If a thread is a different cgroup set than the main thread in process, * it means it is in a threaded controller. 
/*
 * Daemon loop run in a forked child (see prepare_cgroup_thread_sfd()).
 * Each request on @sk carries a restored thread's credentials plus its
 * cg_set id; the daemon moves that thread into every *threaded*
 * controller of the set by writing its tid to cgroup.threads, then
 * echoes the tid back so the thread knows its request was handled.
 *
 * Runs until the socket errors out; returns -1 on any failure.
 * (The "cgroud" spelling in the banner is a long-standing typo in the
 * log string; left as-is since it is runtime output.)
 */
static int cgroupd(int sk)
{
	pr_info("cgroud: Daemon started\n");

	while (1) {
		struct unsc_msg um;
		uns_call_t call;
		pid_t tid;
		int fd, cg_set, i;
		CgSetEntry *cg_set_entry;
		int ret;

		unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, NULL);
		ret = recvmsg(sk, &um.h, 0);
		if (ret <= 0) {
			pr_perror("cgroupd: recv req error");
			return -1;
		}

		/* Sender's tid arrives via SCM credentials. */
		unsc_msg_pid_fd(&um, &tid, &fd);
		pr_debug("cgroupd: move process %d into cg_set %d\n", tid, cg_set);

		cg_set_entry = find_rst_set_by_id(cg_set);
		if (!cg_set_entry) {
			pr_err("cgroupd: No set found %d\n", cg_set);
			return -1;
		}

		for (i = 0; i < cg_set_entry->n_ctls; i++) {
			int j, aux_off;
			CgMemberEntry *ce = cg_set_entry->ctls[i];
			char aux[PATH_MAX];
			CgControllerEntry *ctrl = NULL;

			for (j = 0; j < n_controllers; j++) {
				CgControllerEntry *cur = controllers[j];
				if (cgroup_contains(cur->cnames, cur->n_cnames, ce->name, NULL)) {
					ctrl = cur;
					break;
				}
			}

			if (!ctrl) {
				pr_err("cgroupd: No cg_controller_entry found for %s/%s\n", ce->name, ce->path);
				return -1;
			}

			/*
			 * This is not a threaded controller, all threads in this
			 * process must be in this controller. Main thread has been
			 * restored, so this thread is in this controller already.
			 */
			if (!ctrl->has_is_threaded || !ctrl->is_threaded)
				continue;

			aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0);
			snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/cgroup.threads", ce->path);

			/*
			 * Cgroupd runs outside of the namespaces so we don't
			 * need to use userns_call here
			 */
			if (userns_move(aux, 0, tid)) {
				pr_err("cgroupd: Can't move thread %d into %s/%s\n", tid, ce->name, ce->path);
				return -1;
			}
		}

		/*
		 * We only want to send the cred which contains thread id back.
		 * The restored thread recvmsg(MSG_PEEK) until it gets its own
		 * thread id.
		 */
		unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, &tid);
		if (sendmsg(sk, &um.h, 0) <= 0) {
			pr_perror("cgroupd: send req error");
			return -1;
		}
	}

	return 0;
}
*/ if (!(cgroup_contains(controllers, n_controllers, cg->name, NULL) && strstartswith(cg->path + 1, dir))) continue; if (cg->has_cgns_prefix && cg->cgns_prefix) { char *prev = cg->path; cg->path = xsprintf("%s%s", newroot, cg->path + cg->cgns_prefix); if (!cg->path) { cg->path = prev; xfree(dirnew); return -ENOMEM; } xfree(prev); if (!dirnew) { /* -1 because cgns_prefix includes leading "/" */ dirnew = xsprintf("%s%s", newroot, dir + cg->cgns_prefix - 1); if (!dirnew) return -ENOMEM; } cg->cgns_prefix = strlen(newroot); } else { char *prev = cg->path; /* * If no prefix present simply rename the * root but make sure the rest of path is * untouched. */ cg->path = xsprintf("%s%s", newroot, cg->path + dirlen + 1); if (!cg->path) { cg->path = prev; xfree(dirnew); return -ENOMEM; } xfree(prev); if (!dirnew) { dirnew = xstrdup(newroot); if (!dirnew) return -ENOMEM; } } } } if (dirnew) { xfree(dir); *dir_name = dirnew; } return 0; } static int rewrite_cgroup_roots(CgroupEntry *cge) { int i, j; struct cg_root_opt *o; for (i = 0; i < cge->n_controllers; i++) { CgControllerEntry *ctrl = cge->controllers[i]; u64 ctrl_mask = (1ULL << ctrl->n_cnames) - 1; char *newroot = NULL; list_for_each_entry(o, &opts.new_cgroup_roots, node) { unsigned old_mask = ctrl_mask; /* coverity[check_return] */ cgroup_contains(ctrl->cnames, ctrl->n_cnames, o->controller, &ctrl_mask); if (old_mask != ctrl_mask) { if (newroot && strcmp(newroot, o->newroot)) { pr_err("CG paths mismatch: %s %s\n", newroot, o->newroot); return -1; } newroot = o->newroot; } if (!ctrl_mask) break; } if (!newroot) newroot = opts.new_global_cg_root; if (newroot) { for (j = 0; j < ctrl->n_dirs; j++) { CgroupDirEntry *cgde = ctrl->dirs[j]; pr_info("rewriting %s to %s\n", cgde->dir_name, newroot); if (rewrite_cgsets(cge, ctrl->cnames, ctrl->n_cnames, &cgde->dir_name, newroot)) return -1; } } } return 0; } int prepare_cgroup(void) { int ret; struct cr_img *img; CgroupEntry *ce; img = open_image(CR_FD_CGROUP, O_RSTR); if (!img) 
return -1; ret = pb_read_one_eof(img, &ce, PB_CGROUP); close_image(img); if (ret <= 0) /* Zero is OK -- no sets there. */ return ret; if (rewrite_cgroup_roots(ce)) return -1; n_sets = ce->n_sets; rst_sets = ce->sets; n_controllers = ce->n_controllers; controllers = ce->controllers; if (n_sets) { /* * We rely on the fact that all sets contain the same * set of controllers. This is checked during dump * with cg_set_compare(CGCMP_ISSUB) call. */ ret = prepare_cgroup_sfd(ce); if (ret < 0) return ret; ret = prepare_cgroup_thread_sfd(); } else { ret = 0; } return ret; } int new_cg_root_add(char *controller, char *newroot) { struct cg_root_opt *o; if (!controller) { SET_CHAR_OPTS(new_global_cg_root, newroot); return 0; } o = xmalloc(sizeof(*o)); if (!o) return -1; o->controller = xstrdup(controller); if (!o->controller) goto err_ctrl; o->newroot = xstrdup(newroot); if (!o->newroot) goto err_newroot; list_add(&o->node, &opts.new_cgroup_roots); return 0; err_newroot: xfree(o->controller); err_ctrl: xfree(o); return -1; } struct ns_desc cgroup_ns_desc = NS_DESC_ENTRY(CLONE_NEWCGROUP, "cgroup"); crac-criu-1.5.0/criu/clone-noasan.c000066400000000000000000000046511471504326700170670ustar00rootroot00000000000000#include #include #include #include #include "sched.h" #include "common/compiler.h" #include "log.h" #include "common/bug.h" /* * ASan doesn't play nicely with clone if we use current stack for * child task. ASan puts local variables on the fake stack * to catch use-after-return bug: * https://github.com/google/sanitizers/wiki/AddressSanitizerUseAfterReturn#algorithm * * So it's become easy to overflow this fake stack frame in cloned child. * We need a real stack for clone(). * * To workaround this we add clone_noasan() not-instrumented wrapper for * clone(). 
Unfortunately we can't use __attribute__((no_sanitize_address)) * for this because of bug in GCC > 6: * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69863 * * So the only way is to put this wrapper in separate non-instrumented file * * WARNING: When calling clone_noasan make sure your not sitting in a later * __restore__ phase where other tasks might be creating threads, otherwise * all calls to clone_noasan should be guarder with * * lock_last_pid * clone_noasan * ... wait for process to finish ... * unlock_last_pid */ int clone_noasan(int (*fn)(void *), int flags, void *arg) { void *stack_ptr = (void *)round_down((unsigned long)&stack_ptr - 1024, 16); BUG_ON((flags & CLONE_VM) && !(flags & CLONE_VFORK)); /* * Reserve some bytes for clone() internal needs * and use as stack the address above this area. */ return clone(fn, stack_ptr, flags, arg); } int clone3_with_pid_noasan(int (*fn)(void *), void *arg, int flags, int exit_signal, pid_t pid) { struct _clone_args c_args = {}; BUG_ON(flags & CLONE_VM); /* * Make sure no child signals are requested. clone3() uses * exit_signal for that. */ BUG_ON(flags & 0xff); pr_debug("Creating process using clone3()\n"); /* * clone3() explicitly blocks setting an exit_signal * if CLONE_PARENT is specified. With clone() it also * did not work, but there was no error message. The * exit signal from the thread group leader is taken. 
*/ if (!(flags & CLONE_PARENT)) { if (exit_signal != SIGCHLD) { pr_err("Exit signal not SIGCHLD\n"); errno = EINVAL; return -1; } c_args.exit_signal = exit_signal; } c_args.flags = flags; c_args.set_tid = ptr_to_u64(&pid); c_args.set_tid_size = 1; pid = syscall(__NR_clone3, &c_args, sizeof(c_args)); if (pid == 0) exit(fn(arg)); return pid; } crac-criu-1.5.0/criu/config.c000066400000000000000000000715741471504326700157670ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "log.h" #include "common/list.h" #include "action-scripts.h" #include "cgroup.h" #include "cgroup-props.h" #include "common/bug.h" #include "cpu.h" #include "crtools.h" #include "cr_options.h" #include "filesystems.h" #include "file-lock.h" #include "irmap.h" #include "mount.h" #include "mount-v2.h" #include "namespaces.h" #include "net.h" #include "sk-inet.h" #include "sockets.h" #include "tty.h" #include "version.h" #include "common/xmalloc.h" struct cr_options opts; char *rpc_cfg_file; static int count_elements(char **to_count) { int count = 0; if (to_count != NULL) while (to_count[count] != NULL) count++; return count; } /* Parse one statement in configuration file */ int parse_statement(int i, char *line, char **configuration) { cleanup_free char *input = NULL; int offset = 0, len = 0; char *tmp_string; /* * A line from the configuration file can be: * - empty * - a boolean option (tcp-close) * - an option with one parameter (verbosity 4) * - a parameter can be in quotes (lsm-profile "selinux:something") * - a parameter can contain escaped quotes * * Whenever a '#' is found we ignore everything after as a comment. * * This function adds none, one (boolean option) or two entries * in **configuration and returns i + (the number of entries). 
*/ if (strlen(line) == 0) return i; /* Ignore leading white-space */ while ((isspace(*(line + offset)) && (*(line + offset) != '\n'))) offset++; /* Ignore empty line */ if (line[offset] == '\n') return i; /* Ignore line starting with a comment */ if (line[offset] == '#') return i; input = xstrdup(line + offset); if (unlikely(!input)) return -1; offset = 0; /* Remove trailing '\n' */ if ((tmp_string = strchr(input, '\n'))) tmp_string[0] = 0; if ((tmp_string = strchr(input, ' ')) || (tmp_string = strchr(input, '\t'))) { configuration[i] = xzalloc(tmp_string - input + strlen("--") + 1); if (unlikely(!configuration[i])) return -1; memcpy(configuration[i], "--", strlen("--")); memcpy(configuration[i] + strlen("--"), input, tmp_string - input); configuration[i][tmp_string - input + strlen("--")] = 0; /* Go to the next character */ offset += tmp_string - input + 1; i++; } else { if (unlikely(asprintf(&configuration[i], "--%s", input) == -1)) return -1; return i + 1; } while ((isspace(*(input + offset)))) offset++; /* Check if the next token is a comment */ if (input[offset] == '#') return i; if (input[offset] == '"') { bool found_second_quote = false; char *quote_start; int quote_offset; /* Move by one to skip the leading quote. */ offset++; quote_start = input + offset; quote_offset = offset; if (input[offset] == 0) { /* The value for the parameter was a single quote, this is not supported. */ xfree(configuration[i - 1]); pr_err("Unsupported configuration file format. Please consult man page criu(8)\n"); return -1; } if (input[offset] == '"') { /* We got "" as value */ configuration[i] = xstrdup(""); if (unlikely(!configuration[i])) { xfree(configuration[i - 1]); return -1; } offset = 0; goto out; } /* * If it starts with a quote everything until the * next unescaped quote needs to be looked at. 
*/ while ((tmp_string = strchr(input + quote_offset + 1, '"'))) { quote_offset = tmp_string - input; /* Check if it is escaped */ if (*(tmp_string - 1) == '\\') continue; /* Not escaped. That is the end of the quoted string. */ found_second_quote = true; configuration[i] = xzalloc(quote_offset - offset + 1); if (unlikely(!configuration[i])) { xfree(configuration[i - 1]); return -1; } memcpy(configuration[i], quote_start, quote_offset - offset); configuration[i][quote_offset - offset] = 0; /* We skipped one additional quote */ offset++; /* Check for excessive parameters on the original line. */ tmp_string++; if (tmp_string != 0 && strchr(tmp_string, ' ')) { int j; len = strlen(tmp_string); for (j = 0; j < len - 1; j++) { if (tmp_string[j] == '#') break; if (!isspace(tmp_string[j])) { pr_err("Unsupported configuration file format. Please consult man page criu(8)\n"); xfree(configuration[i - 1]); xfree(configuration[i]); return -1; } } } break; } if (!found_second_quote) { pr_err("Unsupported configuration file format. Please consult man page criu(8)\n"); xfree(configuration[i - 1]); return -1; } } else { /* Does not start with a quote. */ if (unlikely(asprintf(&configuration[i], "%s", input + offset) == -1)) { xfree(configuration[i - 1]); return -1; } if ((tmp_string = strchr(input + offset, ' '))) offset = tmp_string - (input + offset); else offset = 0; } len = strlen(configuration[i]); if (strstr(configuration[i], "\\\"")) { /* We found an escaped quote. Skip the backslash. */ cleanup_free char *tmp = NULL; int skipped = 0; int start = 0; int dest = 0; int j; tmp = xzalloc(len); if (tmp == NULL) return -1; for (j = start; j < len; j++) { if (configuration[i][j] == '\\' && j + 1 < len && configuration[i][j + 1] == '"') { skipped++; continue; } tmp[dest++] = configuration[i][j]; } memcpy(configuration[i], tmp, strlen(tmp)); configuration[i][strlen(tmp)] = 0; /* Account for skipped backslashes. 
*/ offset += skipped + 1; len -= skipped; } out: /* Remove potential comments at the end */ if ((tmp_string = strstr(configuration[i], "#")) || (tmp_string = strstr(configuration[i], " #"))) tmp_string[0] = 0; /* Check for unsupported configuration file entries */ if (strchr(configuration[i] + offset, ' ')) { int j; len = strlen(configuration[i] + offset); for (j = 0; j < len - 1; j++) { if (!isspace(configuration[i][offset + j])) { pr_err("Unsupported configuration file format. Please consult man page criu(8)\n"); xfree(configuration[i - 1]); xfree(configuration[i]); return -1; } } } if ((tmp_string = strchr(configuration[i] + offset, ' '))) tmp_string[0] = 0; return i + 1; } /* Parse a configuration file */ static char **parse_config(char *filepath) { #define DEFAULT_CONFIG_SIZE 10 FILE *configfile = fopen(filepath, "r"); int config_size = DEFAULT_CONFIG_SIZE; int i = 1; size_t line_size = 0; char *line = NULL; char **configuration; if (!configfile) return NULL; pr_debug("Parsing config file %s\n", filepath); configuration = xmalloc(config_size * sizeof(char *)); if (configuration == NULL) { fclose(configfile); exit(1); } /* * Initialize first element, getopt ignores it. */ configuration[0] = "criu"; while (getline(&line, &line_size, configfile) != -1) { int spaces = 1; int j; /* * The statement parser 'parse_statement()' needs as many * elements in 'configuration' as spaces + 1, because it splits * each line at a space to return a result that can used as * input for getopt. So, let's count spaces to determine the * memory requirements. 
*/ for (j = 0; j < strlen(line); j++) if (line[j] == ' ') spaces++; /* Extend configuration buffer if necessary */ if (i + spaces >= config_size - 1) { config_size += spaces; configuration = xrealloc(configuration, config_size * sizeof(char *)); if (configuration == NULL) { fclose(configfile); exit(1); } } i = parse_statement(i, line, configuration); if (i < 0) { fclose(configfile); exit(1); } free(line); line = NULL; } /* Initialize the last element */ configuration[i] = NULL; free(line); fclose(configfile); return configuration; } static int next_config(char **argv, char ***_argv, bool no_default_config, int state, char *cfg_file) { char local_filepath[PATH_MAX + 1]; char *home_dir = NULL; char *cfg_from_env = NULL; if (state >= PARSING_LAST) return 0; switch (state) { case PARSING_GLOBAL_CONF: if (no_default_config) break; *_argv = parse_config(GLOBAL_CONFIG_DIR DEFAULT_CONFIG_FILENAME); break; case PARSING_USER_CONF: if (no_default_config) break; home_dir = getenv("HOME"); if (!home_dir) { pr_info("Unable to get $HOME directory, local configuration file will not be used.\n"); } else { snprintf(local_filepath, PATH_MAX, "%s/%s%s", home_dir, USER_CONFIG_DIR, DEFAULT_CONFIG_FILENAME); *_argv = parse_config(local_filepath); } break; case PARSING_ENV_CONF: cfg_from_env = getenv("CRIU_CONFIG_FILE"); if (!cfg_from_env) break; *_argv = parse_config(cfg_from_env); break; case PARSING_CMDLINE_CONF: if (!cfg_file) break; *_argv = parse_config(cfg_file); break; case PARSING_ARGV: *_argv = argv; break; case PARSING_RPC_CONF: if (!rpc_cfg_file) break; *_argv = parse_config(rpc_cfg_file); break; default: break; } return ++state; } static int pre_parse(int argc, char **argv, bool *usage_error, bool *no_default_config, char **cfg_file) { int i; /* * We are running before getopt(), so we need to pre-parse * the command line. 
* * Check for --help / -h on commandline before parsing, otherwise * the help message won't be displayed if there is an error in * configuration file syntax. Checks are kept in parser in case of * option being put in the configuration file itself. * * Check also whether default configfiles are forbidden to lower * number of argv iterations, but checks for help have higher priority. */ for (i = 0; i < argc; i++) { if ((!strcmp(argv[i], "--help")) || (!strcmp(argv[i], "-h"))) { *usage_error = false; return 1; } else if (!strcmp(argv[i], "--no-default-config")) { *no_default_config = true; } else if (!strcmp(argv[i], "--config")) { /* * getopt takes next string as required * argument automatically, we do the same */ *cfg_file = argv[i + 1]; *no_default_config = true; } else if (strstr(argv[i], "--config=") != NULL) { *cfg_file = argv[i] + strlen("--config="); *no_default_config = true; } } return 0; } void init_opts(void) { memset(&opts, 0, sizeof(opts)); /* Default options */ opts.final_state = TASK_DEAD; INIT_LIST_HEAD(&opts.ext_mounts); INIT_LIST_HEAD(&opts.inherit_fds); INIT_LIST_HEAD(&opts.external); INIT_LIST_HEAD(&opts.join_ns); INIT_LIST_HEAD(&opts.new_cgroup_roots); INIT_LIST_HEAD(&opts.irmap_scan_paths); opts.cpu_cap = CPU_CAP_DEFAULT; opts.manage_cgroups = CG_MODE_DEFAULT; opts.ps_socket = -1; opts.ghost_limit = DEFAULT_GHOST_LIMIT; opts.timeout = DEFAULT_TIMEOUT; opts.empty_ns = 0; opts.status_fd = -1; opts.log_level = DEFAULT_LOGLEVEL; opts.pre_dump_mode = PRE_DUMP_SPLICE; opts.file_validation_method = FILE_VALIDATION_DEFAULT; opts.network_lock_method = NETWORK_LOCK_DEFAULT; opts.mmap_page_image = true; opts.ptrace_allowed = true; opts.ghost_fiemap = FIEMAP_DEFAULT; } bool deprecated_ok(char *what) { if (opts.deprecated_ok) return true; pr_err("Deprecated functionality (%s) rejected.\n", what); pr_err("Use the --deprecated option or set CRIU_DEPRECATED environment.\n"); pr_err("For details visit https://criu.org/Deprecation\n"); return false; } static int 
parse_cpu_cap(struct cr_options *opts, const char *optarg) { bool inverse = false; #define ____cpu_set_cap(__opts, __cap, __inverse) \ do { \ if ((__inverse)) \ (__opts)->cpu_cap &= ~(__cap); \ else \ (__opts)->cpu_cap |= (__cap); \ } while (0) if (!optarg) { ____cpu_set_cap(opts, CPU_CAP_ALL, false); ____cpu_set_cap(opts, CPU_CAP_IMAGE, false); return 0; } while (*optarg) { if (optarg[0] == '^') { inverse = !inverse; optarg++; continue; } else if (optarg[0] == ',') { inverse = false; optarg++; continue; } if (!strncmp(optarg, "fpu", 3)) { ____cpu_set_cap(opts, CPU_CAP_FPU, inverse); optarg += 3; } else if (!strncmp(optarg, "all", 3)) { ____cpu_set_cap(opts, CPU_CAP_ALL, inverse); optarg += 3; } else if (!strncmp(optarg, "none", 4)) { if (inverse) opts->cpu_cap = CPU_CAP_ALL; else opts->cpu_cap = CPU_CAP_NONE; optarg += 4; } else if (!strncmp(optarg, "cpu", 3)) { ____cpu_set_cap(opts, CPU_CAP_CPU, inverse); optarg += 3; } else if (!strncmp(optarg, "ins", 3)) { ____cpu_set_cap(opts, CPU_CAP_INS, inverse); optarg += 3; } else goto Esyntax; } if (opts->cpu_cap != CPU_CAP_NONE) ____cpu_set_cap(opts, CPU_CAP_IMAGE, false); #undef ____cpu_set_cap return 0; Esyntax: pr_err("Unknown FPU mode `%s' selected\n", optarg); return -1; } static int parse_manage_cgroups(struct cr_options *opts, const char *optarg) { if (!optarg) { opts->manage_cgroups = CG_MODE_SOFT; return 0; } if (!strcmp(optarg, "none")) { opts->manage_cgroups = CG_MODE_NONE; } else if (!strcmp(optarg, "props")) { opts->manage_cgroups = CG_MODE_PROPS; } else if (!strcmp(optarg, "soft")) { opts->manage_cgroups = CG_MODE_SOFT; } else if (!strcmp(optarg, "full")) { opts->manage_cgroups = CG_MODE_FULL; } else if (!strcmp(optarg, "strict")) { opts->manage_cgroups = CG_MODE_STRICT; } else if (!strcmp(optarg, "ignore")) { opts->manage_cgroups = CG_MODE_IGNORE; } else goto Esyntax; return 0; Esyntax: pr_err("Unknown cgroups mode `%s' selected\n", optarg); return -1; } extern char *index(const char *s, int c); static 
size_t parse_size(char *optarg) { if (index(optarg, 'K')) return (size_t)KILO(atol(optarg)); else if (index(optarg, 'M')) return (size_t)MEGA(atol(optarg)); else if (index(optarg, 'G')) return (size_t)GIGA(atol(optarg)); return (size_t)atol(optarg); } static int parse_join_ns(const char *ptr) { char *aux, *ns_file, *extra_opts = NULL; cleanup_free char *ns = NULL; ns = xstrdup(ptr); if (ns == NULL) return -1; aux = strchr(ns, ':'); if (aux == NULL) return -1; *aux = '\0'; ns_file = aux + 1; aux = strchr(ns_file, ','); if (aux != NULL) { *aux = '\0'; extra_opts = aux + 1; } else { extra_opts = NULL; } if (join_ns_add(ns, ns_file, extra_opts)) return -1; return 0; } static int parse_file_validation_method(struct cr_options *opts, const char *optarg) { if (!strcmp(optarg, "filesize")) opts->file_validation_method = FILE_VALIDATION_FILE_SIZE; else if (!strcmp(optarg, "buildid")) opts->file_validation_method = FILE_VALIDATION_BUILD_ID; else goto Esyntax; return 0; Esyntax: pr_err("Unknown file validation method `%s' selected\n", optarg); return -1; } /* * parse_options() is the point where the getopt parsing happens. The CLI * parsing as well as the configuration file parsing happens here. * This used to be all part of main() and to integrate the new code flow * in main() this function (parse_options()) returns '0' if everything is * correct, '1' if something failed and '2' if the CRIU help text should * be displayed. 
*/ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, int state) { int ret; int opt = -1; int idx; bool no_default_config = false; char *cfg_file = NULL; char **_argv = NULL; int _argc = 0; bool has_network_lock_opt = false; #define BOOL_OPT(OPT_NAME, SAVE_TO) \ { OPT_NAME, no_argument, SAVE_TO, true }, \ { \ "no-" OPT_NAME, no_argument, SAVE_TO, false \ } static const char short_opts[] = "dSsRt:hD:o:v::x::Vr:jJ:lW:L:M:"; static struct option long_opts[] = { { "tree", required_argument, 0, 't' }, { "leave-stopped", no_argument, 0, 's' }, { "leave-running", no_argument, 0, 'R' }, BOOL_OPT("restore-detached", &opts.restore_detach), BOOL_OPT("restore-sibling", &opts.restore_sibling), BOOL_OPT("daemon", &opts.restore_detach), { "images-dir", required_argument, 0, 'D' }, { "work-dir", required_argument, 0, 'W' }, { "log-file", required_argument, 0, 'o' }, { "join-ns", required_argument, 0, 'J' }, { "root", required_argument, 0, 'r' }, { USK_EXT_PARAM, optional_argument, 0, 'x' }, { "help", no_argument, 0, 'h' }, BOOL_OPT(SK_EST_PARAM, &opts.tcp_established_ok), { "close", required_argument, 0, 1043 }, BOOL_OPT("log-pid", &opts.log_file_per_pid), { "version", no_argument, 0, 'V' }, BOOL_OPT("evasive-devices", &opts.evasive_devices), { "pidfile", required_argument, 0, 1046 }, { "veth-pair", required_argument, 0, 1047 }, { "action-script", required_argument, 0, 1049 }, BOOL_OPT(LREMAP_PARAM, &opts.link_remap_ok), BOOL_OPT(OPT_SHELL_JOB, &opts.shell_job), BOOL_OPT(OPT_FILE_LOCKS, &opts.handle_file_locks), BOOL_OPT("page-server", &opts.use_page_server), { "address", required_argument, 0, 1051 }, { "port", required_argument, 0, 1052 }, { "prev-images-dir", required_argument, 0, 1053 }, { "ms", no_argument, 0, 1054 }, BOOL_OPT("track-mem", &opts.track_mem), BOOL_OPT("auto-dedup", &opts.auto_dedup), { "libdir", required_argument, 0, 'L' }, { "cpu-cap", optional_argument, 0, 1057 }, BOOL_OPT("force-irmap", &opts.force_irmap), { "ext-mount-map", 
required_argument, 0, 'M' }, { "exec-cmd", no_argument, 0, 1059 }, { "manage-cgroups", optional_argument, 0, 1060 }, { "cgroup-root", required_argument, 0, 1061 }, { "inherit-fd", required_argument, 0, 1062 }, { "feature", required_argument, 0, 1063 }, { "skip-mnt", required_argument, 0, 1064 }, { "enable-fs", required_argument, 0, 1065 }, { "enable-external-sharing", no_argument, &opts.enable_external_sharing, true }, { "enable-external-masters", no_argument, &opts.enable_external_masters, true }, { "freeze-cgroup", required_argument, 0, 1068 }, { "ghost-limit", required_argument, 0, 1069 }, { "irmap-scan-path", required_argument, 0, 1070 }, { "lsm-profile", required_argument, 0, 1071 }, { "timeout", required_argument, 0, 1072 }, { "external", required_argument, 0, 1073 }, { "empty-ns", required_argument, 0, 1074 }, { "lazy-pages", no_argument, 0, 1076 }, BOOL_OPT("extra", &opts.check_extra_features), BOOL_OPT("experimental", &opts.check_experimental_features), { "all", no_argument, 0, 1079 }, { "cgroup-props", required_argument, 0, 1080 }, { "cgroup-props-file", required_argument, 0, 1081 }, { "cgroup-dump-controller", required_argument, 0, 1082 }, BOOL_OPT(SK_INFLIGHT_PARAM, &opts.tcp_skip_in_flight), BOOL_OPT("deprecated", &opts.deprecated_ok), BOOL_OPT("display-stats", &opts.display_stats), BOOL_OPT("weak-sysctls", &opts.weak_sysctls), { "status-fd", required_argument, 0, 1088 }, BOOL_OPT(SK_CLOSE_PARAM, &opts.tcp_close), { "verbosity", optional_argument, 0, 'v' }, { "ps-socket", required_argument, 0, 1091 }, BOOL_OPT("stream", &opts.stream), { "config", required_argument, 0, 1089 }, { "no-default-config", no_argument, 0, 1090 }, { "tls-cacert", required_argument, 0, 1092 }, { "tls-cacrl", required_argument, 0, 1093 }, { "tls-cert", required_argument, 0, 1094 }, { "tls-key", required_argument, 0, 1095 }, BOOL_OPT("tls", &opts.tls), BOOL_OPT("mmap-page-image", &opts.mmap_page_image), BOOL_OPT("ptrace-allowed", &opts.ptrace_allowed), { "tls-no-cn-verify", 
no_argument, &opts.tls_no_cn_verify, true }, { "cgroup-yard", required_argument, 0, 1096 }, { "pre-dump-mode", required_argument, 0, 1097 }, { "file-validation", required_argument, 0, 1098 }, BOOL_OPT("skip-file-rwx-check", &opts.skip_file_rwx_check), { "lsm-mount-context", required_argument, 0, 1099 }, { "network-lock", required_argument, 0, 1100 }, BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), { "compress", no_argument, 0, 1101 }, BOOL_OPT("unprivileged", &opts.unprivileged), BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap), {}, }; #undef BOOL_OPT if (argv && argv[0]) SET_CHAR_OPTS(argv_0, argv[0]); ret = pre_parse(argc, argv, usage_error, &no_default_config, &cfg_file); if (ret) return 2; while (1) { idx = -1; /* Only if opt is -1 we are going to the next configuration input */ if (opt == -1) { /* Do not free any memory if it points to argv */ if (state != PARSING_ARGV + 1) { int i; for (i = 1; i < _argc; i++) { free(_argv[i]); } free(_argv); } /* This needs to be reset for a new getopt() run */ _argc = 0; _argv = NULL; state = next_config(argv, &_argv, no_default_config, state, cfg_file); /* if next_config() returns 0 it means no more configs found */ if (state == 0) break; if (!_argv) continue; _argc = count_elements(_argv); optind = 0; } opt = getopt_long(_argc, _argv, short_opts, long_opts, &idx); /* * The end of the current _argv has been reached, * let's go to the next _argv */ if (opt == -1) continue; /* * If opt == 0 then getopt will directly fill out the corresponding * field in CRIU's opts structure. 
*/ if (!opt) continue; switch (opt) { case 's': opts.final_state = TASK_STOPPED; break; case 'R': opts.final_state = TASK_ALIVE; break; case 'x': if (optarg && unix_sk_ids_parse(optarg) < 0) { pr_err("Failed to parse unix socket inode from optarg: %s\n", optarg); return 1; } opts.ext_unix_sk = true; break; case 't': opts.tree_id = atoi(optarg); if (opts.tree_id <= 0) goto bad_arg; break; case 'r': SET_CHAR_OPTS(root, optarg); break; case 'd': opts.restore_detach = true; break; case 'S': opts.restore_sibling = true; break; case 'D': SET_CHAR_OPTS(imgs_dir, optarg); break; case 'W': SET_CHAR_OPTS(work_dir, optarg); break; case 'o': SET_CHAR_OPTS(output, optarg); break; case 'J': if (parse_join_ns(optarg)) goto bad_arg; break; case 'v': if (optarg) { if (optarg[0] == 'v') /* handle -vvvvv */ opts.log_level += strlen(optarg) + 1; else opts.log_level = atoi(optarg); } else opts.log_level++; break; case 1043: { int fd; fd = atoi(optarg); pr_info("Closing fd %d\n", fd); close(fd); break; } case 1046: SET_CHAR_OPTS(pidfile, optarg); break; case 1047: { char *aux; aux = strchr(optarg, '='); if (aux == NULL) goto bad_arg; *aux = '\0'; if (veth_pair_add(optarg, aux + 1)) { pr_err("Failed to add veth pair: %s, %s.\n", optarg, aux + 1); return 1; } } break; case 1049: if (add_script(optarg)) { pr_err("Failed to add action-script: %s.\n", optarg); return 1; } break; case 1051: SET_CHAR_OPTS(addr, optarg); break; case 1052: opts.port = atoi(optarg); if (!opts.port) goto bad_arg; break; case 'j': opts.shell_job = true; break; case 'l': opts.handle_file_locks = true; break; case 1053: SET_CHAR_OPTS(img_parent, optarg); break; case 1057: if (parse_cpu_cap(&opts, optarg)) return 2; break; case 1058: opts.force_irmap = true; break; case 1054: pr_err("--ms is deprecated; see \"Check options\" of criu --help\n"); return 1; case 'L': SET_CHAR_OPTS(libdir, optarg); break; case 1059: *has_exec_cmd = true; break; case 1060: if (parse_manage_cgroups(&opts, optarg)) return 2; break; case 
1061: { char *path, *ctl; path = strchr(optarg, ':'); if (path) { *path = '\0'; path++; ctl = optarg; } else { path = optarg; ctl = NULL; } if (new_cg_root_add(ctl, path)) return -1; } break; case 1062: if (inherit_fd_parse(optarg) < 0) return 1; break; case 1063: ret = check_add_feature(optarg); if (ret < 0) /* invalid kernel feature name */ return 1; if (ret > 0) /* list kernel features and exit */ return 0; break; case 1064: if (!add_skip_mount(optarg)) { pr_err("Failed to add skip-mnt: %s\n", optarg); return 1; } break; case 1065: if (!add_fsname_auto(optarg)) { pr_err("Failed while parsing --enable-fs option: %s\n", optarg); return 1; } break; case 1068: SET_CHAR_OPTS(freeze_cgroup, optarg); break; case 1069: opts.ghost_limit = parse_size(optarg); break; case 1070: if (irmap_scan_path_add(optarg)) { pr_err("Failed while parsing --irmap-scan-path option: %s\n", optarg); return -1; } break; case 1071: SET_CHAR_OPTS(lsm_profile, optarg); opts.lsm_supplied = true; break; case 1072: opts.timeout = atoi(optarg); break; case 1076: opts.lazy_pages = true; break; case 'M': { char *aux; if (strcmp(optarg, "auto") == 0) { opts.autodetect_ext_mounts = true; break; } aux = strchr(optarg, ':'); if (aux == NULL) goto bad_arg; *aux = '\0'; if (ext_mount_add(optarg, aux + 1)) { pr_err("Could not add external mount when initializing config: %s, %s\n", optarg, aux + 1); return 1; } } break; case 1073: if (add_external(optarg)) { pr_err("Could not add external resource when initializing config: %s\n", optarg); return 1; } break; case 1074: if (!strcmp("net", optarg)) opts.empty_ns |= CLONE_NEWNET; else { pr_err("Unsupported empty namespace: %s\n", optarg); return 1; } break; case 1079: opts.check_extra_features = true; opts.check_experimental_features = true; break; case 1080: SET_CHAR_OPTS(cgroup_props, optarg); break; case 1081: SET_CHAR_OPTS(cgroup_props_file, optarg); break; case 1082: if (!cgp_add_dump_controller(optarg)) return 1; break; case 1088: if (sscanf(optarg, "%d", 
&opts.status_fd) != 1) { pr_err("Unable to parse a value of --status-fd\n"); return 1; } break; case 1089: break; case 1090: break; case 1091: opts.ps_socket = atoi(optarg); break; case 1092: SET_CHAR_OPTS(tls_cacert, optarg); break; case 1093: SET_CHAR_OPTS(tls_cacrl, optarg); break; case 1094: SET_CHAR_OPTS(tls_cert, optarg); break; case 1095: SET_CHAR_OPTS(tls_key, optarg); break; case 1096: SET_CHAR_OPTS(cgroup_yard, optarg); break; case 1097: if (!strcmp("read", optarg)) { opts.pre_dump_mode = PRE_DUMP_READ; } else if (strcmp("splice", optarg)) { pr_err("Unable to parse value of --pre-dump-mode\n"); return 1; } break; case 1098: if (parse_file_validation_method(&opts, optarg)) return 2; break; case 1099: SET_CHAR_OPTS(lsm_mount_context, optarg); break; case 1100: has_network_lock_opt = true; if (!strcmp("iptables", optarg)) { opts.network_lock_method = NETWORK_LOCK_IPTABLES; } else if (!strcmp("nftables", optarg)) { opts.network_lock_method = NETWORK_LOCK_NFTABLES; } else if (!strcmp("skip", optarg) || !strcmp("none", optarg)) { opts.network_lock_method = NETWORK_LOCK_SKIP; } else { pr_err("Invalid value for --network-lock: %s\n", optarg); return 1; } break; case 1101: opts.compress = true; pr_debug("Compression enabled"); break; case 'V': pr_msg("Version: %s\n", CRIU_VERSION); if (strcmp(CRIU_GITID, "0")) pr_msg("GitID: %s\n", CRIU_GITID); exit(0); case 'h': *usage_error = false; return 2; default: return 2; } } if (has_network_lock_opt && !strcmp(argv[optind], "restore")) { pr_warn("--network-lock will be ignored in restore command\n"); pr_info("Network lock method from dump will be used in restore\n"); } return 0; bad_arg: if (idx < 0) /* short option */ pr_err("invalid argument for -%c: %s\n", opt, optarg); else /* long option */ pr_err("invalid argument for --%s: %s\n", long_opts[idx].name, optarg); return 1; } int check_options(void) { if (opts.tcp_established_ok) pr_info("Will dump/restore TCP connections\n"); if (opts.tcp_skip_in_flight) pr_info("Will 
skip in-flight TCP connections\n"); if (opts.tcp_close) pr_info("Will drop all TCP connections on restore\n"); if (opts.link_remap_ok) pr_info("Will allow link remaps on FS\n"); if (opts.weak_sysctls) pr_info("Will skip non-existent sysctls on restore\n"); if (opts.deprecated_ok) pr_info("Turn deprecated stuff ON\n"); else if (getenv("CRIU_DEPRECATED")) { pr_info("Turn deprecated stuff ON via env\n"); opts.deprecated_ok = true; } if (!opts.restore_detach && opts.restore_sibling) { pr_err("--restore-sibling only makes sense with --restore-detached\n"); return 1; } if (opts.ps_socket != -1) { if (opts.addr || opts.port) pr_warn("Using --address or --port in " "combination with --ps-socket is obsolete\n"); if (opts.ps_socket <= STDERR_FILENO && opts.daemon_mode) { pr_err("Standard file descriptors will be closed" " in daemon mode\n"); return 1; } } #ifndef CONFIG_GNUTLS if (opts.tls) { pr_err("CRIU was built without TLS support\n"); return 1; } #endif if (opts.mntns_compat_mode && opts.mode != CR_RESTORE) { pr_err("Option --mntns-compat-mode is only valid on restore\n"); return 1; } else if (!opts.mntns_compat_mode && opts.mode == CR_RESTORE) { if (check_mount_v2()) { pr_debug("Mount engine fallback to --mntns-compat-mode mode\n"); opts.mntns_compat_mode = true; } } if (opts.track_mem && !kdat.has_dirty_track) { pr_err("Tracking memory is not available. 
Consider omitting --track-mem option.\n"); return 1; } if (check_namespace_opts()) { pr_err("Error: namespace flags conflict\n"); return 1; } return 0; } crac-criu-1.5.0/criu/cr-check.c000066400000000000000000001054631471504326700161740ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "../soccr/soccr.h" #include "types.h" #include "fdinfo.h" #include "sockets.h" #include "crtools.h" #include "log.h" #include "util-caps.h" #include "prctl.h" #include "files.h" #include "sk-inet.h" #include "proc_parse.h" #include "mount.h" #include "tty.h" #include #include "ptrace-compat.h" #include "kerndat.h" #include "timerfd.h" #include "util.h" #include "tun.h" #include "namespaces.h" #include "pstree.h" #include "lsm.h" #include "apparmor.h" #include "cr_options.h" #include "libnetlink.h" #include "net.h" #include "restorer.h" #include "uffd.h" #include "linux/aio_abi.h" #include "images/inventory.pb-c.h" static char *feature_name(int (*func)(void)); static int check_tty(void) { int master = -1, slave = -1; const int lock = 1; struct termios t; char *slavename; int ret = -1; if (ARRAY_SIZE(t.c_cc) < TERMIOS_NCC) { pr_err("struct termios has %d @c_cc while " "at least %d expected.\n", (int)ARRAY_SIZE(t.c_cc), TERMIOS_NCC); goto out; } master = open("/dev/ptmx", O_RDWR); if (master < 0) { pr_perror("Can't open /dev/ptmx"); goto out; } if (ioctl(master, TIOCSPTLCK, &lock)) { pr_perror("Can't lock pty master"); goto out; } slavename = ptsname(master); slave = open(slavename, O_RDWR); if (slave < 0) { if (errno != EIO) { pr_perror("Unexpected error on locked pty"); goto out; } } else { pr_err("Managed to open locked pty.\n"); goto out; } ret = 0; out: close_safe(&master); close_safe(&slave); return ret; } static int check_apparmor_stacking(void) { if 
(!kdat.apparmor_ns_dumping_enabled) return -1; return 0; } static int check_map_files(void) { int ret; ret = access("/proc/self/map_files", R_OK); if (!ret) return 0; pr_perror("/proc//map_files is inaccessible"); return -1; } static int check_sock_diag(void) { int ret; struct ns_id ns; ns.ns_pid = 0; ns.type = NS_CRIU; ns.net.nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG); if (ns.net.nlsk < 0) { pr_perror("Can't make diag socket for check"); return -1; } ret = collect_sockets(&ns); if (!ret) return 0; pr_msg("The sock diag infrastructure is incomplete.\n"); pr_msg("Make sure you have:\n"); pr_msg(" 1. *_DIAG kernel config options turned on;\n"); pr_msg(" 2. *_diag.ko modules loaded (if compiled as modules).\n"); return -1; } static int check_ns_last_pid(void) { int ret; ret = access("/proc/" LAST_PID_PATH, W_OK); if (!ret) return 0; pr_perror("%s sysctl is inaccessible", LAST_PID_PATH); return -1; } static int check_sock_peek_off(void) { int sk; int ret, off, sz; sk = socket(PF_UNIX, SOCK_DGRAM, 0); if (sk < 0) { pr_perror("Can't create unix socket for check"); return -1; } sz = sizeof(off); ret = getsockopt(sk, SOL_SOCKET, SO_PEEK_OFF, &off, (socklen_t *)&sz); close(sk); if ((ret == 0) && (off == -1) && (sz == sizeof(int))) return 0; pr_msg("SO_PEEK_OFF sockoption doesn't work.\n"); return -1; } static int check_kcmp(void) { int ret = syscall(SYS_kcmp, getpid(), -1, -1, -1, -1); if (ret < 0 && errno == ENOSYS) { pr_perror("System call kcmp is not supported"); return -1; } return 0; } static int check_prctl_cat1(void) { unsigned long user_auxv = 0; unsigned int *tid_addr; unsigned int size = 0; int ret; ret = prctl(PR_GET_TID_ADDRESS, (unsigned long)&tid_addr, 0, 0, 0); if (ret < 0) { pr_perror("prctl: PR_GET_TID_ADDRESS is not supported"); return -1; } /* * It's OK if the new interface is not supported because it's * a Category 2 feature, but the old interface has to be supported. 
*/ ret = prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0); if (ret < 0) { pr_msg("Info prctl: PR_SET_MM_MAP_SIZE is not supported\n"); ret = prctl(PR_SET_MM, PR_SET_MM_BRK, (unsigned long)sbrk(0), 0, 0); if (ret < 0) { if (errno == EPERM) pr_msg("prctl: One needs CAP_SYS_RESOURCE capability to perform testing\n"); else pr_perror("prctl: PR_SET_MM_BRK is not supported"); return -1; } ret = prctl(PR_SET_MM, PR_SET_MM_EXE_FILE, -1, 0, 0); if (ret < 0 && errno != EBADF) { pr_perror("prctl: PR_SET_MM_EXE_FILE is not supported"); return -1; } ret = prctl(PR_SET_MM, PR_SET_MM_AUXV, (long)&user_auxv, sizeof(user_auxv), 0); if (ret < 0) { pr_perror("prctl: PR_SET_MM_AUXV is not supported"); return -1; } } return 0; } static int check_prctl_cat2(void) { unsigned int size = 0; int ret; ret = prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0); if (ret) { pr_warn("prctl: PR_SET_MM_MAP_SIZE is not supported\n"); return -1; } return 0; } static int check_fcntl(void) { u32 v[2]; int fd; fd = open_proc(PROC_SELF, "comm"); if (fd < 0) return -1; if (fcntl(fd, F_GETOWNER_UIDS, (long)v)) { pr_perror("Can't fetch file owner UIDs"); close(fd); return -1; } close(fd); return 0; } static int check_proc_stat(void) { struct proc_pid_stat stat; int ret; ret = parse_pid_stat(getpid(), &stat); if (ret) { pr_msg("procfs: stat extension is not supported\n"); return -1; } return 0; } static int check_fdinfo_eventfd(void) { int fd, ret; int cnt = 13; EventfdFileEntry fe = EVENTFD_FILE_ENTRY__INIT; fd = eventfd(cnt, 0); if (fd < 0) { pr_perror("Can't make eventfd"); return -1; } ret = parse_fdinfo(fd, FD_TYPES__EVENTFD, &fe); close(fd); if (ret) { pr_err("Error parsing proc fdinfo\n"); return -1; } if (fe.counter != cnt) { pr_err("Counter mismatch (or not met) %d want %d\n", (int)fe.counter, cnt); return -1; } pr_info("Eventfd fdinfo works OK (%d vs %d)\n", cnt, (int)fe.counter); return 0; } int check_mnt_id(void) { struct fdinfo_common fdinfo = { .mnt_id = -1 }; int 
ret; ret = parse_fdinfo(get_service_fd(LOG_FD_OFF), FD_TYPES__UND, &fdinfo); if (ret < 0) return -1; if (fdinfo.mnt_id == -1) { pr_err("fdinfo doesn't contain the mnt_id field\n"); return -1; } return 0; } static int check_fdinfo_signalfd(void) { int fd, ret; sigset_t mask; SignalfdEntry sfd = SIGNALFD_ENTRY__INIT; sigemptyset(&mask); sigaddset(&mask, SIGUSR1); fd = signalfd(-1, &mask, 0); if (fd < 0) { pr_perror("Can't make signalfd"); return -1; } ret = parse_fdinfo(fd, FD_TYPES__SIGNALFD, &sfd); close(fd); if (ret) { pr_err("Error parsing proc fdinfo\n"); return -1; } return 0; } static int check_fdinfo_eventpoll(void) { int efd, pfd[2], ret = -1; struct epoll_event ev; EventpollFileEntry efe = EVENTPOLL_FILE_ENTRY__INIT; if (pipe(pfd)) { pr_perror("Can't make pipe to watch"); return -1; } efd = epoll_create(1); if (efd < 0) { pr_perror("Can't make epoll fd"); goto pipe_err; } memset(&ev, 0, sizeof(ev)); ev.events = EPOLLIN | EPOLLOUT; if (epoll_ctl(efd, EPOLL_CTL_ADD, pfd[0], &ev)) { pr_perror("Can't add epoll tfd"); goto epoll_err; } ret = parse_fdinfo(efd, FD_TYPES__EVENTPOLL, &efe); if (ret) { pr_err("Error parsing proc fdinfo\n"); goto epoll_err; } if (efe.n_tfd != 1 || efe.tfd[0]->tfd != pfd[0]) { pr_err("TFD mismatch (or not met)\n"); ret = -1; goto epoll_err; } pr_info("Epoll fdinfo works OK\n"); epoll_err: close(efd); pipe_err: close(pfd[0]); close(pfd[1]); return ret; } static int check_fdinfo_inotify(void) { int ifd, wd, ret; InotifyFileEntry ify = INOTIFY_FILE_ENTRY__INIT; ifd = inotify_init1(0); if (ifd < 0) { pr_perror("Can't make inotify fd"); return -1; } wd = inotify_add_watch(ifd, ".", IN_ALL_EVENTS); if (wd < 0) { pr_perror("Can't add watch"); close(ifd); return -1; } ret = parse_fdinfo(ifd, FD_TYPES__INOTIFY, &ify); close(ifd); if (ret < 0) { pr_err("Error parsing proc fdinfo\n"); return -1; } if (ify.n_wd != 1 || ify.wd[0]->wd != wd) { pr_err("WD mismatch (or not met)\n"); return -1; } pr_info("Inotify fdinfo works OK\n"); return 0; } static 
/*
 * Aggregate check: verify that /proc fdinfo exposes enough detail for
 * eventfd, epoll, signalfd and inotify descriptors to be dumped.
 * Returns 0 only if every sub-check passed.
 */
int check_fdinfo_ext(void)
{
	int ret = 0;

	ret |= check_fdinfo_eventfd();
	ret |= check_fdinfo_eventpoll();
	ret |= check_fdinfo_signalfd();
	ret |= check_fdinfo_inotify();

	return ret;
}

/*
 * Check that vmsplice() accepts a buffer which is not page-aligned
 * (a single char on the stack). Returns 0 if the kernel supports it.
 */
static int check_unaligned_vmsplice(void)
{
	int p[2], ret;
	char buf; /* :) */
	struct iovec iov;

	ret = pipe(p);
	if (ret < 0) {
		pr_perror("Can't create pipe");
		return ret;
	}
	iov.iov_base = &buf;
	iov.iov_len = sizeof(buf);
	ret = vmsplice(p[1], &iov, 1, SPLICE_F_GIFT | SPLICE_F_NONBLOCK);
	if (ret < 0) {
		pr_perror("Unaligned vmsplice doesn't work");
		goto err;
	}

	pr_info("Unaligned vmsplice works OK\n");
	ret = 0;
err:
	close(p[0]);
	close(p[1]);

	return ret;
}

/* Older kernel headers may lack SO_GET_FILTER; it aliases SO_ATTACH_FILTER. */
#ifndef SO_GET_FILTER
#define SO_GET_FILTER SO_ATTACH_FILTER
#endif

/*
 * Check that getsockopt() supports reading back the attached socket
 * filter (SO_GET_FILTER) and the bound device name (SO_BINDTODEVICE),
 * both needed when dumping sockets.
 */
static int check_so_gets(void)
{
	int sk, ret = -1;
	socklen_t len;
	char name[IFNAMSIZ];

	sk = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (sk < 0) {
		pr_perror("No socket");
		return -1;
	}

	len = 0;
	if (getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &len)) {
		pr_perror("Can't get socket filter");
		goto err;
	}

	len = sizeof(name);
	if (getsockopt(sk, SOL_SOCKET, SO_BINDTODEVICE, name, &len)) {
		pr_perror("Can't get socket bound dev");
		goto err;
	}

	ret = 0;
err:
	close(sk);
	return ret;
}

/* Check access to the sysvipc sem_next_id sysctl (root only, see below). */
static int check_ipc(void)
{
	int ret;

	/*
	 * Since kernel 5.16 sem_next_id can be accessed via CAP_CHECKPOINT_RESTORE, however
	 * for non-root users access() runs with an empty set of caps and will therefore always
	 * fail.
*/ if (opts.uid) return 0; ret = access("/proc/sys/kernel/sem_next_id", R_OK | W_OK); if (!ret) return 0; pr_perror("/proc/sys/kernel/sem_next_id is inaccessible"); return -1; } static int check_sigqueuinfo(void) { siginfo_t info = { .si_code = 1 }; signal(SIGUSR1, SIG_IGN); if (syscall(SYS_rt_sigqueueinfo, getpid(), SIGUSR1, &info) < 0) { pr_perror("Unable to send siginfo with positive si_code to itself"); return -1; } return 0; } static int check_ptrace_peeksiginfo(void) { struct ptrace_peeksiginfo_args arg; siginfo_t siginfo; pid_t pid, ret = 0; k_rtsigset_t mask; pid = fork_and_ptrace_attach(NULL); if (pid < 0) return -1; arg.flags = 0; arg.off = 0; arg.nr = 1; if (ptrace(PTRACE_PEEKSIGINFO, pid, &arg, &siginfo) != 0) { pr_perror("Unable to dump pending signals"); ret = -1; } if (ptrace(PTRACE_GETSIGMASK, pid, sizeof(mask), &mask) != 0) { pr_perror("Unable to dump signal blocking mask"); ret = -1; } kill(pid, SIGKILL); waitpid(pid, NULL, 0); return ret; } struct special_mapping { const char *name; void *addr; size_t size; }; static int parse_special_maps(struct special_mapping *vmas, size_t nr) { FILE *maps; char buf[256]; int ret = 0; maps = fopen_proc(PROC_SELF, "maps"); if (!maps) return -1; while (fgets(buf, sizeof(buf), maps)) { unsigned long start, end; int r, tail; size_t i; r = sscanf(buf, "%lx-%lx %*s %*s %*s %*s %n\n", &start, &end, &tail); if (r != 2) { fclose(maps); pr_err("Bad maps format %d.%d (%s)\n", r, tail, buf + tail); return -1; } for (i = 0; i < nr; i++) { if (strcmp(buf + tail, vmas[i].name) != 0) continue; if (vmas[i].addr != MAP_FAILED) { pr_err("Special mapping meet twice: %s\n", vmas[i].name); ret = -1; goto out; } vmas[i].addr = (void *)start; vmas[i].size = end - start; } } out: fclose(maps); return ret; } static void dummy_sighandler(int sig) { } /* * The idea of test is checking if the kernel correctly tracks positions * of special_mappings: vdso/vvar/sigpage/... 
* Per-architecture commits added handling for mremap() somewhere between * v4.8...v4.14. If the kernel doesn't have one of those patches, * a process will crash after receiving a signal (we use SIGUSR1 for * the test here). That's because after processing a signal the kernel * needs a "landing" to return to userspace, which is based on vdso/sigpage. * If the kernel doesn't track the position of mapping - we land in the void. * And we definitely mremap() support by the fact that those special_mappings * are subjects for ASLR. (See #288 as a reference) */ static void check_special_mapping_mremap_child(struct special_mapping *vmas, size_t nr) { size_t i, parking_size = 0; void *parking_lot; pid_t self = getpid(); for (i = 0; i < nr; i++) { if (vmas[i].addr != MAP_FAILED) parking_size += vmas[i].size; } if (signal(SIGUSR1, dummy_sighandler) == SIG_ERR) { pr_perror("signal() failed"); exit(1); } parking_lot = mmap(NULL, parking_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (parking_lot == MAP_FAILED) { pr_perror("mmap(%zu) failed", parking_size); exit(1); } for (i = 0; i < nr; i++) { unsigned long ret; if (vmas[i].addr == MAP_FAILED) continue; ret = syscall(__NR_mremap, (unsigned long)vmas[i].addr, vmas[i].size, vmas[i].size, MREMAP_FIXED | MREMAP_MAYMOVE, (unsigned long)parking_lot); if (ret != (unsigned long)parking_lot) syscall(__NR_exit, 1); parking_lot += vmas[i].size; } syscall(__NR_kill, self, SIGUSR1); syscall(__NR_exit, 0); } static int check_special_mapping_mremap(void) { struct special_mapping special_vmas[] = { { .name = "[vvar]\n", .addr = MAP_FAILED, }, { .name = "[vdso]\n", .addr = MAP_FAILED, }, { .name = "[sigpage]\n", .addr = MAP_FAILED, }, /* XXX: { .name = "[uprobes]\n" }, */ /* * Not subjects for ASLR, skipping: * { .name = "[vectors]\n", }, * { .name = "[vsyscall]\n" }, */ }; size_t vmas_nr = ARRAY_SIZE(special_vmas); pid_t child; int stat; if (parse_special_maps(special_vmas, vmas_nr)) return -1; child = fork(); if (child < 0) { 
pr_perror("%s(): failed to fork()", __func__); return -1; } if (child == 0) check_special_mapping_mremap_child(special_vmas, vmas_nr); if (waitpid(child, &stat, 0) != child) { if (errno == ECHILD) { pr_err("BUG: Someone waited for the child already\n"); return -1; } /* Probably, we're interrupted with a signal - cleanup */ pr_err("Failed to wait for a child %d\n", errno); kill(child, SIGKILL); waitpid(child, NULL, 0); return -1; } if (WIFSIGNALED(stat)) { pr_err("Child killed by signal %d\n", WTERMSIG(stat)); pr_err("Your kernel probably lacks the support for mremapping special mappings\n"); return -1; } else if (WIFEXITED(stat)) { if (WEXITSTATUS(stat) == 0) return 0; pr_err("Child exited with %d\n", WEXITSTATUS(stat)); return -1; } pr_err("BUG: waitpid() returned stat=%d\n", stat); /* We're not killing the child here - it's predestined to die anyway. */ return -1; } static int check_ptrace_suspend_seccomp(void) { pid_t pid; int ret = 0; pid = fork_and_ptrace_attach(NULL); if (pid < 0) return -1; if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) { if (errno == EINVAL) { pr_err("Kernel doesn't support PTRACE_O_SUSPEND_SECCOMP\n"); } else { pr_perror("couldn't suspend seccomp"); } ret = -1; } kill(pid, SIGKILL); waitpid(pid, NULL, 0); return ret; } static int setup_seccomp_filter(void) { struct sock_filter filter[] = { BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct seccomp_data, nr)), /* Allow all syscalls except ptrace */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_ptrace, 0, 1), BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_KILL), BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog bpf_prog = { .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])), .filter = filter, }; if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, (long)&bpf_prog, 0, 0) < 0) return -1; return 0; } static int check_ptrace_dump_seccomp_filters(void) { pid_t pid; int ret = 0, len; pid = fork_and_ptrace_attach(setup_seccomp_filter); if (pid < 0) return -1; len = 
ptrace(PTRACE_SECCOMP_GET_FILTER, pid, 0, NULL); if (len < 0) { ret = -1; pr_perror("Dumping seccomp filters not supported"); } kill(pid, SIGKILL); waitpid(pid, NULL, 0); return ret; } static int check_ptrace_get_rseq_conf(void) { if (!kdat.has_ptrace_get_rseq_conf) { pr_warn("ptrace(PTRACE_GET_RSEQ_CONFIGURATION) isn't supported. C/R of processes which are using rseq() won't work.\n"); return -1; } return 0; } static int check_mem_dirty_track(void) { if (!kdat.has_dirty_track) { pr_warn("Dirty tracking is OFF. Memory snapshot will not work.\n"); return -1; } return 0; } static int check_posix_timers(void) { int ret; ret = access("/proc/self/timers", R_OK); if (!ret) return 0; pr_msg("/proc//timers file is missing.\n"); return -1; } static unsigned long get_ring_len(unsigned long addr) { FILE *maps; char buf[256]; maps = fopen_proc(PROC_SELF, "maps"); if (!maps) return 0; while (fgets(buf, sizeof(buf), maps)) { unsigned long start, end; int r, tail; r = sscanf(buf, "%lx-%lx %*s %*s %*s %*s %n\n", &start, &end, &tail); if (r != 2) { fclose(maps); pr_err("Bad maps format %d.%d (%s)\n", r, tail, buf + tail); return 0; } if (start == addr) { fclose(maps); if (strcmp(buf + tail, "/[aio] (deleted)\n")) goto notfound; return end - start; } } fclose(maps); notfound: pr_err("No AIO ring at expected location\n"); return 0; } static int check_aio_remap(void) { aio_context_t ctx = 0; unsigned long len; void *naddr; int r; if (syscall(SYS_io_setup, 16, &ctx) < 0) { pr_perror("No AIO syscall"); return -1; } len = get_ring_len((unsigned long)ctx); if (!len) return -1; naddr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); if (naddr == MAP_FAILED) { pr_perror("Can't find place for new AIO ring"); return -1; } if (mremap((void *)ctx, len, len, MREMAP_FIXED | MREMAP_MAYMOVE, naddr) == MAP_FAILED) { pr_perror("Can't remap AIO ring"); return -1; } ctx = (aio_context_t)naddr; r = syscall(SYS_io_getevents, ctx, 0, 1, NULL, NULL); if (r < 0) { pr_perror("AIO 
remap doesn't work properly"); return -1; } return 0; } static int check_fdinfo_lock(void) { if (!kdat.has_fdinfo_lock) { pr_err("fdinfo doesn't contain the lock field\n"); return -1; } return 0; } struct clone_arg { /* * Reserve some space for clone() to locate arguments * and retcode in this place */ char stack[128] __stack_aligned__; char stack_ptr[0]; }; static int clone_cb(void *_arg) { exit(0); } static int check_clone_parent_vs_pid(void) { struct clone_arg ca; pid_t pid; pid = clone(clone_cb, ca.stack_ptr, CLONE_NEWPID | CLONE_PARENT, &ca); if (pid < 0) { pr_err("CLONE_PARENT | CLONE_NEWPID don't work together\n"); return -1; } return 0; } static int check_autofs_pipe_ino(void) { FILE *f; char str[1024]; int ret = -ENOENT; f = fopen_proc(PROC_SELF, "mountinfo"); if (!f) return -1; while (fgets(str, sizeof(str), f)) { if (strstr(str, " autofs ")) { if (strstr(str, "pipe_ino=")) ret = 0; else { pr_err("autofs not supported.\n"); ret = -ENOTSUP; } break; } } fclose(f); return ret; } static int check_autofs(void) { char *dir, *options, template[] = "/tmp/.criu.mnt.XXXXXX"; int ret, pfd[2]; ret = check_autofs_pipe_ino(); if (ret != -ENOENT) return ret; if (pipe(pfd) < 0) { pr_perror("failed to create pipe"); return -1; } ret = -1; options = xsprintf("fd=%d,pgrp=%d,minproto=5,maxproto=5,direct", pfd[1], getpgrp()); if (!options) { pr_err("failed to allocate autofs options\n"); goto close_pipe; } dir = mkdtemp(template); if (!dir) { pr_perror("failed to construct temporary name"); goto free_options; } if (mount("criu", dir, "autofs", 0, options) < 0) { pr_perror("failed to mount autofs"); goto unlink_dir; } ret = check_autofs_pipe_ino(); if (umount(dir)) pr_perror("failed to umount %s", dir); unlink_dir: if (rmdir(dir)) pr_perror("failed to unlink %s", dir); free_options: free(options); close_pipe: close(pfd[0]); close(pfd[1]); return ret; } static int check_cgroupns(void) { int ret; ret = access("/proc/self/ns/cgroup", F_OK); if (ret < 0) { pr_err("cgroupns not 
supported. This is not fatal.\n"); return -1; } return 0; } static int check_tcp(void) { socklen_t optlen; int sk, ret; int val; sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); if (sk < 0) { pr_perror("Can't create TCP socket :("); return -1; } val = 1; if (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) { ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); if (ret < 0) { pr_perror("Can't turn TCP repair mode ON"); goto out; } } else { pr_info("Not checking for TCP repair mode. Please set CAP_NET_ADMIN\n"); } optlen = sizeof(val); ret = getsockopt(sk, SOL_TCP, TCP_TIMESTAMP, &val, &optlen); if (ret) pr_perror("Can't get TCP_TIMESTAMP"); out: close(sk); return ret; } static int check_tcp_halt_closed(void) { if (!kdat.has_tcp_half_closed) { pr_err("TCP_REPAIR can't be enabled for half-closed sockets\n"); return -1; } return 0; } static int kerndat_tcp_repair_window(void) { struct tcp_repair_window opt; socklen_t optlen = sizeof(opt); int sk, val = 1; sk = socket(AF_INET, SOCK_STREAM, 0); if (sk < 0 && errno == EAFNOSUPPORT) sk = socket(AF_INET6, SOCK_STREAM, 0); if (sk < 0) { pr_perror("Unable to create inet socket"); goto errn; } if (setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val))) { if (errno == EPERM) { pr_warn("TCP_REPAIR isn't available to unprivileged users\n"); goto now; } pr_perror("Unable to set TCP_REPAIR"); goto err; } if (getsockopt(sk, SOL_TCP, TCP_REPAIR_WINDOW, &opt, &optlen)) { if (errno != ENOPROTOOPT) { pr_perror("Unable to set TCP_REPAIR_WINDOW"); goto err; } now: val = 0; } else val = 1; close(sk); return val; err: close(sk); errn: return -1; } static int check_tcp_window(void) { int ret; ret = kerndat_tcp_repair_window(); if (ret < 0) return -1; if (ret == 0) { pr_err("The TCP_REPAIR_WINDOW option isn't supported.\n"); return -1; } return 0; } static int check_userns(void) { int ret; unsigned long size = 0; ret = access("/proc/self/ns/user", F_OK); if (ret) { pr_perror("No userns proc file"); return -1; } ret = 
prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0); if (ret < 0) { pr_perror("prctl: PR_SET_MM_MAP_SIZE is not supported"); return -1; } return 0; } static int check_loginuid(void) { if (kdat.luid != LUID_FULL) { pr_warn("Loginuid restore is OFF.\n"); return -1; } return 0; } static int check_compat_cr(void) { #ifdef CONFIG_COMPAT if (kdat_compatible_cr()) return 0; pr_warn("compat_cr is not supported. Requires kernel >= v4.12\n"); #else pr_warn("CRIU built without CONFIG_COMPAT - can't C/R compatible tasks\n"); #endif return -1; } static int check_nftables_cr(void) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) return 0; #else pr_warn("CRIU was built without nftables support - nftables rules will " "not be preserved during C/R\n"); return -1; #endif } static int check_ipt_legacy(void) { char *ipt_legacy_bin; char *ip6t_legacy_bin; ipt_legacy_bin = get_legacy_iptables_bin(false, false); if (!ipt_legacy_bin) { pr_warn("Couldn't find iptables version which is using iptables legacy API\n"); return -1; } pr_info("iptables cmd: %s\n", ipt_legacy_bin); if (!kdat.ipv6) return 0; ip6t_legacy_bin = get_legacy_iptables_bin(true, false); if (!ip6t_legacy_bin) { pr_warn("Couldn't find ip6tables version which is using iptables legacy API\n"); return -1; } pr_info("ip6tables cmd: %s\n", ip6t_legacy_bin); return 0; } static int check_uffd(void) { if (!kdat.has_uffd) { pr_err("UFFD is not supported\n"); return -1; } return 0; } static int check_uffd_noncoop(void) { if (check_uffd()) return -1; if (!uffd_noncooperative()) { pr_err("Non-cooperative UFFD is not supported\n"); return -1; } return 0; } static int check_clone3_set_tid(void) { if (!kdat.has_clone3_set_tid) { pr_warn("clone3() with set_tid not supported\n"); return -1; } return 0; } static int check_can_map_vdso(void) { if (kdat_can_map_vdso() == 1) return 0; pr_warn("Do not have API to map vDSO - will use mremap() to restore vDSO\n"); return -1; } static int 
check_sk_netns(void) { if (!kdat.sk_ns) return -1; return 0; } static int check_sk_unix_file(void) { if (!kdat.sk_unix_file) return -1; return 0; } static int check_kcmp_epoll(void) { if (!kdat.has_kcmp_epoll_tfd) return -1; return 0; } static int check_time_namespace(void) { if (!kdat.has_timens) { pr_err("Time namespaces are not supported\n"); return -1; } return 0; } static int check_newifindex(void) { if (!kdat.has_newifindex) { pr_err("IFLA_NEW_IFINDEX isn't supported\n"); return -1; } return 0; } static int check_net_diag_raw(void) { check_sock_diag(); return (socket_test_collect_bit(AF_INET, IPPROTO_RAW) && socket_test_collect_bit(AF_INET6, IPPROTO_RAW)) ? 0 : -1; } static int check_pidfd_store(void) { if (!kdat.has_pidfd_open) { pr_warn("Pidfd store requires pidfd_open syscall which is not supported\n"); return -1; } if (!kdat.has_pidfd_getfd) { pr_warn("Pidfd store requires pidfd_getfd syscall which is not supported\n"); return -1; } return 0; } static int check_ns_pid(void) { if (!kdat.has_nspid) return -1; return 0; } static int check_memfd_hugetlb(void) { if (!kdat.has_memfd_hugetlb) return -1; return 0; } static int check_network_lock_nftables(void) { if (!kdat.has_nftables_concat) { pr_warn("Nftables based locking requires libnftables and set concatenations support\n"); return -1; } return 0; } static int check_sockopt_buf_lock(void) { if (!kdat.has_sockopt_buf_lock) return -1; return 0; } static int check_move_mount_set_group(void) { if (!kdat.has_move_mount_set_group) return -1; return 0; } static int check_openat2(void) { if (!kdat.has_openat2) return -1; return 0; } static int check_ipv6_freebind(void) { if (!kdat.has_ipv6_freebind) return -1; return 0; } static int (*chk_feature)(void); /* * There are three categories of kernel features: * * 1. Absolutely required (/proc/pid/map_files, ptrace PEEKSIGINFO, etc.). * 2. Required only for specific cases (aio remap, tun, etc.). * Checked when --extra or --all is specified. * 3. 
Experimental (task-diag). * Checked when --experimental or --all is specified. * * We fail if any feature in category 1 is missing but tolerate failures * in the other categories. Currently, there is nothing in category 3. */ #define CHECK_GOOD "Looks good." #define CHECK_BAD "Does not look good." #define CHECK_MAYBE \ "Looks good but some kernel features are missing\n" \ "which, depending on your process tree, may cause\n" \ "dump or restore failure." #define CHECK_CAT1(fn) \ do { \ if ((ret = fn) != 0) { \ pr_warn("%s\n", CHECK_BAD); \ return ret; \ } \ } while (0) int cr_check(void) { struct ns_id *ns; int ret = 0; root_item = alloc_pstree_item(); if (root_item == NULL) return -1; root_item->pid->real = getpid(); if (collect_pstree_ids()) return -1; ns = lookup_ns_by_id(root_item->ids->mnt_ns_id, &mnt_ns_desc); if (ns == NULL) return -1; mntinfo = collect_mntinfo(ns, false); if (mntinfo == NULL) return -1; if (chk_feature) { if (chk_feature()) return -1; pr_msg("%s is supported\n", feature_name(chk_feature)); return 0; } /* * Category 1 - absolutely required. * So that the user can see clearly what's missing, we exit with * non-zero status on the first failure because it gets very * confusing when there are many warnings and error messages. */ CHECK_CAT1(check_map_files()); CHECK_CAT1(check_sock_diag()); CHECK_CAT1(check_ns_last_pid()); CHECK_CAT1(check_sock_peek_off()); CHECK_CAT1(check_kcmp()); CHECK_CAT1(check_prctl_cat1()); CHECK_CAT1(check_fcntl()); CHECK_CAT1(check_proc_stat()); CHECK_CAT1(check_tcp()); CHECK_CAT1(check_fdinfo_ext()); CHECK_CAT1(check_unaligned_vmsplice()); CHECK_CAT1(check_tty()); CHECK_CAT1(check_so_gets()); CHECK_CAT1(check_ipc()); CHECK_CAT1(check_sigqueuinfo()); CHECK_CAT1(check_ptrace_peeksiginfo()); CHECK_CAT1(check_special_mapping_mremap()); /* * Category 2 - required for specific cases. * Unlike Category 1 features, we don't exit with non-zero status * on a failure because CRIU may still work. 
*/ if (opts.check_extra_features) { ret |= check_prctl_cat2(); ret |= check_ptrace_suspend_seccomp(); ret |= check_ptrace_dump_seccomp_filters(); ret |= check_mem_dirty_track(); ret |= check_posix_timers(); ret |= check_tun_cr(0); ret |= check_timerfd(); ret |= check_mnt_id(); ret |= check_aio_remap(); ret |= check_fdinfo_lock(); ret |= check_clone_parent_vs_pid(); ret |= check_cgroupns(); ret |= check_tcp_window(); ret |= check_tcp_halt_closed(); ret |= check_userns(); ret |= check_loginuid(); ret |= check_can_map_vdso(); ret |= check_uffd(); ret |= check_uffd_noncoop(); ret |= check_sk_netns(); ret |= check_kcmp_epoll(); ret |= check_net_diag_raw(); ret |= check_clone3_set_tid(); ret |= check_time_namespace(); ret |= check_newifindex(); ret |= check_pidfd_store(); ret |= check_ns_pid(); ret |= check_network_lock_nftables(); ret |= check_sockopt_buf_lock(); ret |= check_memfd_hugetlb(); ret |= check_move_mount_set_group(); ret |= check_openat2(); ret |= check_ptrace_get_rseq_conf(); ret |= check_ipv6_freebind(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); } /* * Category 3 - experimental. */ if (opts.check_experimental_features) { ret |= check_autofs(); ret |= check_compat_cr(); } pr_msg("%s\n", ret ? CHECK_MAYBE : CHECK_GOOD); return ret; } #undef CHECK_GOOD #undef CHECK_BAD #undef CHECK_MAYBE #undef CHECK_CAT1 static int check_tun(void) { /* * In case there's no TUN support at all we * should report error. Unlike this plain criu * check would report "Looks good" in this case * since C/R effectively works, just not for TUN. */ return check_tun_cr(-1); } static int check_tun_netns(void) { bool has = false; check_tun_netns_cr(&has); return has ? 
0 : -1; } static int check_nsid(void) { if (!kdat.has_nsid) { pr_warn("NSID isn't supported\n"); return -1; } return 0; } static int check_link_nsid(void) { if (!kdat.has_link_nsid) { pr_warn("NSID isn't supported\n"); return -1; } return 0; } static int check_external_net_ns(void) { /* * This is obviously not a real check. This only exists, so that * CRIU clients/users can check if this CRIU version supports the * external network namespace feature. Theoretically the CRIU client * or user could also parse the version, but especially for CLI users * version comparison in the shell is not easy. * This feature check does not exist for RPC as RPC has a special * version call which does not require string parsing and the external * network namespace feature is available for all CRIU versions newer * than 3.9. */ return 0; } struct feature_list { char *name; int (*func)(void); }; static struct feature_list feature_list[] = { { "mnt_id", check_mnt_id }, { "mem_dirty_track", check_mem_dirty_track }, { "aio_remap", check_aio_remap }, { "timerfd", check_timerfd }, { "tun", check_tun }, { "tun_ns", check_tun_netns }, { "userns", check_userns }, { "fdinfo_lock", check_fdinfo_lock }, { "seccomp_suspend", check_ptrace_suspend_seccomp }, { "seccomp_filters", check_ptrace_dump_seccomp_filters }, { "loginuid", check_loginuid }, { "cgroupns", check_cgroupns }, { "autofs", check_autofs }, { "tcp_half_closed", check_tcp_halt_closed }, { "compat_cr", check_compat_cr }, { "uffd", check_uffd }, { "uffd-noncoop", check_uffd_noncoop }, { "can_map_vdso", check_can_map_vdso }, { "sk_ns", check_sk_netns }, { "sk_unix_file", check_sk_unix_file }, { "net_diag_raw", check_net_diag_raw }, { "nsid", check_nsid }, { "link_nsid", check_link_nsid }, { "kcmp_epoll", check_kcmp_epoll }, { "timens", check_time_namespace }, { "external_net_ns", check_external_net_ns }, { "clone3_set_tid", check_clone3_set_tid }, { "newifindex", check_newifindex }, { "nftables", check_nftables_cr }, { "has_ipt_legacy", 
check_ipt_legacy }, { "pidfd_store", check_pidfd_store }, { "ns_pid", check_ns_pid }, { "apparmor_stacking", check_apparmor_stacking }, { "network_lock_nftables", check_network_lock_nftables }, { "sockopt_buf_lock", check_sockopt_buf_lock }, { "memfd_hugetlb", check_memfd_hugetlb }, { "move_mount_set_group", check_move_mount_set_group }, { "openat2", check_openat2 }, { "get_rseq_conf", check_ptrace_get_rseq_conf }, { "ipv6_freebind", check_ipv6_freebind }, { NULL, NULL }, }; void pr_check_features(const char *offset, const char *sep, int width) { struct feature_list *fl; int pos = width + 1; int sep_len = strlen(sep); int offset_len = strlen(offset); for (fl = feature_list; fl->name; fl++) { int len = strlen(fl->name); if (pos + len + sep_len > width) { pr_msg("\n%s", offset); pos = offset_len; } pr_msg("%s", fl->name); // no \n pos += len; if ((fl + 1)->name) { // not the last item pr_msg("%s", sep); // no \n pos += sep_len; } } pr_msg("\n"); } int check_add_feature(char *feat) { struct feature_list *fl; for (fl = feature_list; fl->name; fl++) { if (!strcmp(feat, fl->name)) { chk_feature = fl->func; return 0; } } pr_err("Unknown feature %s\n", feat); return -1; } static char *feature_name(int (*func)(void)) { struct feature_list *fl; for (fl = feature_list; fl->func; fl++) { if (fl->func == func) return fl->name; } return NULL; } static int pr_set_dumpable(int value) { int ret = prctl(PR_SET_DUMPABLE, value, 0, 0, 0); if (ret < 0) pr_perror("Unable to set PR_SET_DUMPABLE"); return ret; } int check_caps(void) { /* Read out effective capabilities and store in opts.cap_eff. */ if (set_opts_cap_eff()) goto out; /* * No matter if running as root or not. CRIU always needs * at least these capabilities. */ if (!has_cap_checkpoint_restore(opts.cap_eff)) goto out; /* For some things we need to know if we are running as root. */ opts.uid = geteuid(); if (!opts.uid) { /* CRIU is running as root. No further checks are necessary. 
*/ return 0; } if (!opts.unprivileged) { pr_msg("Running as non-root requires '--unprivileged'\n"); pr_msg("Please consult the documentation for limitations when running as non-root\n"); return -1; } /* * At his point we know we are running as non-root with the necessary * capabilities available. Now we have to make the process dumpable * so that /proc/self is not owned by root. */ if (pr_set_dumpable(1)) return -1; return 0; out: pr_msg("CRIU needs to have the CAP_SYS_ADMIN or the CAP_CHECKPOINT_RESTORE capability: \n"); pr_msg("setcap cap_checkpoint_restore+eip %s\n", opts.argv_0); return -1; } crac-criu-1.5.0/criu/cr-dedup.c000066400000000000000000000034221471504326700162100ustar00rootroot00000000000000#include #include #include #include #include "int.h" #include "crtools.h" #include "pagemap.h" #include "restorer.h" static int cr_dedup_one_pagemap(unsigned long img_id, int flags); int cr_dedup(void) { int close_ret, ret = 0; unsigned long img_id; DIR *dirp; struct dirent *ent; dirp = opendir(CR_PARENT_LINK); if (dirp == NULL) { pr_perror("Can't enter previous snapshot folder"); ret = -1; goto err; } while (1) { errno = 0; ent = readdir(dirp); if (ent == NULL) { if (errno) { pr_perror("Failed readdir"); ret = -1; goto err; } break; } ret = sscanf(ent->d_name, "pagemap-%lu.img", &img_id); if (ret == 1) { pr_info("pid=%lu\n", img_id); ret = cr_dedup_one_pagemap(img_id, PR_TASK); if (ret < 0) break; } ret = sscanf(ent->d_name, "pagemap-shmem-%lu.img", &img_id); if (ret == 1) { pr_info("shmid=%lu\n", img_id); ret = cr_dedup_one_pagemap(img_id, PR_SHMEM); if (ret < 0) break; } } err: if (dirp) { close_ret = closedir(dirp); if (close_ret == -1) return close_ret; } if (ret < 0) return ret; pr_info("Deduplicated\n"); return 0; } static int cr_dedup_one_pagemap(unsigned long img_id, int flags) { int ret; struct page_read pr; struct page_read *prp; flags |= PR_MOD; ret = open_page_read(img_id, &pr, flags); if (ret <= 0) return -1; prp = pr.parent; if (!prp) goto exit; 
while (1) { ret = pr.advance(&pr); if (ret <= 0) goto exit; pr_debug("dedup iovec base=%" PRIx64 ", len=%lu\n", pr.pe->vaddr, pagemap_len(pr.pe)); if (!pagemap_in_parent(pr.pe)) { ret = dedup_one_iovec(prp, pr.pe->vaddr, pagemap_len(pr.pe)); if (ret) goto exit; } } exit: pr.close(&pr); if (ret < 0) return ret; return 0; } crac-criu-1.5.0/criu/cr-dump.c000066400000000000000000001466441471504326700160720ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "types.h" #include "protobuf.h" #include "images/fdinfo.pb-c.h" #include "images/fs.pb-c.h" #include "images/mm.pb-c.h" #include "images/creds.pb-c.h" #include "images/core.pb-c.h" #include "images/file-lock.pb-c.h" #include "images/rlimit.pb-c.h" #include "images/siginfo.pb-c.h" #include "common/list.h" #include "imgset.h" #include "file-ids.h" #include "kcmp-ids.h" #include "common/compiler.h" #include "crtools.h" #include "cr_options.h" #include "servicefd.h" #include "string.h" #include "ptrace-compat.h" #include "util.h" #include "namespaces.h" #include "image.h" #include "proc_parse.h" #include "parasite.h" #include "parasite-syscall.h" #include "compel/ptrace.h" #include "files.h" #include "files-reg.h" #include "shmem.h" #include "sk-inet.h" #include "pstree.h" #include "mount.h" #include "tty.h" #include "net.h" #include "sk-packet.h" #include "cpu.h" #include "elf.h" #include "cgroup.h" #include "cgroup-props.h" #include "file-lock.h" #include "page-xfer.h" #include "kerndat.h" #include "stats.h" #include "mem.h" #include "page-pipe.h" #include "posix-timer.h" #include "vdso.h" #include "vma.h" #include "cr-service.h" #include "plugin.h" #include "irmap.h" #include "sysfs_parse.h" #include "action-scripts.h" #include "aio.h" #include "lsm.h" #include "seccomp.h" #include "seize.h" #include "fault-injection.h" #include "dump.h" #include "eventpoll.h" #include "memfd.h" #include 
"timens.h" #include "img-streamer.h" #include "pidfd-store.h" #include "apparmor.h" #include "asm/dump.h" #include "pages-compress.h" /* * Architectures can overwrite this function to restore register sets that * are not covered by ptrace_set/get_regs(). * * with_threads = false: Only the register sets of the tasks are restored * with_threads = true : The register sets of the tasks with all their threads * are restored */ int __attribute__((weak)) arch_set_thread_regs(struct pstree_item *item, bool with_threads) { return 0; } #define PERSONALITY_LENGTH 9 static char loc_buf[PERSONALITY_LENGTH]; void free_mappings(struct vm_area_list *vma_area_list) { struct vma_area *vma_area, *p; list_for_each_entry_safe(vma_area, p, &vma_area_list->h, list) { if (!vma_area->file_borrowed) free(vma_area->vmst); free(vma_area); } vm_area_list_init(vma_area_list); } int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t dump_file) { int ret = -1; pr_info("\n"); pr_info("Collecting mappings (pid: %d)\n", pid); pr_info("----------------------------------------\n"); ret = parse_smaps(pid, vma_area_list, dump_file); if (ret < 0) goto err; pr_info("Collected, longest area occupies %lu pages\n", vma_area_list->nr_priv_pages_longest); pr_info_vma_list(&vma_area_list->h); pr_info("----------------------------------------\n"); err: return ret; } static int dump_sched_info(int pid, ThreadCoreEntry *tc) { int ret; struct sched_param sp; BUILD_BUG_ON(SCHED_OTHER != 0); /* default in proto message */ /* * In musl-libc sched_getscheduler and sched_getparam don't call * syscalls and instead the always return -ENOSYS */ ret = syscall(__NR_sched_getscheduler, pid); if (ret < 0) { pr_perror("Can't get sched policy for %d", pid); return -1; } pr_info("%d has %d sched policy\n", pid, ret); tc->has_sched_policy = true; tc->sched_policy = ret; if ((ret == SCHED_RR) || (ret == SCHED_FIFO)) { ret = syscall(__NR_sched_getparam, pid, &sp); if (ret < 0) { pr_perror("Can't get sched 
param for %d", pid); return -1; } pr_info("\tdumping %d prio for %d\n", sp.sched_priority, pid); tc->has_sched_prio = true; tc->sched_prio = sp.sched_priority; } /* * The nice is ignored for RT sched policies, but is stored * in kernel. Thus we have to take it with us in the image. */ errno = 0; ret = getpriority(PRIO_PROCESS, pid); if (ret == -1 && errno) { pr_perror("Can't get nice for %d ret %d", pid, ret); return -1; } pr_info("\tdumping %d nice for %d\n", ret, pid); tc->has_sched_nice = true; tc->sched_nice = ret; return 0; } static int check_thread_rseq(pid_t tid, const struct parasite_check_rseq *ti_rseq) { if (!kdat.has_rseq || kdat.has_ptrace_get_rseq_conf) return 0; pr_debug("%d has rseq_inited = %d\n", tid, ti_rseq->rseq_inited); /* * We have no kdat.has_ptrace_get_rseq_conf and user * process has rseq() used, let's fail dump. */ if (ti_rseq->rseq_inited) { pr_err("%d has rseq but kernel lacks get_rseq_conf feature\n", tid); return -1; } return 0; } struct cr_imgset *glob_imgset; static int collect_fds(pid_t pid, struct parasite_drain_fd **dfds) { struct dirent *de; DIR *fd_dir; int size = 0; int n; pr_info("\n"); pr_info("Collecting fds (pid: %d)\n", pid); pr_info("----------------------------------------\n"); fd_dir = opendir_proc(pid, "fd"); if (!fd_dir) return -1; n = 0; while ((de = readdir(fd_dir))) { if (dir_dots(de)) continue; if (sizeof(struct parasite_drain_fd) + sizeof(int) * (n + 1) > size) { struct parasite_drain_fd *t; size += PAGE_SIZE; t = xrealloc(*dfds, size); if (!t) { closedir(fd_dir); return -1; } *dfds = t; } (*dfds)->fds[n++] = atoi(de->d_name); } (*dfds)->nr_fds = n; pr_info("Found %d file descriptors\n", n); pr_info("----------------------------------------\n"); closedir(fd_dir); return 0; } static int fill_fd_params_special(int fd, struct fd_parms *p) { *p = FD_PARMS_INIT; if (fstat(fd, &p->stat) < 0) { pr_perror("Can't fstat exe link"); return -1; } if (get_fd_mntid(fd, &p->mnt_id)) return -1; return 0; } static long 
get_fs_type(int lfd) { struct statfs fst; if (fstatfs(lfd, &fst)) { pr_perror("Unable to statfs fd %d", lfd); return -1; } return fst.f_type; } static int dump_one_reg_file_cond(int lfd, u32 *id, struct fd_parms *parms) { if (fd_id_generate_special(parms, id)) { parms->fs_type = get_fs_type(lfd); if (parms->fs_type < 0) return -1; return dump_one_reg_file(lfd, *id, parms); } return 0; } static int dump_task_exe_link(pid_t pid, MmEntry *mm) { struct fd_parms params; int fd, ret = 0; fd = open_proc_path(pid, "exe"); if (fd < 0) return -1; if (fill_fd_params_special(fd, ¶ms)) return -1; ret = dump_one_reg_file_cond(fd, &mm->exe_file_id, ¶ms); close(fd); return ret; } static int dump_task_fs(pid_t pid, struct parasite_dump_misc *misc, struct cr_imgset *imgset) { struct fd_parms p; FsEntry fe = FS_ENTRY__INIT; int fd, ret; fe.has_umask = true; fe.umask = misc->umask; fd = open_proc_path(pid, "cwd"); if (fd < 0) return -1; if (fill_fd_params_special(fd, &p)) return -1; ret = dump_one_reg_file_cond(fd, &fe.cwd_id, &p); if (ret < 0) return ret; close(fd); fd = open_proc_path(pid, "root"); if (fd < 0) return -1; if (fill_fd_params_special(fd, &p)) return -1; ret = dump_one_reg_file_cond(fd, &fe.root_id, &p); if (ret < 0) return ret; close(fd); pr_info("Dumping task cwd id %#x root id %#x\n", fe.cwd_id, fe.root_id); return pb_write_one(img_from_set(imgset, CR_FD_FS), &fe, PB_FS); } static inline rlim_t encode_rlim(rlim_t val) { return val == RLIM_INFINITY ? 
/*
 * Fetch all resource limits of @pid via prlimit64() and store the
 * encoded cur/max pairs into the pre-allocated @rls entries.
 */
static int dump_task_rlimits(int pid, TaskRlimitsEntry *rls)
{
	int res;

	for (res = 0; res < rls->n_rlimits; res++) {
		struct rlimit64 lim;

		/* NULL new_limit: read-only query of limit number @res */
		if (syscall(__NR_prlimit64, pid, res, NULL, &lim)) {
			pr_perror("Can't get rlimit %d", res);
			return -1;
		}

		rls->rlimits[res]->cur = encode_rlim(lim.rlim_cur);
		rls->rlimits[res]->max = encode_rlim(lim.rlim_max);
	}

	return 0;
}

/*
 * Dump loginuid (only when the kernel supports it) and oom_score_adj
 * of @pid into the task core entry.
 */
static int dump_pid_misc(pid_t pid, TaskCoreEntry *tc)
{
	int ret;

	if (kdat.luid != LUID_NONE) {
		pr_info("dumping /proc/%d/loginuid\n", pid);

		tc->has_loginuid = true;
		tc->loginuid = parse_pid_loginuid(pid, &ret, false);
		tc->loginuid = userns_uid(tc->loginuid);
		/*
		 * loginuid dumping is critical, as if not correctly
		 * restored, you may loss ability to login via SSH to CT
		 */
		if (ret < 0)
			return ret;
	} else {
		tc->has_loginuid = false;
	}

	pr_info("dumping /proc/%d/oom_score_adj\n", pid);

	tc->oom_score_adj = parse_pid_oom_score_adj(pid, &ret);
	/*
	 * oom_score_adj dumping is not very critical, as it will affect
	 * on victim in OOM situation and one will find dumping error in log
	 */
	if (ret < 0)
		tc->has_oom_score_adj = false;
	else
		tc->has_oom_score_adj = true;

	return 0;
}

/*
 * Dump the file backing a mapped VMA (regular file or memfd) and record
 * its image id in vma->shmid.
 */
static int dump_filemap(struct vma_area *vma_area, int fd)
{
	struct fd_parms p = FD_PARMS_INIT;
	VmaEntry *vma = vma_area->e;
	int ret = 0;
	u32 id;

	BUG_ON(!vma_area->vmst);
	p.stat = *vma_area->vmst;
	p.mnt_id = vma_area->mnt_id;

	/*
	 * AUFS support to compensate for the kernel bug
	 * exposing branch pathnames in map_files.
	 *
	 * If the link found in vma_get_mapfile() pointed
	 * inside a branch, we should use the pathname
	 * from root that was saved in vma_area->aufs_rpath.
	 */
	if (vma_area->aufs_rpath) {
		struct fd_link aufs_link;

		__strlcpy(aufs_link.name, vma_area->aufs_rpath, sizeof(aufs_link.name));
		aufs_link.len = strlen(aufs_link.name);
		p.link = &aufs_link;
	}

	/* Flags will be set during restore in open_filmap() */

	if (vma->status & VMA_AREA_MEMFD)
		ret = dump_one_memfd_cond(fd, &id, &p);
	else
		ret = dump_one_reg_file_cond(fd, &id, &p);

	vma->shmid = id;
	return ret;
}

/*
 * SysV IPC shared memory can only be dumped when the IPC namespace is
 * part of the dump; otherwise the segment can't be restored.
 */
static int check_sysvipc_map_dump(pid_t pid, VmaEntry *vma)
{
	if (root_ns_mask & CLONE_NEWIPC)
		return 0;

	pr_err("Task %d with SysVIPC shmem map @%" PRIx64 " doesn't live in IPC ns\n", pid, vma->start);
	return -1;
}

/*
 * Read the auxiliary vector from /proc/<pid>/auxv into mm->mm_saved_auxv
 * (caller has already allocated the array).
 */
static int get_task_auxv(pid_t pid, MmEntry *mm)
{
	auxv_t mm_saved_auxv[AT_VECTOR_SIZE];
	int fd, i, ret;

	pr_info("Obtaining task auvx ...\n");

	fd = open_proc(pid, "auxv");
	if (fd < 0)
		return -1;

	ret = read(fd, mm_saved_auxv, sizeof(mm_saved_auxv));
	if (ret < 0) {
		ret = -1;
		pr_perror("Error reading %d's auxv", pid);
		goto err;
	} else {
		/* The vector may be shorter than AT_VECTOR_SIZE entries */
		mm->n_mm_saved_auxv = ret / sizeof(auxv_t);
		for (i = 0; i < mm->n_mm_saved_auxv; i++)
			mm->mm_saved_auxv[i] = (u64)mm_saved_auxv[i];
	}

	ret = 0;
err:
	close_safe(&fd);
	return ret;
}

/*
 * Serialize the memory map of @pid into the MM image: per-VMA entries
 * (with SysVIPC/socket/AIO-specific handling), code/data/stack/brk
 * boundaries from proc stat, auxv and the exe link.
 */
static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat, const struct parasite_dump_misc *misc,
			const struct vm_area_list *vma_area_list, const struct cr_imgset *imgset)
{
	MmEntry mme = MM_ENTRY__INIT;
	struct vma_area *vma_area;
	int ret = -1, i = 0;

	pr_info("\n");
	pr_info("Dumping mm (pid: %d)\n", pid);
	pr_info("----------------------------------------\n");

	mme.n_vmas = vma_area_list->nr;
	mme.vmas = xmalloc(mme.n_vmas * sizeof(VmaEntry *));
	if (!mme.vmas)
		return -1;

	list_for_each_entry(vma_area, &vma_area_list->h, list) {
		VmaEntry *vma = vma_area->e;

		pr_info_vma(vma_area);

		if (!vma_entry_is(vma, VMA_AREA_REGULAR))
			ret = 0;
		else if (vma_entry_is(vma, VMA_AREA_SYSVIPC))
			ret = check_sysvipc_map_dump(pid, vma);
		else if (vma_entry_is(vma, VMA_AREA_SOCKET))
			ret = dump_socket_map(vma_area);
		else
			ret = 0;
		if (ret)
			goto err;

		mme.vmas[i++] = vma;

		if (vma_entry_is(vma, VMA_AREA_AIORING)) {
			ret = dump_aio_ring(&mme, vma_area);
			if (ret)
				goto err;
		}
	}

	mme.mm_start_code = stat->start_code;
	mme.mm_end_code = stat->end_code;
	mme.mm_start_data = stat->start_data;
	mme.mm_end_data = stat->end_data;
	mme.mm_start_stack = stat->start_stack;
	mme.mm_start_brk = stat->start_brk;

	mme.mm_arg_start = stat->arg_start;
	mme.mm_arg_end = stat->arg_end;
	mme.mm_env_start = stat->env_start;
	mme.mm_env_end = stat->env_end;

	/* brk comes from the parasite, not proc, to be race-free */
	mme.mm_brk = misc->brk;

	mme.dumpable = misc->dumpable;
	mme.has_dumpable = true;

	mme.thp_disabled = misc->thp_disabled;
	mme.has_thp_disabled = true;

	mme.n_mm_saved_auxv = AT_VECTOR_SIZE;
	mme.mm_saved_auxv = xmalloc(pb_repeated_size(&mme, mm_saved_auxv));
	if (!mme.mm_saved_auxv)
		goto err;

	if (get_task_auxv(pid, &mme))
		goto err;

	if (dump_task_exe_link(pid, &mme))
		goto err;

	ret = pb_write_one(img_from_set(imgset, CR_FD_MM), &mme, PB_MM);
	xfree(mme.mm_saved_auxv);
	free_aios(&mme);
err:
	xfree(mme.vmas);
	return ret;
}

/*
 * Record the robust futex list head/length of @pid. On kernels built
 * without futex_cmpxchg the syscall pair may be missing entirely, which
 * is treated as "empty list" (see comment below).
 */
static int get_task_futex_robust_list(pid_t pid, ThreadCoreEntry *info)
{
	struct robust_list_head *head = NULL;
	size_t len = 0;
	int ret;

	ret = syscall(SYS_get_robust_list, pid, &head, &len);
	if (ret < 0 && errno == ENOSYS) {
		/*
		 * If the kernel says get_robust_list is not implemented, then
		 * check whether set_robust_list is also not implemented, in
		 * that case we can assume it is empty, since set_robust_list
		 * is the only way to populate it. This case is possible when
		 * "futex_cmpxchg_enabled" is unset in the kernel.
		 *
		 * The following system call should always fail, even if it is
		 * implemented, in which case it will return -EINVAL because
		 * len should be greater than zero.
		 */
		ret = syscall(SYS_set_robust_list, NULL, 0);
		if (ret == 0 || (ret < 0 && errno != ENOSYS))
			goto err;

		head = NULL;
		len = 0;
	} else if (ret) {
		goto err;
	}

	info->futex_rla = encode_pointer(head);
	info->futex_rla_len = (u32)len;

	return 0;

err:
	pr_err("Failed obtaining futex robust list on %d\n", pid);
	return -1;
}
*/ ret = syscall(SYS_set_robust_list, NULL, 0); if (ret == 0 || (ret < 0 && errno != ENOSYS)) goto err; head = NULL; len = 0; } else if (ret) { goto err; } info->futex_rla = encode_pointer(head); info->futex_rla_len = (u32)len; return 0; err: pr_err("Failed obtaining futex robust list on %d\n", pid); return -1; } static int get_task_personality(pid_t pid, u32 *personality) { int fd, ret = -1; pr_info("Obtaining personality ... \n"); fd = open_proc(pid, "personality"); if (fd < 0) goto err; ret = read(fd, loc_buf, sizeof(loc_buf) - 1); close(fd); if (ret >= 0) { loc_buf[ret] = '\0'; *personality = atoi(loc_buf); } err: return ret; } static DECLARE_KCMP_TREE(vm_tree, KCMP_VM); static DECLARE_KCMP_TREE(fs_tree, KCMP_FS); static DECLARE_KCMP_TREE(files_tree, KCMP_FILES); static DECLARE_KCMP_TREE(sighand_tree, KCMP_SIGHAND); static int dump_task_kobj_ids(struct pstree_item *item) { int new; struct kid_elem elem; int pid = item->pid->real; TaskKobjIdsEntry *ids = item->ids; elem.pid = pid; elem.idx = 0; /* really 0 for all */ elem.genid = 0; /* FIXME optimize */ new = 0; ids->vm_id = kid_generate_gen(&vm_tree, &elem, &new); if (!ids->vm_id || !new) { pr_err("Can't make VM id for %d\n", pid); return -1; } new = 0; ids->fs_id = kid_generate_gen(&fs_tree, &elem, &new); if (!ids->fs_id || !new) { pr_err("Can't make FS id for %d\n", pid); return -1; } new = 0; ids->files_id = kid_generate_gen(&files_tree, &elem, &new); if (!ids->files_id || (!new && !shared_fdtable(item))) { pr_err("Can't make FILES id for %d\n", pid); return -1; } new = 0; ids->sighand_id = kid_generate_gen(&sighand_tree, &elem, &new); if (!ids->sighand_id || !new) { pr_err("Can't make IO id for %d\n", pid); return -1; } return 0; } int get_task_ids(struct pstree_item *item) { int ret; item->ids = xmalloc(sizeof(*item->ids)); if (!item->ids) goto err; task_kobj_ids_entry__init(item->ids); if (item->pid->state != TASK_DEAD) { ret = dump_task_kobj_ids(item); if (ret) goto err_free; ret = 
dump_task_ns_ids(item); if (ret) goto err_free; } return 0; err_free: xfree(item->ids); item->ids = NULL; err: return -1; } static int dump_task_ids(struct pstree_item *item, const struct cr_imgset *cr_imgset) { return pb_write_one(img_from_set(cr_imgset, CR_FD_IDS), item->ids, PB_IDS); } int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *ti) { int ret; ThreadCoreEntry *tc = core->thread_core; /* * XXX: It's possible to set two: 32-bit and 64-bit * futex list's heads. That makes about no sense, but * it's possible. Until we meet such application, dump * only one: native or compat futex's list pointer. */ if (!core_is_compat(core)) ret = get_task_futex_robust_list(pid, tc); else ret = get_task_futex_robust_list_compat(pid, tc); if (!ret) ret = dump_sched_info(pid, tc); if (!ret) { core_put_tls(core, ti->tls); CORE_THREAD_ARCH_INFO(core)->clear_tid_addr = encode_pointer(ti->tid_addr); BUG_ON(!tc->sas); copy_sas(tc->sas, &ti->sas); if (ti->pdeath_sig) { tc->has_pdeath_sig = true; tc->pdeath_sig = ti->pdeath_sig; } tc->comm = xstrdup(ti->comm); if (tc->comm == NULL) return -1; } if (!ret) ret = seccomp_dump_thread(pid, tc); /* * We are dumping rseq() in the dump_thread_rseq() function, * *before* processes gets infected (because of ptrace requests * API restriction). At this point, if the kernel lacks * kdat.has_ptrace_get_rseq_conf support we have to ensure * that dumpable processes haven't initialized rseq() or * fail dump if rseq() was used. 
*/ if (!ret) ret = check_thread_rseq(pid, &ti->rseq); return ret; } static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item, const struct proc_pid_stat *stat, const struct cr_imgset *cr_imgset, const struct parasite_dump_misc *misc) { struct cr_img *img; CoreEntry *core = item->core[0]; pid_t pid = item->pid->real; int ret = -1; struct parasite_dump_cgroup_args cgroup_args, *info = NULL; u32 *cg_set; BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN); pr_info("\n"); pr_info("Dumping core (pid: %d)\n", pid); pr_info("----------------------------------------\n"); core->tc->child_subreaper = misc->child_subreaper; core->tc->has_child_subreaper = true; if (misc->membarrier_registration_mask) { core->tc->membarrier_registration_mask = misc->membarrier_registration_mask; core->tc->has_membarrier_registration_mask = true; } ret = get_task_personality(pid, &core->tc->personality); if (ret < 0) goto err; __strlcpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN); core->tc->flags = stat->flags; core->tc->task_state = item->pid->state; core->tc->exit_code = 0; core->thread_core->creds->lsm_profile = dmpi(item)->thread_lsms[0]->profile; core->thread_core->creds->lsm_sockcreate = dmpi(item)->thread_lsms[0]->sockcreate; if (core->tc->task_state == TASK_STOPPED) { core->tc->has_stop_signo = true; core->tc->stop_signo = item->pid->stop_signo; } ret = parasite_dump_thread_leader_seized(ctl, pid, core); if (ret) goto err; ret = dump_pid_misc(pid, core->tc); if (ret) goto err; ret = dump_task_rlimits(pid, core->tc->rlimits); if (ret) goto err; /* For now, we only need to dump the root task's cgroup ns, because we * know all the tasks are in the same cgroup namespace because we don't * allow nesting. 
/*
 * Collect namespace ids for all alive tasks before a pre-dump; a fake
 * pstree item stands in for the criu process itself.
 */
static int collect_pstree_ids_predump(void)
{
	struct pstree_item *item;
	struct pid pid;
	struct {
		struct pstree_item i;
		struct dmp_info d;
	} crt = {
		.i.pid = &pid,
	};

	/*
	 * This thing is normally done inside
	 * write_img_inventory().
	 */
	crt.i.pid->state = TASK_ALIVE;
	crt.i.pid->real = getpid();

	if (predump_task_ns_ids(&crt.i))
		return -1;

	for_each_pstree_item(item) {
		if (item->pid->state == TASK_DEAD)
			continue;

		if (predump_task_ns_ids(item))
			return -1;
	}

	return 0;
}

/* Generate kernel-object and namespace ids for every pstree item. */
int collect_pstree_ids(void)
{
	struct pstree_item *item;

	for_each_pstree_item(item)
		if (get_task_ids(item))
			return -1;

	return 0;
}

/* Thin wrapper: gather POSIX file locks from /proc. */
static int collect_file_locks(void)
{
	return parse_file_locks();
}

/* True when @addr lies inside the rseq critical section described by @rseq_cs. */
static bool task_in_rseq(struct criu_rseq_cs *rseq_cs, uint64_t addr)
{
	return addr >= rseq_cs->start_ip && addr < rseq_cs->start_ip + rseq_cs->post_commit_offset;
}

/*
 * If thread @i of @item was seized inside an rseq critical section,
 * move its instruction pointer to the rseq abort handler (both in the
 * CoreEntry and in the compel register storage).
 */
static int fixup_thread_rseq(const struct pstree_item *item, int i)
{
	CoreEntry *core = item->core[i];
	struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i];
	pid_t tid = item->threads[i].real;

	if (!kdat.has_ptrace_get_rseq_conf)
		return 0;

	/* equivalent to (struct rseq)->rseq_cs is NULL */
	if (!rseq_cs->start_ip)
		return 0;

	pr_debug(
		"fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n",
		tid, rseq_cs->start_ip, rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags,
		rseq_cs->version, (unsigned long)TI_IP(core));

	if (rseq_cs->version != 0) {
		pr_err("unsupported RSEQ ABI version = %d\n", rseq_cs->version);
		return -1;
	}

	if (task_in_rseq(rseq_cs, TI_IP(core))) {
		/* NOTE(review): this inner 'tid' (struct pid *) shadows the
		 * outer pid_t 'tid' above — intentional but easy to misread. */
		struct pid *tid = &item->threads[i];

		/*
		 * We need to fixup task instruction pointer from
		 * the original one (which lays inside rseq critical section)
		 * to rseq abort handler address. But we need to look on rseq_cs->flags
		 * (please refer to struct rseq -> flags field description).
		 * Naive idea of flags support may be like... let's change instruction pointer (IP)
		 * to rseq_cs->abort_ip if !(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL).
		 * But unfortunately, it doesn't work properly, because the kernel does
		 * clean up of rseq_cs field in the struct rseq (modifies userspace memory).
		 * So, we need to preserve original value of (struct rseq)->rseq_cs field in the
		 * image and restore it's value before releasing threads (see restore_rseq_cs()).
		 *
		 * It's worth to mention that we need to fixup IP in CoreEntry
		 * (used when full dump/restore is performed) and also in
		 * the parasite regs storage (used if --leave-running option is used,
		 * or if dump error occurred and process execution is resumed).
		 */
		if (!(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL)) {
			pr_warn("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n",
				tid->real);

			TI_IP(core) = rseq_cs->abort_ip;

			if (item->pid->real == tid->real) {
				compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip);
			} else {
				compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip);
			}
		}
	}

	return 0;
}

/*
 * Dump one non-leader thread: registers and thread state via the
 * parasite, LSM labels, rseq fixup, then write its CORE image.
 */
static int dump_task_thread(struct parasite_ctl *parasite_ctl, const struct pstree_item *item, int id)
{
	struct parasite_thread_ctl *tctl = dmpi(item)->thread_ctls[id];
	struct pid *tid = &item->threads[id];
	CoreEntry *core = item->core[id];
	pid_t pid = tid->real;
	int ret = -1;
	struct cr_img *img;

	pr_info("\n");
	pr_info("Dumping core for thread (pid: %d)\n", pid);
	pr_info("----------------------------------------\n");

	ret = parasite_dump_thread_seized(tctl, parasite_ctl, id, tid, core);
	if (ret) {
		pr_err("Can't dump thread for pid %d\n", pid);
		goto err;
	}
	pstree_insert_pid(tid);

	core->thread_core->creds->lsm_profile = dmpi(item)->thread_lsms[id]->profile;
	/* NOTE(review): sockcreate uses index 0 while profile uses [id] —
	 * looks like a per-process value, but confirm against thread_lsms
	 * collection; if per-thread, this should be [id]. */
	core->thread_core->creds->lsm_sockcreate = dmpi(item)->thread_lsms[0]->sockcreate;

	ret = fixup_thread_rseq(item, id);
	if (ret) {
		pr_err("Can't fixup rseq for pid %d\n", pid);
		goto err;
	}

	img = open_image(CR_FD_CORE, O_DUMP, tid->ns[0].virt);
	if (!img)
		goto err;

	ret = pb_write_one(img, core, PB_CORE);

	close_image(img);
err:
	compel_release_thread(tctl);
	pr_info("----------------------------------------\n");
	return ret;
}

/*
 * Write a minimal CORE image for a zombie: comm, TASK_DEAD state and
 * the exit code are all that can (and need to) be saved.
 */
static int dump_one_zombie(const struct pstree_item *item, const struct proc_pid_stat *pps)
{
	CoreEntry *core;
	int ret = -1;
	struct cr_img *img;

	core = core_entry_alloc(0, 1);
	if (!core)
		return -1;

	__strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN);
	core->tc->task_state = TASK_DEAD;
	core->tc->exit_code = pps->exit_code;

	img = open_image(CR_FD_CORE, O_DUMP, vpid(item));
	if (!img)
		goto err;

	ret = pb_write_one(img, core, PB_CORE);
	close_image(img);
err:
	core_entry_free(core);
	return ret;
}

/* Number of siginfos fetched per PTRACE_PEEKSIGINFO call. */
#define SI_BATCH 32

/*
 * Drain the pending signal queue of @tid (shared queue when @group)
 * via PTRACE_PEEKSIGINFO into a freshly allocated SignalQueueEntry.
 * EIO from ptrace (no kernel support) is treated as an empty queue.
 */
static int dump_signal_queue(pid_t tid, SignalQueueEntry **sqe, bool group)
{
	struct ptrace_peeksiginfo_args arg;
	int ret;
	SignalQueueEntry *queue = NULL;

	pr_debug("Dump %s signals of %d\n", group ? "shared" : "private", tid);

	arg.nr = SI_BATCH;
	arg.flags = 0;
	if (group)
		arg.flags |= PTRACE_PEEKSIGINFO_SHARED;
	arg.off = 0;

	queue = xmalloc(sizeof(*queue));
	if (!queue)
		return -1;

	signal_queue_entry__init(queue);

	while (1) {
		int nr, si_pos;
		siginfo_t *si;

		si = xmalloc(SI_BATCH * sizeof(*si));
		if (!si) {
			ret = -1;
			break;
		}

		nr = ret = ptrace(PTRACE_PEEKSIGINFO, tid, &arg, si);
		if (ret == 0) {
			xfree(si);
			break; /* Finished */
		}

		if (ret < 0) {
			if (errno == EIO) {
				pr_warn("ptrace doesn't support PTRACE_PEEKSIGINFO\n");
				ret = 0;
			} else
				pr_perror("ptrace");

			xfree(si);
			break;
		}

		queue->n_signals += nr;
		queue->signals = xrealloc(queue->signals, sizeof(*queue->signals) * queue->n_signals);
		if (!queue->signals) {
			ret = -1;
			xfree(si);
			break;
		}

		for (si_pos = queue->n_signals - nr; si_pos < queue->n_signals; si_pos++) {
			SiginfoEntry *se;

			se = xmalloc(sizeof(*se));
			if (!se) {
				ret = -1;
				break;
			}

			siginfo_entry__init(se);
			se->siginfo.len = sizeof(siginfo_t);
			se->siginfo.data = (void *)si++; /* XXX we don't free cores, but when
							  * we will, this would cause problems
							  */
			queue->signals[si_pos] = se;
		}

		if (ret < 0)
			break;

		arg.off += nr;
	}

	*sqe = queue;
	return ret;
}
/*
 * Dump the private pending-signal queue of every thread of @item plus
 * the process-shared queue.
 */
static int dump_task_signals(pid_t pid, struct pstree_item *item)
{
	int i, ret;

	/* Dump private signals for each thread */
	for (i = 0; i < item->nr_threads; i++) {
		ret = dump_signal_queue(item->threads[i].real, &item->core[i]->thread_core->signals_p, false);
		if (ret) {
			pr_err("Can't dump private signals for thread %d\n", item->threads[i].real);
			return -1;
		}
	}

	/* Dump shared signals */
	ret = dump_signal_queue(pid, &item->core[0]->tc->signals_s, true);
	if (ret) {
		pr_err("Can't dump shared signals (pid: %d)\n", pid);
		return -1;
	}

	return 0;
}

/*
 * Read the victim's struct rseq and, if it points at one, the struct
 * rseq_cs it references, using ptrace peeks (not the parasite — see the
 * comment below for why).
 */
static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, struct criu_rseq_cs *rseq_cs,
			struct criu_rseq *rseq)
{
	int ret;

	/* rseq is not registered */
	if (!rseqc->rseq_abi_pointer)
		return 0;

	/*
	 * We need to cover the case when victim process was inside rseq critical section
	 * at the moment when CRIU comes and seized it. We need to determine the borders
	 * of rseq critical section at first. To achieve that we need to access thread
	 * memory and read pointer to struct rseq_cs.
	 *
	 * We have two ways to access thread memory: from the parasite and using ptrace().
	 * But it this case we can't use parasite, because if victim process returns to the
	 * execution, on the kernel side __rseq_handle_notify_resume hook will be called,
	 * then rseq_ip_fixup() -> clear_rseq_cs() and user space memory with struct rseq
	 * will be cleared. So, let's use ptrace(PTRACE_PEEKDATA).
	 */
	ret = ptrace_peek_area(tid, rseq, decode_pointer(rseqc->rseq_abi_pointer), sizeof(struct criu_rseq));
	if (ret) {
		pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq struct\n", tid, (unsigned long)rseq,
		       (unsigned long)(rseqc->rseq_abi_pointer), (unsigned long)sizeof(uint64_t));
		return -1;
	}

	if (!rseq->rseq_cs)
		return 0;

	ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs), sizeof(struct criu_rseq_cs));
	if (ret) {
		pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid,
		       (unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs, (unsigned long)sizeof(struct criu_rseq_cs));
		return -1;
	}

	return 0;
}

/*
 * Dump the rseq registration of thread @i via
 * ptrace(PTRACE_GET_RSEQ_CONFIGURATION) and stash its rseq_cs (if any)
 * for the later IP fixup.
 */
static int dump_thread_rseq(struct pstree_item *item, int i)
{
	struct __ptrace_rseq_configuration rseqc;
	RseqEntry *rseqe = NULL;
	int ret;
	CoreEntry *core = item->core[i];
	RseqEntry **rseqep = &core->thread_core->rseq_entry;
	struct criu_rseq rseq = {};
	struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i];
	pid_t tid = item->threads[i].real;

	/*
	 * If we are here it means that rseq() syscall is supported,
	 * but ptrace(PTRACE_GET_RSEQ_CONFIGURATION) isn't supported,
	 * we can just fail dump here. But this is bad idea, IMHO.
	 *
	 * So, we will try to detect if victim process was used rseq().
	 * See check_rseq() and check_thread_rseq() functions.
	 */
	if (!kdat.has_ptrace_get_rseq_conf)
		return 0;

	ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseqc), &rseqc);
	if (ret != sizeof(rseqc)) {
		pr_perror("ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) = %d", tid, ret);
		return -1;
	}

	if (rseqc.flags != 0) {
		pr_err("something wrong with ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) flags = 0x%x\n", tid,
		       rseqc.flags);
		return -1;
	}

	pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseqc.rseq_abi_pointer,
		rseqc.signature);

	rseqe = xmalloc(sizeof(*rseqe));
	if (!rseqe)
		return -1;

	rseq_entry__init(rseqe);

	rseqe->rseq_abi_pointer = rseqc.rseq_abi_pointer;
	rseqe->rseq_abi_size = rseqc.rseq_abi_size;
	rseqe->signature = rseqc.signature;

	if (read_rseq_cs(tid, &rseqc, rseq_cs, &rseq))
		goto err;

	/* we won't save rseq_cs to the image (only pointer),
	 * so let's combine flags from both struct rseq and struct rseq_cs
	 * (kernel does the same when interpreting RSEQ_CS_FLAG_*)
	 */
	rseq_cs->flags |= rseq.flags;

	if (rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) {
		rseqe->has_rseq_cs_pointer = true;
		rseqe->rseq_cs_pointer = rseq.rseq_cs;
	}

	/* save rseq entry to the image */
	*rseqep = rseqe;

	return 0;

err:
	xfree(rseqe);
	return -1;
}

/*
 * Dump rseq state for every thread of @item. The per-thread rseq_cs
 * array lives in dmp_info and is freed on failure (and later by
 * dump_task_threads() on success).
 */
static int dump_task_rseq(pid_t pid, struct pstree_item *item)
{
	int i;
	struct criu_rseq_cs *thread_rseq_cs;

	/* if rseq() syscall isn't supported then nothing to dump */
	if (!kdat.has_rseq)
		return 0;

	thread_rseq_cs = xzalloc(sizeof(*thread_rseq_cs) * item->nr_threads);
	if (!thread_rseq_cs)
		return -1;

	dmpi(item)->thread_rseq_cs = thread_rseq_cs;

	for (i = 0; i < item->nr_threads; i++) {
		if (dump_thread_rseq(item, i))
			goto free_rseq;
	}

	return 0;

free_rseq:
	xfree(thread_rseq_cs);
	dmpi(item)->thread_rseq_cs = NULL;
	return -1;
}

/* Scratch buffer for /proc/<pid>/stat parsing, reused across tasks. */
static struct proc_pid_stat pps_buf;

/*
 * Dump all non-leader threads of @item (the leader was dumped in
 * dump_task_core_all()) and release the rseq_cs scratch array.
 */
static int dump_task_threads(struct parasite_ctl *parasite_ctl, const struct pstree_item *item)
{
	int i, ret = 0;

	for (i = 0; i < item->nr_threads; i++) {
		/* Leader is already dumped */
		if (item->pid->real == item->threads[i].real) {
			item->threads[i].ns[0].virt = vpid(item);
			continue;
		}
		ret = dump_task_thread(parasite_ctl, item, i);
		if (ret)
			break;
	}

	xfree(dmpi(item)->thread_rseq_cs);
	dmpi(item)->thread_rseq_cs = NULL;
	return ret;
}
/*
 * What this routine does is just reads pid-s of dead
 * tasks in item's children list from item's ns proc.
 *
 * It does *not* find which real pid corresponds to
 * which virtual one, but it's not required -- all we
 * need to dump for zombie can be found in the same
 * ns proc.
 */
static int fill_zombies_pids(struct pstree_item *item)
{
	struct pstree_item *child;
	int i, nr;
	pid_t *ch;

	/*
	 * Pids read here are virtual -- caller has set up
	 * the proc of target pid namespace.
	 */
	if (parse_children(vpid(item), &ch, &nr) < 0)
		return -1;

	/*
	 * Step 1 -- filter our ch's pid of alive tasks
	 */
	list_for_each_entry(child, &item->children, sibling) {
		if (vpid(child) < 0)
			continue;
		for (i = 0; i < nr; i++) {
			if (ch[i] == vpid(child)) {
				ch[i] = -1;
				break;
			}
		}
	}

	/*
	 * Step 2 -- assign remaining pids from ch on
	 * children's items in arbitrary order. The caller
	 * will then re-read everything needed to dump
	 * zombies using newly obtained virtual pids.
	 */
	i = 0;
	list_for_each_entry(child, &item->children, sibling) {
		if (vpid(child) > 0)
			continue;
		for (; i < nr; i++) {
			if (ch[i] < 0)
				continue;
			child->pid->ns[0].virt = ch[i];
			ch[i] = -1;
			break;
		}
		BUG_ON(i == nr);
	}

	xfree(ch);

	return 0;
}

/*
 * Dump every TASK_DEAD item: resolve virtual pids through the target
 * pid namespace's proc when needed, read stat to get sid/pgid/exit code
 * and write a minimal CORE image per zombie.
 */
static int dump_zombies(void)
{
	struct pstree_item *item;
	int ret = -1;
	int pidns = root_ns_mask & CLONE_NEWPID;

	if (pidns) {
		int fd;

		fd = get_service_fd(CR_PROC_FD_OFF);
		if (fd < 0)
			return -1;

		if (set_proc_fd(fd))
			return -1;
	}

	/*
	 * We dump zombies separately because for pid-ns case
	 * we'd have to resolve their pids w/o parasite via
	 * target ns' proc.
	 */
	for_each_pstree_item(item) {
		if (item->pid->state != TASK_DEAD)
			continue;

		if (vpid(item) < 0) {
			if (!pidns)
				item->pid->ns[0].virt = item->pid->real;
			else if (root_item == item) {
				pr_err("A root task is dead\n");
				goto err;
			} else if (fill_zombies_pids(item->parent))
				goto err;
		}

		pr_info("Obtaining zombie stat ... \n");
		if (parse_pid_stat(vpid(item), &pps_buf) < 0)
			goto err;

		item->sid = pps_buf.sid;
		item->pgid = pps_buf.pgid;

		BUG_ON(!list_empty(&item->children));

		if (!item->sid) {
			pr_err("A session leader of zombie process %d(%d) is outside of its pid namespace\n",
			       item->pid->real, vpid(item));
			goto err;
		}

		if (dump_one_zombie(item, &pps_buf) < 0)
			goto err;
	}

	ret = 0;
err:
	if (pidns)
		close_proc();

	return ret;
}

/*
 * Fill the cgroup set of every non-leader thread; the root item's
 * per-thread cgroup paths are read through the parasite.
 */
static int dump_task_cgroup(struct parasite_ctl *parasite_ctl, const struct pstree_item *item)
{
	struct parasite_dump_cgroup_args cgroup_args, *info;
	int i;

	BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN);

	for (i = 0; i < item->nr_threads; i++) {
		CoreEntry *core = item->core[i];

		/* Leader is already dumped */
		if (item->pid->real == item->threads[i].real)
			continue;

		/* For now, we only need to dump the root task's cgroup ns, because we
		 * know all the tasks are in the same cgroup namespace because we don't
		 * allow nesting.
		 */
		info = NULL;
		if (item->ids->has_cgroup_ns_id && !item->parent) {
			info = &cgroup_args;
			sprintf(cgroup_args.thread_cgrp, "self/task/%d/cgroup", item->threads[i].ns[0].virt);
			if (parasite_dump_cgroup(parasite_ctl, &cgroup_args))
				return -1;
		}

		core->thread_core->has_cg_set = true;
		if (dump_thread_cgroup(item, &core->thread_core->cg_set, info, i))
			return -1;
	}

	return 0;
}

/*
 * Pre-dump one task: infect it with the parasite, fix vdso, pre-dump
 * files and transfer its memory pages. Stopped and dead tasks are
 * skipped (returning success).
 */
static int pre_dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie)
{
	pid_t pid = item->pid->real;
	struct vm_area_list vmas;
	struct parasite_ctl *parasite_ctl;
	int ret = -1;
	struct parasite_dump_misc misc;
	struct mem_dump_ctl mdc;

	vm_area_list_init(&vmas);

	pr_info("========================================\n");
	pr_info("Pre-dumping task (pid: %d comm: %s)\n", pid, __task_comm_info(pid));
	pr_info("========================================\n");

	/*
	 * Add pidfd of task to pidfd_store if it is initialized.
	 * This pidfd will be used in the next pre-dump/dump iteration
	 * in detect_pid_reuse().
	 */
	ret = pidfd_store_add(pid);
	if (ret)
		goto err;

	if (item->pid->state == TASK_STOPPED) {
		pr_warn("Stopped tasks are not supported\n");
		return 0;
	}

	if (item->pid->state == TASK_DEAD)
		return 0;

	ret = collect_mappings(pid, &vmas, NULL);
	if (ret) {
		pr_err("Collect mappings (pid: %d) failed with %d\n", pid, ret);
		goto err;
	}

	ret = -1;
	parasite_ctl = parasite_infect_seized(pid, item, &vmas);
	if (!parasite_ctl) {
		pr_err("Can't infect (pid: %d) with parasite\n", pid);
		goto err_free;
	}

	ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas);
	if (ret) {
		pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid);
		goto err_cure;
	}

	ret = parasite_dump_misc_seized(parasite_ctl, &misc);
	if (ret) {
		pr_err("Can't dump misc (pid: %d)\n", pid);
		goto err_cure;
	}

	ret = predump_task_files(pid);
	if (ret) {
		pr_err("Pre-dumping files failed (pid: %d)\n", pid);
		goto err_cure;
	}

	item->pid->ns[0].virt = misc.pid;

	mdc.pre_dump = true;
	mdc.lazy = false;
	mdc.stat = NULL;
	mdc.parent_ie = parent_ie;

	ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl);
	if (ret)
		goto err_cure;

	/* Leave the parasite in place; cr_pre_dump_finish() cures locally */
	if (compel_cure_remote(parasite_ctl))
		pr_err("Can't cure (pid: %d) from parasite\n", pid);
err_free:
	free_mappings(&vmas);
err:
	return ret;

err_cure:
	if (compel_cure(parasite_ctl))
		pr_err("Can't cure (pid: %d) from parasite\n", pid);
	goto err_free;
}
/*
 * Dump one alive task completely: mappings, fds, timers, signals, rseq,
 * parasite infection, per-task images (core, mm, fs, ids, ...) and all
 * of its threads. Zombies are handled separately in dump_zombies().
 */
static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie)
{
	pid_t pid = item->pid->real;
	struct vm_area_list vmas;
	struct parasite_ctl *parasite_ctl;
	int ret, exit_code = -1;
	struct parasite_dump_misc misc;
	struct cr_imgset *cr_imgset = NULL;
	struct parasite_drain_fd *dfds = NULL;
	struct proc_posix_timers_stat proc_args;
	struct mem_dump_ctl mdc;

	vm_area_list_init(&vmas);

	pr_info("========================================\n");
	pr_info("Dumping task (pid: %d comm: %s)\n", pid, __task_comm_info(pid));
	pr_info("========================================\n");

	if (item->pid->state == TASK_DEAD)
		/*
		 * zombies are dumped separately in dump_zombies()
		 */
		return 0;

	pr_info("Obtaining task stat ... \n");
	ret = parse_pid_stat(pid, &pps_buf);
	if (ret < 0)
		goto err;

	ret = collect_mappings(pid, &vmas, dump_filemap);
	if (ret) {
		pr_err("Collect mappings (pid: %d) failed with %d\n", pid, ret);
		goto err;
	}

	if (!shared_fdtable(item)) {
		dfds = xmalloc(sizeof(*dfds));
		if (!dfds)
			goto err;

		ret = collect_fds(pid, &dfds);
		if (ret) {
			pr_err("Collect fds (pid: %d) failed with %d\n", pid, ret);
			goto err;
		}
		parasite_ensure_args_size(drain_fds_size(dfds));
	}

	ret = parse_posix_timers(pid, &proc_args);
	if (ret < 0) {
		pr_err("Can't read posix timers file (pid: %d)\n", pid);
		goto err;
	}
	parasite_ensure_args_size(posix_timers_dump_size(proc_args.timer_n));

	ret = dump_task_signals(pid, item);
	if (ret) {
		pr_err("Dump %d signals failed %d\n", pid, ret);
		goto err;
	}

	/* Must happen before infection (ptrace API restriction) */
	ret = dump_task_rseq(pid, item);
	if (ret) {
		pr_err("Dump %d rseq failed %d\n", pid, ret);
		goto err;
	}

	parasite_ctl = parasite_infect_seized(pid, item, &vmas);
	if (!parasite_ctl) {
		pr_err("Can't infect (pid: %d) with parasite\n", pid);
		goto err;
	}

	/* NOTE(review): failure here goes to 'err', not 'err_cure', leaving
	 * the just-injected parasite in place — confirm this is intended. */
	ret = fixup_thread_rseq(item, 0);
	if (ret) {
		pr_err("Fixup rseq for %d failed %d\n", pid, ret);
		goto err;
	}

	if (fault_injected(FI_DUMP_EARLY)) {
		pr_info("fault: CRIU sudden detach\n");
		kill(getpid(), SIGKILL);
	}

	if (root_ns_mask & CLONE_NEWPID && root_item == item) {
		int pfd;

		pfd = parasite_get_proc_fd_seized(parasite_ctl);
		if (pfd < 0) {
			pr_err("Can't get proc fd (pid: %d)\n", pid);
			goto err_cure;
		}

		if (install_service_fd(CR_PROC_FD_OFF, pfd) < 0)
			goto err_cure;
	}

	ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas);
	if (ret) {
		pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid);
		goto err_cure;
	}

	ret = parasite_collect_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */
	if (ret) {
		pr_err("Failed to check aio rings (pid: %d)\n", pid);
		goto err_cure;
	}

	ret = parasite_dump_misc_seized(parasite_ctl, &misc);
	if (ret) {
		pr_err("Can't dump misc (pid: %d)\n", pid);
		goto err_cure;
	}

	item->pid->ns[0].virt = misc.pid;
	pstree_insert_pid(item->pid);
	item->sid = misc.sid;
	item->pgid = misc.pgid;

	pr_info("sid=%d pgid=%d pid=%d\n", item->sid, item->pgid, vpid(item));

	if (item->sid == 0) {
		pr_err("A session leader of %d(%d) is outside of its pid namespace\n", item->pid->real, vpid(item));
		goto err_cure;
	}

	cr_imgset = cr_task_imgset_open(vpid(item), O_DUMP);
	if (!cr_imgset)
		goto err_cure;

	ret = dump_task_ids(item, cr_imgset);
	if (ret) {
		pr_err("Dump ids (pid: %d) failed with %d\n", pid, ret);
		goto err_cure;
	}

	if (dfds) {
		ret = dump_task_files_seized(parasite_ctl, item, dfds);
		if (ret) {
			pr_err("Dump files (pid: %d) failed with %d\n", pid, ret);
			goto err_cure;
		}
		ret = flush_eventpoll_dinfo_queue();
		if (ret) {
			pr_err("Dump eventpoll (pid: %d) failed with %d\n", pid, ret);
			goto err_cure;
		}
	}

	mdc.pre_dump = false;
	mdc.lazy = opts.lazy_pages;
	mdc.stat = &pps_buf;
	mdc.parent_ie = parent_ie;

	ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl);
	if (ret)
		goto err_cure;

	ret = parasite_dump_sigacts_seized(parasite_ctl, item);
	if (ret) {
		pr_err("Can't dump sigactions (pid: %d) with parasite\n", pid);
		goto err_cure;
	}

	ret = parasite_dump_itimers_seized(parasite_ctl, item);
	if (ret) {
		pr_err("Can't dump itimers (pid: %d)\n", pid);
		goto err_cure;
	}

	ret = parasite_dump_posix_timers_seized(&proc_args, parasite_ctl, item);
	if (ret) {
		pr_err("Can't dump posix timers (pid: %d)\n", pid);
		goto err_cure;
	}

	ret = dump_task_core_all(parasite_ctl, item, &pps_buf, cr_imgset, &misc);
	if (ret) {
		pr_err("Dump core (pid: %d) failed with %d\n", pid, ret);
		goto err_cure;
	}

	ret = dump_task_cgroup(parasite_ctl, item);
	if (ret) {
		pr_err("Dump cgroup of threads in process (pid: %d) failed with %d\n", pid, ret);
		goto err_cure;
	}

	ret = compel_stop_daemon(parasite_ctl);
	if (ret) {
		pr_err("Can't stop daemon in parasite (pid: %d)\n", pid);
		goto err_cure;
	}

	ret = dump_task_threads(parasite_ctl, item);
	if (ret) {
		pr_err("Can't dump threads\n");
		goto err_cure;
	}

	/*
	 * On failure local map will be cured in cr_dump_finish()
	 * for lazy pages.
	 */
	if (opts.lazy_pages)
		ret = compel_cure_remote(parasite_ctl);
	else
		ret = compel_cure(parasite_ctl);
	if (ret) {
		pr_err("Can't cure (pid: %d) from parasite\n", pid);
		goto err;
	}

	ret = dump_task_mm(pid, &pps_buf, &misc, &vmas, cr_imgset);
	if (ret) {
		pr_err("Dump mappings (pid: %d) failed with %d\n", pid, ret);
		goto err;
	}

	ret = dump_task_fs(pid, &misc, cr_imgset);
	if (ret) {
		pr_err("Dump fs (pid: %d) failed with %d\n", pid, ret);
		goto err;
	}

	close_cr_imgset(&cr_imgset);
	if (opts.compress) {
		compress_images();
	}
	exit_code = 0;
err:
	if (exit_code) {
		close_cr_imgset(&cr_imgset);
	}
	close_pid_proc();
	free_mappings(&vmas);
	xfree(dfds);
	return exit_code;

err_cure:
	ret = compel_cure(parasite_ctl);
	if (ret)
		pr_err("Can't cure (pid: %d) from parasite\n", pid);
	goto err;
}

/* Incremented by the SIGALRM handler; > 0 means the dump timed out. */
static int alarm_attempts = 0;

bool alarm_timeouted(void)
{
	return alarm_attempts > 0;
}

/*
 * SIGALRM handler: retry interrupting the current syscall (EINTR) a few
 * times, then give up hard via BUG().
 */
static void alarm_handler(int signo)
{
	pr_err("Timeout reached. Try to interrupt: %d\n", alarm_attempts);
	if (alarm_attempts++ < 5) {
		alarm(1);
		/* A current syscall will be exited with EINTR */
		return;
	}
	pr_err("FATAL: Unable to interrupt the current operation\n");
	BUG();
}

/* Install the SIGALRM handler without SA_RESTART so syscalls get EINTR. */
static int setup_alarm_handler(void)
{
	struct sigaction sa = {
		.sa_handler = alarm_handler,
		.sa_flags = 0, /* Don't restart syscalls */
	};

	sigemptyset(&sa.sa_mask);
	sigaddset(&sa.sa_mask, SIGALRM);
	if (sigaction(SIGALRM, &sa, NULL)) {
		pr_perror("Unable to setup SIGALRM handler");
		return -1;
	}

	return 0;
}

/*
 * Finish a pre-dump: restore leader registers, write the inventory,
 * resume tasks and push the collected memory pages to the images (or
 * the page server), then cure the local parasite mappings.
 */
static int cr_pre_dump_finish(int status)
{
	InventoryEntry he = INVENTORY_ENTRY__INIT;
	struct pstree_item *item;
	int ret;

	/*
	 * Restore registers for tasks only. The threads have not been
	 * infected. Therefore, the thread register sets have not been changed.
	 */
	ret = arch_set_thread_regs(root_item, false);
	if (ret)
		goto err;

	ret = inventory_save_uptime(&he);
	if (ret)
		goto err;

	he.has_pre_dump_mode = true;
	he.pre_dump_mode = opts.pre_dump_mode;

	pstree_switch_state(root_item, TASK_ALIVE);

	timing_stop(TIME_FROZEN);

	if (status < 0) {
		ret = status;
		goto err;
	}

	pr_info("Pre-dumping tasks' memory\n");
	for_each_pstree_item(item) {
		struct parasite_ctl *ctl = dmpi(item)->parasite_ctl;
		struct page_pipe *mem_pp;
		struct page_xfer xfer;

		if (!ctl)
			continue;

		pr_info("\tPre-dumping %d\n", vpid(item));
		timing_start(TIME_MEMWRITE);
		ret = open_page_xfer(&xfer, CR_FD_PAGEMAP, vpid(item));
		if (ret < 0)
			goto err;

		mem_pp = dmpi(item)->mem_pp;

		if (opts.pre_dump_mode == PRE_DUMP_READ) {
			timing_stop(TIME_MEMWRITE);
			ret = page_xfer_predump_pages(item->pid->real, &xfer, mem_pp);
		} else {
			ret = page_xfer_dump_pages(&xfer, mem_pp);
		}

		xfer.close(&xfer);

		if (ret)
			goto err;

		timing_stop(TIME_MEMWRITE);

		destroy_page_pipe(mem_pp);
		if (compel_cure_local(ctl))
			pr_err("Can't cure local: something happened with mapping?\n");
	}

	free_pstree(root_item);
	seccomp_free_entries();

	if (irmap_predump_run()) {
		ret = -1;
		goto err;
	}

err:
	if (unsuspend_lsm())
		ret = -1;

	if (disconnect_from_page_server())
		ret = -1;

	if (bfd_flush_images())
		ret = -1;

	if (write_img_inventory(&he))
		ret = -1;

	if (ret)
		pr_err("Pre-dumping FAILED.\n");
	else {
		write_stats(DUMP_STATS);
		pr_info("Pre-dumping finished successfully\n");
	}
	return ret;
}
*/ ret = arch_set_thread_regs(root_item, false); if (ret) goto err; ret = inventory_save_uptime(&he); if (ret) goto err; he.has_pre_dump_mode = true; he.pre_dump_mode = opts.pre_dump_mode; pstree_switch_state(root_item, TASK_ALIVE); timing_stop(TIME_FROZEN); if (status < 0) { ret = status; goto err; } pr_info("Pre-dumping tasks' memory\n"); for_each_pstree_item(item) { struct parasite_ctl *ctl = dmpi(item)->parasite_ctl; struct page_pipe *mem_pp; struct page_xfer xfer; if (!ctl) continue; pr_info("\tPre-dumping %d\n", vpid(item)); timing_start(TIME_MEMWRITE); ret = open_page_xfer(&xfer, CR_FD_PAGEMAP, vpid(item)); if (ret < 0) goto err; mem_pp = dmpi(item)->mem_pp; if (opts.pre_dump_mode == PRE_DUMP_READ) { timing_stop(TIME_MEMWRITE); ret = page_xfer_predump_pages(item->pid->real, &xfer, mem_pp); } else { ret = page_xfer_dump_pages(&xfer, mem_pp); } xfer.close(&xfer); if (ret) goto err; timing_stop(TIME_MEMWRITE); destroy_page_pipe(mem_pp); if (compel_cure_local(ctl)) pr_err("Can't cure local: something happened with mapping?\n"); } free_pstree(root_item); seccomp_free_entries(); if (irmap_predump_run()) { ret = -1; goto err; } err: if (unsuspend_lsm()) ret = -1; if (disconnect_from_page_server()) ret = -1; if (bfd_flush_images()) ret = -1; if (write_img_inventory(&he)) ret = -1; if (ret) pr_err("Pre-dumping FAILED.\n"); else { write_stats(DUMP_STATS); pr_info("Pre-dumping finished successfully\n"); } return ret; } int cr_pre_dump_tasks(pid_t pid) { InventoryEntry *parent_ie = NULL; struct pstree_item *item; int ret = -1; /* * We might need a lot of pipes to fetch huge number of pages to dump. 
*/ rlimit_unlimit_nofile(); root_item = alloc_pstree_item(); if (!root_item) goto err; root_item->pid->real = pid; if (!opts.track_mem) { pr_info("Enforcing memory tracking for pre-dump.\n"); opts.track_mem = true; } if (opts.final_state == TASK_DEAD) { pr_info("Enforcing tasks run after pre-dump.\n"); opts.final_state = TASK_ALIVE; } if (init_stats(DUMP_STATS)) goto err; if (cr_plugin_init(CR_PLUGIN_STAGE__PRE_DUMP)) goto err; if (lsm_check_opts()) goto err; if (irmap_load_cache()) goto err; if (cpu_init()) goto err; if (vdso_init_dump()) goto err; if (connect_to_page_server_to_send() < 0) goto err; if (setup_alarm_handler()) goto err; if (collect_pstree()) goto err; if (collect_pstree_ids_predump()) goto err; if (collect_namespaces(false) < 0) goto err; if (collect_and_suspend_lsm() < 0) goto err; /* Errors handled later in detect_pid_reuse */ parent_ie = get_parent_inventory(); for_each_pstree_item(item) if (pre_dump_one_task(item, parent_ie)) goto err; if (parent_ie) { inventory_entry__free_unpacked(parent_ie, NULL); parent_ie = NULL; } ret = cr_dump_shmem(); if (ret) goto err; if (irmap_predump_prep()) goto err; ret = 0; err: if (parent_ie) inventory_entry__free_unpacked(parent_ie, NULL); return cr_pre_dump_finish(ret); } static int cr_lazy_mem_dump(void) { struct pstree_item *item; int ret = 0; pr_info("Starting lazy pages server\n"); ret = cr_page_server(false, true, -1); for_each_pstree_item(item) { if (item->pid->state != TASK_DEAD) { destroy_page_pipe(dmpi(item)->mem_pp); if (compel_cure_local(dmpi(item)->parasite_ctl)) pr_err("Can't cure local: something happened with mapping?\n"); } } if (ret) pr_err("Lazy pages transfer FAILED.\n"); else pr_info("Lazy pages transfer finished successfully\n"); return ret; } static int cr_dump_finish(int ret) { int post_dump_ret = 0; if (disconnect_from_page_server()) ret = -1; close_cr_imgset(&glob_imgset); if (bfd_flush_images()) ret = -1; cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); cgp_fini(); if (!ret) { /* * It 
might be a migration case, where we're asked * to dump everything, then some script transfer * image on a new node and we're supposed to kill * dumpee because it continue running somewhere * else. * * Thus ask user via script if we're to break * checkpoint. */ post_dump_ret = run_scripts(ACT_POST_DUMP); if (post_dump_ret) { post_dump_ret = WEXITSTATUS(post_dump_ret); pr_info("Post dump script passed with %d\n", post_dump_ret); } } /* * Dump is complete at this stage. To choose what * to do next we need to consider the following * scenarios * * - error happened during checkpoint: just clean up * everything and continue execution of the dumpee; * * - dump succeeded but post-dump script returned * some ret code: same as in previous scenario -- * just clean up everything and continue execution, * we will return script ret code back to criu caller * and it's up to a caller what to do with running instance * of the dumpee -- either kill it, or continue running; * * - dump succeeded but -R option passed, pointing that * we're asked to continue execution of the dumpee. It's * assumed that a user will use post-dump script to keep * consistency of the FS and other resources, we simply * start rollback procedure and cleanup everything. */ if (ret || post_dump_ret || opts.final_state == TASK_ALIVE) { unsuspend_lsm(); network_unlock(); delete_link_remaps(); clean_cr_time_mounts(); } if (!ret && opts.lazy_pages) ret = cr_lazy_mem_dump(); if (arch_set_thread_regs(root_item, true) < 0) return -1; pstree_switch_state(root_item, (ret || post_dump_ret) ? 
TASK_ALIVE : opts.final_state); timing_stop(TIME_FROZEN); free_pstree(root_item); seccomp_free_entries(); free_file_locks(); free_link_remaps(); free_aufs_branches(); free_userns_maps(); close_service_fd(CR_PROC_FD_OFF); close_image_dir(); if (ret || post_dump_ret) { pr_err("Dumping FAILED.\n"); } else { write_stats(DUMP_STATS); pr_info("Dumping finished successfully\n"); } return post_dump_ret ?: (ret != 0); } int cr_dump_tasks(pid_t pid) { InventoryEntry he = INVENTORY_ENTRY__INIT; InventoryEntry *parent_ie = NULL; struct pstree_item *item; int pre_dump_ret = 0; int ret = -1; pr_info("========================================\n"); pr_info("Dumping processes (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); pr_info("========================================\n"); /* * We will fetch all file descriptors for each task, their number can * be bigger than a default file limit, so we need to raise it to the * maximum. */ rlimit_unlimit_nofile(); root_item = alloc_pstree_item(); if (!root_item) goto err; root_item->pid->real = pid; pre_dump_ret = run_scripts(ACT_PRE_DUMP); if (pre_dump_ret != 0) { pr_err("Pre dump script failed with %d!\n", pre_dump_ret); goto err; } if (init_stats(DUMP_STATS)) goto err; if (cr_plugin_init(CR_PLUGIN_STAGE__DUMP)) goto err; if (lsm_check_opts()) goto err; if (irmap_load_cache()) goto err; if (cpu_init()) goto err; if (vdso_init_dump()) goto err; if (cgp_init(opts.cgroup_props, opts.cgroup_props ? strlen(opts.cgroup_props) : 0, opts.cgroup_props_file)) goto err; if (parse_cg_info()) goto err; if (prepare_inventory(&he)) goto err; if (opts.cpu_cap & CPU_CAP_IMAGE) { if (cpu_dump_cpuinfo()) goto err; } if (connect_to_page_server_to_send() < 0) goto err; if (setup_alarm_handler()) goto err; /* * The collect_pstree will also stop (PTRACE_SEIZE) the tasks * thus ensuring that they don't modify anything we collect * afterwards. 
*/ if (collect_pstree()) goto err; if (collect_pstree_ids()) goto err; if (network_lock()) goto err; if (rpc_query_external_files()) goto err; if (collect_file_locks()) goto err; if (collect_namespaces(true) < 0) goto err; glob_imgset = cr_glob_imgset_open(O_DUMP); if (!glob_imgset) goto err; if (seccomp_collect_dump_filters() < 0) goto err; /* Errors handled later in detect_pid_reuse */ parent_ie = get_parent_inventory(); if (collect_and_suspend_lsm() < 0) goto err; for_each_pstree_item(item) { if (dump_one_task(item, parent_ie)) goto err; } if (parent_ie) { inventory_entry__free_unpacked(parent_ie, NULL); parent_ie = NULL; } /* * It may happen that a process has completed but its files in * /proc/PID/ are still open by another process. If the PID has been * given to some newer thread since then, we may be unable to dump * all this. */ if (dead_pid_conflict()) goto err; /* MNT namespaces are dumped after files to save remapped links */ if (dump_mnt_namespaces() < 0) goto err; if (dump_file_locks()) goto err; if (dump_verify_tty_sids()) goto err; if (dump_zombies()) goto err; if (dump_pstree(root_item)) goto err; /* * TODO: cr_dump_shmem has to be called before dump_namespaces(), * because page_ids is a global variable and it is used to dump * ipc shared memory, but an ipc namespace is dumped in a child * process. 
*/ ret = cr_dump_shmem(); if (ret) goto err; if (root_ns_mask) { ret = dump_namespaces(root_item, root_ns_mask); if (ret) goto err; } if ((root_ns_mask & CLONE_NEWTIME) == 0) { ret = dump_time_ns(0); if (ret) goto err; } if (dump_aa_namespaces() < 0) goto err; ret = dump_cgroups(); if (ret) goto err; ret = fix_external_unix_sockets(); if (ret) goto err; ret = tty_post_actions(); if (ret) goto err; ret = inventory_save_uptime(&he); if (ret) goto err; he.has_pre_dump_mode = false; ret = write_img_inventory(&he); if (ret) goto err; err: if (parent_ie) inventory_entry__free_unpacked(parent_ie, NULL); return cr_dump_finish(ret); } crac-criu-1.5.0/criu/cr-errno.c000066400000000000000000000002151471504326700162310ustar00rootroot00000000000000static int cr_errno; int get_cr_errno(void) { return cr_errno; } void set_cr_errno(int new_err) { if (!cr_errno) cr_errno = new_err; } crac-criu-1.5.0/criu/cr-restore.c000066400000000000000000002777531471504326700166160ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "types.h" #include #include "common/compiler.h" #include "linux/rseq.h" #include "clone-noasan.h" #include "cr_options.h" #include "servicefd.h" #include "image.h" #include "img-streamer.h" #include "util.h" #include "util-pie.h" #include "criu-log.h" #include "restorer.h" #include "sockets.h" #include "sk-packet.h" #include "common/lock.h" #include "files.h" #include "pipes.h" #include "fifo.h" #include "sk-inet.h" #include "eventfd.h" #include "eventpoll.h" #include "signalfd.h" #include "proc_parse.h" #include "pie/restorer-blob.h" #include "crtools.h" #include "uffd.h" #include "namespaces.h" #include "mem.h" #include "mount.h" #include "fsnotify.h" #include "pstree.h" #include "net.h" #include "tty.h" #include "cpu.h" #include "file-lock.h" #include "vdso.h" #include "stats.h" #include "tun.h" #include "vma.h" #include 
"kerndat.h" #include "rst-malloc.h" #include "plugin.h" #include "cgroup.h" #include "timerfd.h" #include "action-scripts.h" #include "shmem.h" #include "aio.h" #include "lsm.h" #include "seccomp.h" #include "fault-injection.h" #include "sk-queue.h" #include "sigframe.h" #include "fdstore.h" #include "string.h" #include "memfd.h" #include "timens.h" #include "bpfmap.h" #include "apparmor.h" #include "parasite-syscall.h" #include "files-reg.h" #include #include "compel/include/asm/syscall.h" #include "linux/mount.h" #include "protobuf.h" #include "images/sa.pb-c.h" #include "images/timer.pb-c.h" #include "images/vma.pb-c.h" #include "images/rlimit.pb-c.h" #include "images/pagemap.pb-c.h" #include "images/siginfo.pb-c.h" #include "restore.h" #include "cr-errno.h" #include "pages-compress.h" #ifndef arch_export_restore_thread #define arch_export_restore_thread __export_restore_thread #endif #ifndef arch_export_restore_task #define arch_export_restore_task __export_restore_task #endif #ifndef arch_export_unmap #define arch_export_unmap __export_unmap #define arch_export_unmap_compat __export_unmap_compat #endif #define NOT_THAT_PID_ECODE 2 struct pstree_item *current; static int restore_task_with_children(void *); static int sigreturn_restore(pid_t pid, struct task_restore_args *ta, unsigned long alen, CoreEntry *core); static int prepare_restorer_blob(void); static int prepare_rlimits(int pid, struct task_restore_args *, CoreEntry *core); static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); static int prepare_signals(int pid, struct task_restore_args *, CoreEntry *core); /* * Architectures can overwrite this function to restore registers that are not * present in the sigreturn signal frame. 
*/ int __attribute__((weak)) arch_set_thread_regs_nosigrt(struct pid *pid) { return 0; } static inline int stage_participants(int next_stage) { switch (next_stage) { case CR_STATE_FAIL: return 0; case CR_STATE_ROOT_TASK: case CR_STATE_PREPARE_NAMESPACES: return 1; case CR_STATE_FORKING: return task_entries->nr_tasks + task_entries->nr_helpers; case CR_STATE_RESTORE: return task_entries->nr_threads + task_entries->nr_helpers; case CR_STATE_RESTORE_SIGCHLD: case CR_STATE_RESTORE_CREDS: return task_entries->nr_threads; } BUG(); return -1; } static inline int stage_current_participants(int next_stage) { switch (next_stage) { case CR_STATE_FORKING: return 1; case CR_STATE_RESTORE: /* * Each thread has to be reported about this stage, * so if we want to wait all other tasks, we have to * exclude all threads of the current process. * It is supposed that we will wait other tasks, * before creating threads of the current task. */ return current->nr_threads; } BUG(); return -1; } static int __restore_wait_inprogress_tasks(int participants) { int ret; futex_t *np = &task_entries->nr_in_progress; futex_wait_while_gt(np, participants); ret = (int)futex_get(np); if (ret < 0) { set_cr_errno(get_task_cr_err()); return ret; } return 0; } static int restore_wait_inprogress_tasks(void) { return __restore_wait_inprogress_tasks(0); } /* Wait all tasks except the current one */ static int restore_wait_other_tasks(void) { int participants, stage; stage = futex_get(&task_entries->start); participants = stage_current_participants(stage); return __restore_wait_inprogress_tasks(participants); } static inline void __restore_switch_stage_nw(int next_stage) { futex_set(&task_entries->nr_in_progress, stage_participants(next_stage)); futex_set(&task_entries->start, next_stage); } static inline void __restore_switch_stage(int next_stage) { if (next_stage != CR_STATE_COMPLETE) futex_set(&task_entries->nr_in_progress, stage_participants(next_stage)); futex_set_and_wake(&task_entries->start, 
next_stage); } static int restore_switch_stage(int next_stage) { __restore_switch_stage(next_stage); return restore_wait_inprogress_tasks(); } static int restore_finish_ns_stage(int from, int to) { if (root_ns_mask) return restore_finish_stage(task_entries, from); /* Nobody waits for this stage change, just go ahead */ __restore_switch_stage_nw(to); return 0; } static int crtools_prepare_shared(void) { if (prepare_memfd_inodes()) return -1; if (prepare_files()) return -1; /* We might want to remove ghost files on failed restore */ if (collect_remaps_and_regfiles()) return -1; /* Connections are unlocked from criu */ if (!files_collected() && collect_image(&inet_sk_cinfo)) return -1; if (collect_binfmt_misc()) return -1; if (tty_prep_fds()) return -1; if (prepare_apparmor_namespaces()) return -1; return 0; } /* * Collect order information: * - reg_file should be before remap, as the latter needs * to find file_desc objects * - per-pid collects (mm and fd) should be after remap and * reg_file since both per-pid ones need to get fdesc-s * and bump counters on remaps if they exist */ static struct collect_image_info *cinfos[] = { &file_locks_cinfo, &pipe_data_cinfo, &fifo_data_cinfo, &sk_queues_cinfo, #ifdef CONFIG_HAS_LIBBPF &bpfmap_data_cinfo, #endif }; static struct collect_image_info *cinfos_files[] = { &unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo, &netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo, &tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo, &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, }; /* These images are required to restore namespaces */ static struct collect_image_info *before_ns_cinfos[] = { &tty_info_cinfo, /* Restore devpts content */ &tty_cdata, }; static struct pprep_head *post_prepare_heads = NULL; void add_post_prepare_cb(struct pprep_head *ph) { ph->next = post_prepare_heads; post_prepare_heads = ph; } static int run_post_prepare(void) { 
struct pprep_head *ph; for (ph = post_prepare_heads; ph != NULL; ph = ph->next) if (ph->actor(ph)) return -1; return 0; } static int root_prepare_shared(void) { int ret = 0; struct pstree_item *pi; pr_info("Preparing info about shared resources\n"); if (prepare_remaps()) return -1; if (seccomp_read_image()) return -1; if (collect_images(cinfos, ARRAY_SIZE(cinfos))) return -1; if (!files_collected() && collect_images(cinfos_files, ARRAY_SIZE(cinfos_files))) return -1; for_each_pstree_item(pi) { if (pi->pid->state == TASK_HELPER) continue; ret = prepare_mm_pid(pi); if (ret < 0) break; ret = prepare_fd_pid(pi); if (ret < 0) break; ret = prepare_fs_pid(pi); if (ret < 0) break; } if (ret < 0) goto err; prepare_cow_vmas(); ret = prepare_restorer_blob(); if (ret) goto err; ret = add_fake_unix_queuers(); if (ret) goto err; /* * This should be called with all packets collected AND all * fdescs and fles prepared BUT post-prep-s not run. */ ret = prepare_scms(); if (ret) goto err; ret = run_post_prepare(); if (ret) goto err; ret = unix_prepare_root_shared(); if (ret) goto err; show_saved_files(); err: return ret; } /* This actually populates and occupies ROOT_FD_OFF sfd */ static int populate_root_fd_off(void) { struct ns_id *mntns = NULL; int ret; if (root_ns_mask & CLONE_NEWNS) { mntns = lookup_ns_by_id(root_item->ids->mnt_ns_id, &mnt_ns_desc); BUG_ON(!mntns); } ret = mntns_get_root_fd(mntns); if (ret < 0) pr_err("Can't get root fd\n"); return ret >= 0 ? 0 : -1; } static int populate_pid_proc(void) { if (open_pid_proc(vpid(current)) < 0) { pr_err("Can't open PROC_SELF\n"); return -1; } if (open_pid_proc(PROC_SELF) < 0) { pr_err("Can't open PROC_SELF\n"); return -1; } return 0; } static rt_sigaction_t sigchld_act; /* * If parent's sigaction has blocked SIGKILL (which is non-sense), * this parent action is non-valid and shouldn't be inherited. * Used to mark parent_act* no more valid. 
*/ static rt_sigaction_t parent_act[SIGMAX]; #ifdef CONFIG_COMPAT static rt_sigaction_t_compat parent_act_compat[SIGMAX]; #endif static bool sa_inherited(int sig, rt_sigaction_t *sa) { rt_sigaction_t *pa; int i; if (current == root_item) return false; /* XXX -- inherit from CRIU? */ pa = &parent_act[sig]; /* Omitting non-valid sigaction */ if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) return false; for (i = 0; i < _KNSIG_WORDS; i++) if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) return false; return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && pa->rt_sa_restorer == sa->rt_sa_restorer; } static int restore_native_sigaction(int sig, SaEntry *e) { rt_sigaction_t act; int ret; ASSIGN_TYPED(act.rt_sa_handler, decode_pointer(e->sigaction)); ASSIGN_TYPED(act.rt_sa_flags, e->flags); ASSIGN_TYPED(act.rt_sa_restorer, decode_pointer(e->restorer)); #ifdef CONFIG_MIPS e->has_mask_extended = 1; BUILD_BUG_ON(sizeof(e->mask) * 2 != sizeof(act.rt_sa_mask.sig)); memcpy(&(act.rt_sa_mask.sig[0]), &e->mask, sizeof(act.rt_sa_mask.sig[0])); memcpy(&(act.rt_sa_mask.sig[1]), &e->mask_extended, sizeof(act.rt_sa_mask.sig[1])); #else BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); #endif if (sig == SIGCHLD) { sigchld_act = act; return 0; } if (sa_inherited(sig - 1, &act)) return 1; /* * A pure syscall is used, because glibc * sigaction overwrites se_restorer. 
*/ ret = syscall(SYS_rt_sigaction, sig, &act, NULL, sizeof(k_rtsigset_t)); if (ret < 0) { pr_perror("Can't restore sigaction"); return ret; } parent_act[sig - 1] = act; /* Mark SIGKILL blocked which makes compat sigaction non-valid */ #ifdef CONFIG_COMPAT parent_act_compat[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; #endif return 1; } static void *stack32; #ifdef CONFIG_COMPAT static bool sa_compat_inherited(int sig, rt_sigaction_t_compat *sa) { rt_sigaction_t_compat *pa; int i; if (current == root_item) return false; pa = &parent_act_compat[sig]; /* Omitting non-valid sigaction */ if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) return false; for (i = 0; i < _KNSIG_WORDS; i++) if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) return false; return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && pa->rt_sa_restorer == sa->rt_sa_restorer; } static int restore_compat_sigaction(int sig, SaEntry *e) { rt_sigaction_t_compat act; int ret; ASSIGN_TYPED(act.rt_sa_handler, (u32)e->sigaction); ASSIGN_TYPED(act.rt_sa_flags, e->flags); ASSIGN_TYPED(act.rt_sa_restorer, (u32)e->restorer); BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); if (sig == SIGCHLD) { memcpy(&sigchld_act, &act, sizeof(rt_sigaction_t_compat)); return 0; } if (sa_compat_inherited(sig - 1, &act)) return 1; if (!stack32) { stack32 = alloc_compat_syscall_stack(); if (!stack32) return -1; } ret = arch_compat_rt_sigaction(stack32, sig, &act); if (ret < 0) { pr_err("Can't restore compat sigaction: %d\n", ret); return ret; } parent_act_compat[sig - 1] = act; /* Mark SIGKILL blocked which makes native sigaction non-valid */ parent_act[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; return 1; } #else static int restore_compat_sigaction(int sig, SaEntry *e) { return -1; } #endif static int prepare_sigactions_from_core(TaskCoreEntry *tc) { int sig, i; if (tc->n_sigactions != SIGMAX - 2) { pr_err("Bad number of 
sigactions in the image (%d, want %d)\n", (int)tc->n_sigactions, SIGMAX - 2); return -1; } pr_info("Restore on-core sigactions for %d\n", vpid(current)); for (sig = 1, i = 0; sig <= SIGMAX; sig++) { int ret; SaEntry *e; bool sigaction_is_compat; if (sig == SIGKILL || sig == SIGSTOP) continue; e = tc->sigactions[i++]; sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; if (sigaction_is_compat) ret = restore_compat_sigaction(sig, e); else ret = restore_native_sigaction(sig, e); if (ret < 0) return ret; } return 0; } /* Returns number of restored signals, -1 or negative errno on fail */ static int restore_one_sigaction(int sig, struct cr_img *img, int pid) { bool sigaction_is_compat; SaEntry *e; int ret = 0; BUG_ON(sig == SIGKILL || sig == SIGSTOP); ret = pb_read_one_eof(img, &e, PB_SIGACT); if (ret == 0) { if (sig != SIGMAX_OLD + 1) { /* backward compatibility */ pr_err("Unexpected EOF %d\n", sig); return -1; } pr_warn("This format of sigacts-%d.img is deprecated\n", pid); return -1; } if (ret < 0) return ret; sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; if (sigaction_is_compat) ret = restore_compat_sigaction(sig, e); else ret = restore_native_sigaction(sig, e); sa_entry__free_unpacked(e, NULL); return ret; } static int prepare_sigactions_from_image(void) { int pid = vpid(current); struct cr_img *img; int sig, rst = 0; int ret = 0; pr_info("Restore sigacts for %d\n", pid); img = open_image(CR_FD_SIGACT, O_RSTR, pid); if (!img) return -1; for (sig = 1; sig <= SIGMAX; sig++) { if (sig == SIGKILL || sig == SIGSTOP) continue; ret = restore_one_sigaction(sig, img, pid); if (ret < 0) break; if (ret) rst++; } pr_info("Restored %d/%d sigacts\n", rst, SIGMAX - 3 /* KILL, STOP and CHLD */); close_image(img); return ret; } static int prepare_sigactions(CoreEntry *core) { int ret; if (!task_alive(current)) return 0; if (core->tc->n_sigactions != 0) ret = prepare_sigactions_from_core(core->tc); else ret = prepare_sigactions_from_image(); 
if (stack32) { free_compat_syscall_stack(stack32); stack32 = NULL; } return ret; } static int __collect_child_pids(struct pstree_item *p, int state, unsigned int *n) { struct pstree_item *pi; list_for_each_entry(pi, &p->children, sibling) { pid_t *child; if (pi->pid->state != state) continue; child = rst_mem_alloc(sizeof(*child), RM_PRIVATE); if (!child) return -1; (*n)++; *child = vpid(pi); } return 0; } static int collect_child_pids(int state, unsigned int *n) { struct pstree_item *pi; *n = 0; /* * All children of helpers and zombies will be reparented to the init * process and they have to be collected too. */ if (current == root_item) { for_each_pstree_item(pi) { if (pi->pid->state != TASK_HELPER && pi->pid->state != TASK_DEAD) continue; if (__collect_child_pids(pi, state, n)) return -1; } } return __collect_child_pids(current, state, n); } static int collect_helper_pids(struct task_restore_args *ta) { ta->helpers = (pid_t *)rst_mem_align_cpos(RM_PRIVATE); return collect_child_pids(TASK_HELPER, &ta->helpers_n); } static int collect_zombie_pids(struct task_restore_args *ta) { ta->zombies = (pid_t *)rst_mem_align_cpos(RM_PRIVATE); return collect_child_pids(TASK_DEAD, &ta->zombies_n); } static int collect_inotify_fds(struct task_restore_args *ta) { struct list_head *list = &rsti(current)->fds; struct fdt *fdt = rsti(current)->fdt; struct fdinfo_list_entry *fle; /* Check we are an fdt-restorer */ if (fdt && fdt->pid != vpid(current)) return 0; ta->inotify_fds = (int *)rst_mem_align_cpos(RM_PRIVATE); list_for_each_entry(fle, list, ps_list) { struct file_desc *d = fle->desc; int *inotify_fd; if (d->ops->type != FD_TYPES__INOTIFY) continue; if (fle != file_master(d)) continue; inotify_fd = rst_mem_alloc(sizeof(*inotify_fd), RM_PRIVATE); if (!inotify_fd) return -1; ta->inotify_fds_n++; *inotify_fd = fle->fe->fd; pr_debug("Collect inotify fd %d to cleanup later\n", *inotify_fd); } return 0; } static int open_core(int pid, CoreEntry **pcore) { int ret; struct cr_img 
*img; img = open_image(CR_FD_CORE, O_RSTR, pid); if (!img) { pr_err("Can't open core data for %d\n", pid); return -1; } ret = pb_read_one(img, pcore, PB_CORE); close_image(img); return ret <= 0 ? -1 : 0; } static int open_cores(int pid, CoreEntry *leader_core) { int i, tpid; CoreEntry **cores = NULL; cores = xmalloc(sizeof(*cores) * current->nr_threads); if (!cores) goto err; for (i = 0; i < current->nr_threads; i++) { tpid = current->threads[i].ns[0].virt; if (tpid == pid) cores[i] = leader_core; else if (open_core(tpid, &cores[i])) goto err; } current->core = cores; /* * Walk over all threads and if one them is having * active seccomp mode we will suspend filtering * on the whole group until restore complete. * * Otherwise any criu code which might use same syscall * if present inside a filter chain would take filter * action and might break restore procedure. */ for (i = 0; i < current->nr_threads; i++) { ThreadCoreEntry *thread_core = cores[i]->thread_core; if (thread_core->seccomp_mode != SECCOMP_MODE_DISABLED) { rsti(current)->has_seccomp = true; break; } } for (i = 0; i < current->nr_threads; i++) { ThreadCoreEntry *tc = cores[i]->thread_core; struct rst_rseq *rseqs = rsti(current)->rseqe; RseqEntry *rseqe = tc->rseq_entry; /* compatibility with older CRIU versions */ if (!rseqe) continue; /* rseq cs had no RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL */ if (!rseqe->has_rseq_cs_pointer) continue; rseqs[i].rseq_abi_pointer = rseqe->rseq_abi_pointer; rseqs[i].rseq_cs_pointer = rseqe->rseq_cs_pointer; } return 0; err: xfree(cores); return -1; } static int prepare_oom_score_adj(int value) { int fd, ret = 0; char buf[11]; fd = open_proc_rw(PROC_SELF, "oom_score_adj"); if (fd < 0) return -1; snprintf(buf, 11, "%d", value); if (write(fd, buf, 11) < 0) { pr_perror("Write %s to /proc/self/oom_score_adj failed", buf); ret = -1; } close(fd); return ret; } static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc, struct task_restore_args *args) { int ret; if 
(tc->has_child_subreaper) args->child_subreaper = tc->child_subreaper; if (tc->has_membarrier_registration_mask) args->membarrier_registration_mask = tc->membarrier_registration_mask; /* loginuid value is critical to restore */ if (kdat.luid == LUID_FULL && tc->has_loginuid && tc->loginuid != INVALID_UID) { ret = prepare_loginuid(tc->loginuid); if (ret < 0) { pr_err("Setting loginuid for %d task failed\n", pid); return ret; } } /* oom_score_adj is not critical: only log errors */ if (tc->has_oom_score_adj && tc->oom_score_adj != 0) prepare_oom_score_adj(tc->oom_score_adj); return 0; } static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core); static int prepare_mm(pid_t pid, struct task_restore_args *args); static int restore_one_alive_task(int pid, CoreEntry *core) { unsigned args_len; struct task_restore_args *ta; pr_info("Restoring resources\n"); rst_mem_switch_to_private(); args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) * current->nr_threads, page_size()); ta = mmap(NULL, args_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); if (!ta) return -1; memzero(ta, args_len); if (prepare_fds(current)) return -1; if (prepare_file_locks(pid)) return -1; if (open_vmas(current)) return -1; if (prepare_aios(current, ta)) return -1; if (fixup_sysv_shmems()) return -1; if (open_cores(pid, core)) return -1; if (prepare_signals(pid, ta, core)) return -1; if (prepare_posix_timers(pid, ta, core)) return -1; if (prepare_rlimits(pid, ta, core) < 0) return -1; if (collect_helper_pids(ta) < 0) return -1; if (collect_zombie_pids(ta) < 0) return -1; if (inherit_fd_fini() < 0) return -1; if (collect_inotify_fds(ta) < 0) return -1; if (prepare_proc_misc(pid, core->tc, ta)) return -1; /* * Get all the tcp sockets fds into rst memory -- restorer * will turn repair off before going sigreturn */ if (prepare_tcp_socks(ta)) return -1; /* * Copy timerfd params for restorer args, we need to proceed * timer setting at the very late. 
*/ if (prepare_timerfds(ta)) return -1; if (seccomp_prepare_threads(current, ta) < 0) return -1; if (prepare_itimers(pid, ta, core) < 0) return -1; if (prepare_mm(pid, ta)) return -1; if (prepare_vmas(current, ta)) return -1; /* * Sockets have to be restored in their network namespaces, * so a task namespace has to be restored after sockets. */ if (restore_task_net_ns(current)) return -1; if (setup_uffd(pid, ta)) return -1; return sigreturn_restore(pid, ta, args_len, core); } static void zombie_prepare_signals(void) { sigset_t blockmask; int sig; struct sigaction act; sigfillset(&blockmask); sigprocmask(SIG_UNBLOCK, &blockmask, NULL); memset(&act, 0, sizeof(act)); act.sa_handler = SIG_DFL; for (sig = 1; sig <= SIGMAX; sig++) sigaction(sig, &act, NULL); } #define SIG_FATAL_MASK \ ((1 << SIGHUP) | (1 << SIGINT) | (1 << SIGQUIT) | (1 << SIGILL) | (1 << SIGTRAP) | (1 << SIGABRT) | \ (1 << SIGIOT) | (1 << SIGBUS) | (1 << SIGFPE) | (1 << SIGKILL) | (1 << SIGUSR1) | (1 << SIGSEGV) | \ (1 << SIGUSR2) | (1 << SIGPIPE) | (1 << SIGALRM) | (1 << SIGTERM) | (1 << SIGXCPU) | (1 << SIGXFSZ) | \ (1 << SIGVTALRM) | (1 << SIGPROF) | (1 << SIGPOLL) | (1 << SIGIO) | (1 << SIGSYS) | (1 << SIGSTKFLT) | \ (1 << SIGPWR)) static inline int sig_fatal(int sig) { return (sig > 0) && (sig < SIGMAX) && (SIG_FATAL_MASK & (1UL << sig)); } struct task_entries *task_entries; static unsigned long task_entries_pos; static int wait_on_helpers_zombies(void) { struct pstree_item *pi; list_for_each_entry(pi, ¤t->children, sibling) { pid_t pid = vpid(pi); int status; switch (pi->pid->state) { case TASK_DEAD: if (waitid(P_PID, pid, NULL, WNOWAIT | WEXITED) < 0) { pr_perror("Wait on %d zombie failed", pid); return -1; } break; case TASK_HELPER: if (waitpid(pid, &status, 0) != pid) { pr_perror("waitpid for helper %d failed", pid); return -1; } break; } } return 0; } static int wait_exiting_children(void); static int restore_one_zombie(CoreEntry *core) { int exit_code = core->tc->exit_code; pr_info("Restoring 
zombie with %d code\n", exit_code); if (prepare_fds(current)) return -1; if (lazy_pages_setup_zombie(vpid(current))) return -1; prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0); if (task_entries != NULL) { wait_exiting_children(); zombie_prepare_signals(); } if (exit_code & 0x7f) { int signr; /* prevent generating core files */ if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0)) pr_perror("Can't drop the dumpable flag"); signr = exit_code & 0x7F; if (!sig_fatal(signr)) { pr_warn("Exit with non fatal signal ignored\n"); signr = SIGABRT; } if (kill(vpid(current), signr) < 0) pr_perror("Can't kill myself, will just exit"); exit_code = 0; } exit((exit_code >> 8) & 0x7f); /* never reached */ BUG_ON(1); return -1; } static int setup_newborn_fds(struct pstree_item *me) { if (clone_service_fd(me)) return -1; if (!me->parent || (rsti(me->parent)->fdt && !(rsti(me)->clone_flags & CLONE_FILES))) { /* * When our parent has shared fd table, some of the table owners * may be already created. Files, they open, will be inherited * by current process, and here we close them. Also, service fds * of parent are closed here. And root_item closes the files, * that were inherited from criu process. */ if (close_old_fds()) return -1; } return 0; } static int check_core(CoreEntry *core, struct pstree_item *me) { int ret = -1; if (core->mtype != CORE_ENTRY__MARCH) { pr_err("Core march mismatch %d\n", (int)core->mtype); goto out; } if (!core->tc) { pr_err("Core task state data missed\n"); goto out; } if (core->tc->task_state != TASK_DEAD) { if (!core->ids && !me->ids) { pr_err("Core IDS data missed for non-zombie\n"); goto out; } if (!CORE_THREAD_ARCH_INFO(core)) { pr_err("Core info data missed for non-zombie\n"); goto out; } /* * Seccomp are moved to per-thread origin, * so for old images we need to move per-task * data into proper place. 
*/ if (core->tc->has_old_seccomp_mode) { core->thread_core->has_seccomp_mode = core->tc->has_old_seccomp_mode; core->thread_core->seccomp_mode = core->tc->old_seccomp_mode; } if (core->tc->has_old_seccomp_filter) { core->thread_core->has_seccomp_filter = core->tc->has_old_seccomp_filter; core->thread_core->seccomp_filter = core->tc->old_seccomp_filter; rsti(me)->has_old_seccomp_filter = true; } } ret = 0; out: return ret; } /* * Find if there are children which are zombies or helpers - processes * which are expected to die during the restore. */ static bool child_death_expected(void) { struct pstree_item *pi; list_for_each_entry(pi, ¤t->children, sibling) { switch (pi->pid->state) { case TASK_DEAD: case TASK_HELPER: return true; } } return false; } static int wait_exiting_children(void) { siginfo_t info; if (!child_death_expected()) { /* * Restoree has no children that should die, during restore, * wait for the next stage on futex. * The default SIGCHLD handler will handle an unexpected * child's death and abort the restore if someone dies. */ restore_finish_stage(task_entries, CR_STATE_RESTORE); return 0; } /* * The restoree has children which will die - decrement itself from * nr. of tasks processing the stage and wait for anyone to die. * Tasks may die only when they're on the following stage. * If one dies earlier - that's unexpected - treat it as an error * and abort the restore. */ if (block_sigmask(NULL, SIGCHLD)) return -1; /* Finish CR_STATE_RESTORE, but do not wait for the next stage. */ futex_dec_and_wake(&task_entries->nr_in_progress); if (waitid(P_ALL, 0, &info, WEXITED | WNOWAIT)) { pr_perror("Failed to wait"); return -1; } if (futex_get(&task_entries->start) == CR_STATE_RESTORE) { pr_err("Child %d died too early\n", info.si_pid); return -1; } if (wait_on_helpers_zombies()) { pr_err("Failed to wait on helpers and zombies\n"); return -1; } return 0; } /* * Restore a helper process - artificially created by criu * to restore attributes of process tree. 
* - sessions for each leaders are dead * - process groups with dead leaders * - dead tasks for which /proc//... is opened by restoring task * - whatnot */ static int restore_one_helper(void) { int i; if (prepare_fds(current)) return -1; if (wait_exiting_children()) return -1; sfds_protected = false; close_image_dir(); close_proc(); for (i = SERVICE_FD_MIN + 1; i < SERVICE_FD_MAX; i++) close_service_fd(i); return 0; } static int restore_one_task(int pid, CoreEntry *core) { int ret; /* No more fork()-s => no more per-pid logs */ if (task_alive(current)) ret = restore_one_alive_task(pid, core); else if (current->pid->state == TASK_DEAD) ret = restore_one_zombie(core); else if (current->pid->state == TASK_HELPER) { ret = restore_one_helper(); } else { pr_err("Unknown state in code %d\n", (int)core->tc->task_state); ret = -1; } if (core) core_entry__free_unpacked(core, NULL); return ret; } /* All arguments should be above stack, because it grows down */ struct cr_clone_arg { struct pstree_item *item; unsigned long clone_flags; CoreEntry *core; }; static void maybe_clone_parent(struct pstree_item *item, struct cr_clone_arg *ca) { /* * zdtm runs in kernel 3.11, which has the problem described below. We * avoid this by including the pdeath_sig test. Once users/zdtm migrate * off of 3.11, this condition can be simplified to just test the * options and not have the pdeath_sig test. */ if (opts.restore_sibling) { /* * This means we're called from lib's criu_restore_child(). * In that case create the root task as the child one to+ * the caller. This is the only way to correctly restore the * pdeath_sig of the root task. But also looks nice. * * Alternatively, if we are --restore-detached, a similar trick is * needed to correctly restore pdeath_sig and prevent processes from * dying once restored. 
* * There were a problem in kernel 3.11 -- CLONE_PARENT can't be * set together with CLONE_NEWPID, which has been solved in further * versions of the kernels, but we treat 3.11 as a base, so at * least warn a user about potential problems. */ rsti(item)->clone_flags |= CLONE_PARENT; if (rsti(item)->clone_flags & CLONE_NEWPID) pr_warn("Set CLONE_PARENT | CLONE_NEWPID but it might cause restore problem," "because not all kernels support such clone flags combinations!\n"); } else if (opts.restore_detach) { if (ca->core->thread_core->pdeath_sig) pr_warn("Root task has pdeath_sig configured, so it will receive one _right_" "after restore on CRIU exit\n"); } } static bool needs_prep_creds(struct pstree_item *item) { /* * Before the 4.13 kernel, it was impossible to set * an exe_file if uid or gid isn't zero. */ return (!item->parent && ((root_ns_mask & CLONE_NEWUSER) || getuid())); } static int set_next_pid(void *arg) { char buf[32]; pid_t *pid = arg; int len; int fd; fd = do_open_proc(PROC_GEN, O_RDWR, LAST_PID_PATH); if (fd < 0) { pr_pwarn("Can't open %d/" LAST_PID_PATH " on procfs", PROC_GEN); \ return -1; } len = snprintf(buf, sizeof(buf), "%d", *pid - 1); if (write(fd, buf, len) != len) { pr_perror("Failed to write %s to /proc/%s", buf, LAST_PID_PATH); close(fd); return -1; } close(fd); return 0; } static inline int fork_with_pid(struct pstree_item *item) { struct cr_clone_arg ca; struct ns_id *pid_ns = NULL; bool external_pidns = false; int ret = -1; pid_t pid = vpid(item); if (item->pid->state != TASK_HELPER) { if (open_core(pid, &ca.core)) return -1; if (check_core(ca.core, item)) return -1; item->pid->state = ca.core->tc->task_state; /* * Zombie tasks' cgroup is not dumped/restored. 
* cg_set == 0 is skipped in prepare_task_cgroup() */ if (item->pid->state == TASK_DEAD) { rsti(item)->cg_set = 0; } else { if (ca.core->thread_core->has_cg_set) rsti(item)->cg_set = ca.core->thread_core->cg_set; else rsti(item)->cg_set = ca.core->tc->cg_set; } if (ca.core->tc->has_stop_signo) item->pid->stop_signo = ca.core->tc->stop_signo; if (item->pid->state != TASK_DEAD && !task_alive(item)) { pr_err("Unknown task state %d\n", item->pid->state); return -1; } /* * By default we assume that seccomp is not * used at all (especially on dead task). Later * we will walk over all threads and check in * details if filter is present setting up * this flag as appropriate. */ rsti(item)->has_seccomp = false; if (unlikely(item == root_item)) maybe_clone_parent(item, &ca); } else { /* * Helper entry will not get moved around and thus * will live in the parent's cgset. */ rsti(item)->cg_set = rsti(item->parent)->cg_set; ca.core = NULL; } if (item->ids) pid_ns = lookup_ns_by_id(item->ids->pid_ns_id, &pid_ns_desc); if (!current && pid_ns && pid_ns->ext_key) external_pidns = true; if (external_pidns) { int fd; /* Not possible to restore into an empty PID namespace. */ if (pid == INIT_PID) { pr_err("Unable to restore into an empty PID namespace\n"); return -1; } fd = inherit_fd_lookup_id(pid_ns->ext_key); if (fd < 0) { pr_err("Unable to find an external pidns: %s\n", pid_ns->ext_key); return -1; } ret = switch_ns_by_fd(fd, &pid_ns_desc, NULL); close(fd); if (ret) { pr_err("Unable to enter existing PID namespace\n"); return -1; } pr_info("Inheriting external pidns %s for %d\n", pid_ns->ext_key, pid); } ca.item = item; ca.clone_flags = rsti(item)->clone_flags; BUG_ON(ca.clone_flags & CLONE_VM); pr_info("Forking task with %d pid (flags 0x%lx)\n", pid, ca.clone_flags); if (!(ca.clone_flags & CLONE_NEWPID)) { lock_last_pid(); if (!kdat.has_clone3_set_tid) { if (external_pidns) { /* * Restoring into another namespace requires a helper * to write to LAST_PID_PATH. 
Using clone3() this is * so much easier and simpler. As long as CRIU supports * clone() this is needed. */ ret = call_in_child_process(set_next_pid, (void *)&pid); } else { ret = set_next_pid((void *)&pid); } if (ret != 0) { pr_warn("Setting PID failed\n"); } } } else { if (!external_pidns) { if (pid != INIT_PID) { pr_err("First PID in a PID namespace needs to be %d and not %d\n", pid, INIT_PID); return -1; } } } if (opts.compress) { // Sync with decompression thread before fork'ing pr_debug("Waiting for decompression thread is completed...\n"); if (decompression_thread_join()) { pr_err("Failed to join decompression thread\n"); return -1; } } if (kdat.has_clone3_set_tid) { ret = clone3_with_pid_noasan(restore_task_with_children, &ca, (ca.clone_flags & ~(CLONE_NEWNET | CLONE_NEWCGROUP | CLONE_NEWTIME)), SIGCHLD, pid); } else { /* * Some kernel modules, such as network packet generator * run kernel thread upon net-namespace creation taking * the @pid we've been requesting via LAST_PID_PATH interface * so that we can't restore a take with pid needed. * * Here is an idea -- unshare net namespace in callee instead. */ /* * The cgroup namespace is also unshared explicitly in the * move_in_cgroup(), so drop this flag here as well. 
*/ close_pid_proc(); ret = clone_noasan(restore_task_with_children, (ca.clone_flags & ~(CLONE_NEWNET | CLONE_NEWCGROUP | CLONE_NEWTIME)) | SIGCHLD, &ca); if (ret < vpid(ca.item)) { const int close_cnt = 20; int cnt = 1024; waitpid(ret, NULL, 0); while (0 < ret && ret < vpid(ca.item) - close_cnt && 0 < --cnt) { ret = syscall(SYS_clone, SIGCHLD, NULL, NULL, 0); if (!ret) { syscall(SYS_exit, NOT_THAT_PID_ECODE); } pr_debug("clone pid %d\n", ret); } while (0 < ret && ret < vpid(ca.item) - close_cnt && 0 < --cnt); if (0 < ret && vpid(ca.item) <= ret + close_cnt) { while (0 < ret && ret < vpid(ca.item)) { ret = clone_noasan(restore_task_with_children, (ca.clone_flags & ~(CLONE_NEWNET | CLONE_NEWCGROUP | CLONE_NEWTIME)) | SIGCHLD, &ca); pr_debug("clone pid 2 %d\n", ret); if (ret != vpid(ca.item)) { waitpid(ret, NULL, 0); } } } } if (ret != vpid(ca.item)) { ret = -1; } } if (ret < 0) { pr_perror("Can't fork for %d", pid); if (errno == EEXIST) set_cr_errno(EEXIST); goto err_unlock; } if (item == root_item) { item->pid->real = ret; pr_debug("PID1: real %d virt %d\n", item->pid->real, vpid(item)); } err_unlock: if (!(ca.clone_flags & CLONE_NEWPID)) unlock_last_pid(); if (ca.core) core_entry__free_unpacked(ca.core, NULL); return ret; } /* Returns 0 if restore can be continued */ static int sigchld_process(int status, pid_t pid) { int sig; if (WIFEXITED(status)) { if (WEXITSTATUS(status) == NOT_THAT_PID_ECODE) { // expected orhpain process/thread return 0; } pr_err("%d exited, status=%d\n", pid, WEXITSTATUS(status)); return -1; } else if (WIFSIGNALED(status)) { sig = WTERMSIG(status); pr_err("%d killed by signal %d: %s\n", pid, sig, strsignal(sig)); return -1; } else if (WIFSTOPPED(status)) { sig = WSTOPSIG(status); /* The root task is ptraced. 
Allow it to handle SIGCHLD */
		if (sig == SIGCHLD && !current) {
			if (ptrace(PTRACE_CONT, pid, 0, SIGCHLD)) {
				pr_perror("Unable to resume %d", pid);
				return -1;
			}
			return 0;
		}
		pr_err("%d stopped by signal %d: %s\n", pid, sig, strsignal(sig));
		return -1;
	} else if (WIFCONTINUED(status)) {
		pr_err("%d unexpectedly continued\n", pid);
		return -1;
	}

	/* None of the WIF* macros matched — report the raw status. */
	pr_err("wait for %d resulted in %x status\n", pid, status);

	return -1;
}

/*
 * SIGCHLD handler used during restore: non-blockingly reap every pending
 * child; if any child's status indicates an unexpected death, abort the
 * restore by aborting the stage futex so all tasks wake up and bail out.
 */
static void sigchld_handler(int signal, siginfo_t *siginfo, void *data)
{
	while (1) {
		int status;
		pid_t pid;

		pid = waitpid(-1, &status, WNOHANG);
		if (pid <= 0)
			return;

		if (sigchld_process(status, pid) < 0)
			goto err_abort;
	}

err_abort:
	futex_abort_and_wake(&task_entries->nr_in_progress);
}

/*
 * Install the restore-time SIGCHLD handler and set the signal mask so
 * that only SIGCHLD can be delivered. Returns 0 on success, -1 on error.
 */
static int criu_signals_setup(void)
{
	int ret;
	struct sigaction act;
	sigset_t blockmask;

	/* Start from the current disposition to preserve inherited flags. */
	ret = sigaction(SIGCHLD, NULL, &act);
	if (ret < 0) {
		pr_perror("sigaction() failed");
		return -1;
	}

	act.sa_flags |= SA_NOCLDSTOP | SA_SIGINFO | SA_RESTART;
	act.sa_sigaction = sigchld_handler;
	sigemptyset(&act.sa_mask);
	sigaddset(&act.sa_mask, SIGCHLD);

	ret = sigaction(SIGCHLD, &act, NULL);
	if (ret < 0) {
		pr_perror("sigaction() failed");
		return -1;
	}

	/*
	 * The block mask will be restored in sigreturn.
	 *
	 * TODO: This code should be removed, when a freezer will be added.
	 */
	sigfillset(&blockmask);
	sigdelset(&blockmask, SIGCHLD);

	/*
	 * Here we use SIG_SETMASK instead of SIG_BLOCK to avoid the case where
	 * we've been forked from a parent who had blocked SIGCHLD. If SIGCHLD
	 * is blocked when a task dies (e.g. if the task fails to restore
	 * somehow), we hang because our SIGCHLD handler is never run. Since we
	 * depend on SIGCHLD being unblocked, let's set the mask explicitly.
	 */
	ret = sigprocmask(SIG_SETMASK, &blockmask, NULL);
	if (ret < 0) {
		pr_perror("Can't block signals");
		return -1;
	}

	return 0;
}

static void restore_sid(void)
{
	pid_t sid;

	/*
	 * SID can only be reset to pid or inherited from parent.
	 * Thus we restore it right here to let our kids inherit
	 * one in case they need it.
* * PGIDs are restored late when all tasks are forked and * we can call setpgid() on custom values. */ if (vpid(current) == current->sid) { pr_info("Restoring %d to %d sid\n", vpid(current), current->sid); sid = setsid(); if (sid != current->sid) { pr_perror("Can't restore sid (%d)", sid); exit(1); } } else { sid = getsid(0); if (sid != current->sid) { /* Skip the root task if it's not init */ if (current == root_item && vpid(root_item) != INIT_PID) return; pr_err("Requested sid %d doesn't match inherited %d\n", current->sid, sid); exit(1); } } } static void restore_pgid(void) { /* * Unlike sessions, process groups (a.k.a. pgids) can be joined * by any task, provided the task with pid == pgid (group leader) * exists. Thus, in order to restore pgid we must make sure that * group leader was born and created the group, then join one. * * We do this _before_ finishing the forking stage to make sure * helpers are still with us. */ pid_t pgid, my_pgid = current->pgid; pr_info("Restoring %d to %d pgid\n", vpid(current), my_pgid); pgid = getpgrp(); if (my_pgid == pgid) return; if (my_pgid != vpid(current)) { struct pstree_item *leader; /* * Wait for leader to become such. * Missing leader means we're going to crtools * group (-j option). 
*/ leader = rsti(current)->pgrp_leader; if (leader) { BUG_ON(my_pgid != vpid(leader)); futex_wait_until(&rsti(leader)->pgrp_set, 1); } } pr_info("\twill call setpgid, mine pgid is %d\n", pgid); if (setpgid(0, my_pgid) != 0) { pr_perror("Can't restore pgid (%d/%d->%d)", vpid(current), pgid, current->pgid); exit(1); } if (my_pgid == vpid(current)) futex_set_and_wake(&rsti(current)->pgrp_set, 1); } static int __legacy_mount_proc(void) { char proc_mountpoint[] = "/tmp/crtools-proc.XXXXXX"; int fd; if (mkdtemp(proc_mountpoint) == NULL) { pr_perror("mkdtemp failed %s", proc_mountpoint); return -1; } pr_info("Mount procfs in %s\n", proc_mountpoint); if (mount("proc", proc_mountpoint, "proc", MS_MGC_VAL | MS_NOSUID | MS_NOEXEC | MS_NODEV, NULL)) { pr_perror("mount failed"); if (rmdir(proc_mountpoint)) pr_perror("Unable to remove %s", proc_mountpoint); return -1; } fd = open_detach_mount(proc_mountpoint); return fd; } static int mount_proc(void) { int fd, ret; if (root_ns_mask == 0) fd = ret = open("/proc", O_DIRECTORY); else { if (kdat.has_fsopen) fd = ret = mount_detached_fs("proc"); else fd = ret = __legacy_mount_proc(); } if (fd >= 0) { ret = set_proc_fd(fd); close(fd); } return ret; } /* * Tasks cannot change sid (session id) arbitrary, but can either * inherit one from ancestor, or create a new one with id equal to * their pid. Thus sid-s restore is tied with children creation. 
*/ static int create_children_and_session(void) { int ret; struct pstree_item *child; pr_info("Restoring children in alien sessions:\n"); list_for_each_entry(child, ¤t->children, sibling) { if (!restore_before_setsid(child)) continue; BUG_ON(child->born_sid != -1 && getsid(0) != child->born_sid); ret = fork_with_pid(child); if (ret < 0) return ret; } if (current->parent) restore_sid(); pr_info("Restoring children in our session:\n"); list_for_each_entry(child, ¤t->children, sibling) { if (restore_before_setsid(child)) continue; ret = fork_with_pid(child); if (ret < 0) return ret; } return 0; } static int restore_task_with_children(void *_arg) { struct cr_clone_arg *ca = _arg; pid_t pid; int ret; current = ca->item; pid = getpid(); if (pid < vpid(current)) { /* Expected pid mismatch, communcate back */ exit(NOT_THAT_PID_ECODE); } if (current != root_item) { char buf[12]; int fd; /* Determine PID in CRIU's namespace */ fd = get_service_fd(CR_PROC_FD_OFF); if (fd < 0) goto err; ret = readlinkat(fd, "self", buf, sizeof(buf) - 1); if (ret < 0) { pr_perror("Unable to read the /proc/self link"); goto err; } buf[ret] = '\0'; current->pid->real = atoi(buf); pr_debug("PID2: real %d virt %d\n", current->pid->real, vpid(current)); } pid = getpid(); if (vpid(current) != pid) { pr_err("Pid %d do not match expected %d\n", pid, vpid(current)); set_task_cr_err(EEXIST); goto err; } if (log_init_by_pid(vpid(current))) goto err; if (current->parent == NULL) { /* * The root task has to be in its namespaces before executing * ACT_SETUP_NS scripts, so the root netns has to be created here */ if (root_ns_mask & CLONE_NEWNET) { struct ns_id *ns = net_get_root_ns(); if (ns->ext_key) ret = net_set_ext(ns); else ret = unshare(CLONE_NEWNET); if (ret) { pr_perror("Can't unshare net-namespace"); goto err; } } if (root_ns_mask & CLONE_NEWTIME) { if (prepare_timens(current->ids->time_ns_id)) goto err; } else if (kdat.has_timens) { if (prepare_timens(0)) goto err; } if (set_opts_cap_eff()) goto 
err; /* Wait prepare_userns */ if (restore_finish_ns_stage(CR_STATE_ROOT_TASK, CR_STATE_PREPARE_NAMESPACES) < 0) goto err; /* * Since we don't support nesting of cgroup namespaces, let's * only set up the cgns (if it exists) in the init task. */ if (prepare_cgroup_namespace(current) < 0) goto err; } if (needs_prep_creds(current) && (prepare_userns_creds())) goto err; /* * Call this _before_ forking to optimize cgroups * restore -- if all tasks live in one set of cgroups * we will only move the root one there, others will * just have it inherited. */ if (restore_task_cgroup(current) < 0) goto err; /* Restore root task */ if (current->parent == NULL) { if (join_namespaces()) { pr_perror("Join namespaces failed"); goto err; } pr_info("Calling restore_sid() for init\n"); restore_sid(); /* * We need non /proc proc mount for restoring pid and mount * namespaces and do not care for the rest of the cases. * Thus -- mount proc at custom location for any new namespace */ if (mount_proc()) goto err; if (!files_collected() && collect_image(&tty_cinfo)) goto err; if (collect_images(before_ns_cinfos, ARRAY_SIZE(before_ns_cinfos))) goto err; if (prepare_namespace(current, ca->clone_flags)) goto err; if (restore_finish_ns_stage(CR_STATE_PREPARE_NAMESPACES, CR_STATE_FORKING) < 0) goto err; if (root_prepare_shared()) goto err; if (populate_root_fd_off()) goto err; } if (setup_newborn_fds(current)) goto err; if (restore_task_mnt_ns(current)) goto err; if (prepare_mappings(current)) goto err; if (prepare_sigactions(ca->core) < 0) goto err; if (fault_injected(FI_RESTORE_ROOT_ONLY)) { pr_info("fault: Restore root task failure!\n"); kill(getpid(), SIGKILL); } if (open_transport_socket()) goto err; timing_start(TIME_FORK); if (create_children_and_session()) goto err; timing_stop(TIME_FORK); if (populate_pid_proc()) goto err; sfds_protected = true; if (unmap_guard_pages(current)) goto err; restore_pgid(); if (current->parent == NULL) { /* * Wait when all tasks passed the CR_STATE_FORKING 
stage. * The stage was started by criu, but now it waits for * the CR_STATE_RESTORE to finish. See comment near the * CR_STATE_FORKING macro for details. * * It means that all tasks entered into their namespaces. */ if (restore_wait_other_tasks()) goto err; fini_restore_mntns(); __restore_switch_stage(CR_STATE_RESTORE); } else { if (restore_finish_stage(task_entries, CR_STATE_FORKING) < 0) goto err; } if (restore_one_task(vpid(current), ca->core)) goto err; return 0; err: if (current->parent == NULL) futex_abort_and_wake(&task_entries->nr_in_progress); exit(1); } static int attach_to_tasks(bool root_seized) { struct pstree_item *item; for_each_pstree_item(item) { int status, i; if (!task_alive(item)) continue; if (item->nr_threads == 1) { item->threads[0].real = item->pid->real; } else { if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) return -1; } for (i = 0; i < item->nr_threads; i++) { pid_t pid = item->threads[i].real; if (item != root_item || !root_seized || i != 0) { if (ptrace(PTRACE_SEIZE, pid, 0, 0)) { pr_perror("Can't attach to %d", pid); return -1; } } if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { pr_perror("Can't interrupt the %d task", pid); return -1; } if (wait4(pid, &status, __WALL, NULL) != pid) { pr_perror("waitpid(%d) failed", pid); return -1; } if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACESYSGOOD)) { pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); return -1; } /* * Suspend seccomp if necessary. We need to do this because * although seccomp is restored at the very end of the * restorer blob (and the final sigreturn is ok), here we're * doing an munmap in the process, which may be blocked by * seccomp and cause the task to be killed. 
*/ if (rsti(item)->has_seccomp && ptrace_suspend_seccomp(pid) < 0) pr_err("failed to suspend seccomp, restore will probably fail...\n"); if (ptrace(PTRACE_CONT, pid, NULL, NULL)) { pr_perror("Unable to resume %d", pid); return -1; } } } return 0; } static int restore_rseq_cs(void) { struct pstree_item *item; for_each_pstree_item(item) { int i; if (!task_alive(item)) continue; if (item->nr_threads == 1) { item->threads[0].real = item->pid->real; } else { if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) { pr_err("restore_rseq_cs: parse_threads failed\n"); return -1; } } for (i = 0; i < item->nr_threads; i++) { pid_t pid = item->threads[i].real; struct rst_rseq *rseqe = rsti(item)->rseqe; if (!rseqe) { pr_err("restore_rseq_cs: rsti(item)->rseqe is NULL\n"); return -1; } if (!rseqe[i].rseq_cs_pointer || !rseqe[i].rseq_abi_pointer) continue; if (ptrace_poke_area( pid, &rseqe[i].rseq_cs_pointer, decode_pointer(rseqe[i].rseq_abi_pointer + offsetof(struct criu_rseq, rseq_cs)), sizeof(uint64_t))) { pr_err("Can't restore rseq_cs pointer (pid: %d)\n", pid); return -1; } } } return 0; } static int catch_tasks(bool root_seized) { struct pstree_item *item; for_each_pstree_item(item) { int status, i, ret; if (!task_alive(item)) continue; if (item->nr_threads == 1) { item->threads[0].real = item->pid->real; } else { if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) return -1; } for (i = 0; i < item->nr_threads; i++) { pid_t pid = item->threads[i].real; if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { pr_pwarn("Can't interrupt the %d task", pid); return -1; } if (wait4(pid, &status, __WALL, NULL) != pid) { pr_perror("waitpid(%d) failed", pid); return -1; } ret = compel_stop_pie(pid, rsti(item)->breakpoint, fault_injected(FI_NO_BREAKPOINTS)); if (ret < 0) return -1; } } return 0; } static void finalize_restore(void) { struct pstree_item *item; for_each_pstree_item(item) { pid_t pid = item->pid->real; struct parasite_ctl *ctl; unsigned long 
restorer_addr; if (!task_alive(item)) continue; /* Unmap the restorer blob */ ctl = compel_prepare_noctx(pid); if (ctl == NULL) continue; restorer_addr = (unsigned long)rsti(item)->munmap_restorer; if (compel_unmap(ctl, restorer_addr)) pr_err("Failed to unmap restorer from %d\n", pid); xfree(ctl); if (opts.final_state == TASK_STOPPED) kill(item->pid->real, SIGSTOP); else if (item->pid->state == TASK_STOPPED) { if (item->pid->stop_signo > 0) kill(item->pid->real, item->pid->stop_signo); else kill(item->pid->real, SIGSTOP); } } } static int finalize_restore_detach(void) { struct pstree_item *item; for_each_pstree_item(item) { pid_t pid; int i; if (!task_alive(item)) continue; for (i = 0; i < item->nr_threads; i++) { pid = item->threads[i].real; if (pid < 0) { pr_err("pstree item has invalid pid %d\n", pid); continue; } if (arch_set_thread_regs_nosigrt(&item->threads[i])) { pr_perror("Restoring regs for %d failed", pid); return -1; } if (ptrace(PTRACE_DETACH, pid, NULL, 0)) { pr_perror("Unable to detach %d", pid); return -1; } } } return 0; } static void ignore_kids(void) { struct sigaction sa = { .sa_handler = SIG_DFL }; if (sigaction(SIGCHLD, &sa, NULL) < 0) pr_perror("Restoring CHLD sigaction failed"); } static unsigned int saved_loginuid; static int prepare_userns_hook(void) { int ret; if (kdat.luid != LUID_FULL) return 0; /* * Save old loginuid and set it to INVALID_UID: * this value means that loginuid is unset and it will be inherited. * After you set some value to /proc/<>/loginuid it can't be changed * inside container due to permissions. * But you still can set this value if it was unset. 
*/ saved_loginuid = parse_pid_loginuid(getpid(), &ret, false); if (ret < 0) return -1; if (prepare_loginuid(INVALID_UID) < 0) { pr_err("Setting loginuid for CT init task failed, CAP_AUDIT_CONTROL?\n"); return -1; } return 0; } static void restore_origin_ns_hook(void) { if (kdat.luid != LUID_FULL) return; /* not critical: it does not affect CT in any way */ if (prepare_loginuid(saved_loginuid) < 0) pr_err("Restore original /proc/self/loginuid failed\n"); } static int write_restored_pid(void) { int pid; if (!opts.pidfile) return 0; pid = root_item->pid->real; if (write_pidfile(pid) < 0) { pr_perror("Can't write pidfile"); return -1; } return 0; } static void reap_zombies(void) { while (1) { pid_t pid = wait(NULL); if (pid == -1) { if (errno != ECHILD) pr_perror("Error while waiting for pids"); return; } } } static int restore_root_task(struct pstree_item *init) { //enum trace_flags flag = TRACE_ALL; int root_seized = 0; bool ptrace_allowed = opts.ptrace_allowed; int ret, fd, mnt_ns_fd = -1; struct pstree_item *item; ret = run_scripts(ACT_PRE_RESTORE); if (ret != 0) { pr_err("Aborting restore due to pre-restore script ret code %d\n", ret); return -1; } fd = open("/proc", O_DIRECTORY | O_RDONLY); if (fd < 0) { pr_perror("Unable to open /proc"); return -1; } ret = install_service_fd(CR_PROC_FD_OFF, fd); if (ret < 0) return -1; /* * FIXME -- currently we assume that all the tasks live * in the same set of namespaces. This is done to debug * the ns contents dumping/restoring. Need to revisit * this later. */ if (prepare_userns_hook()) return -1; if (prepare_namespace_before_tasks()) return -1; if (vpid(init) == INIT_PID) { if (!(root_ns_mask & CLONE_NEWPID)) { pr_err("This process tree can only be restored " "in a new pid namespace.\n" "criu should be re-executed with the " "\"--namespace pid\" option.\n"); return -1; } } else if (root_ns_mask & CLONE_NEWPID) { struct ns_id *ns; /* * Restoring into an existing PID namespace. 
This disables * the check to require a PID 1 when restoring a process * which used to be in a PID namespace. */ ns = lookup_ns_by_id(init->ids->pid_ns_id, &pid_ns_desc); if (!ns || !ns->ext_key) { pr_err("Can't restore pid namespace without the process init\n"); return -1; } } __restore_switch_stage_nw(CR_STATE_ROOT_TASK); ret = fork_with_pid(init); if (ret < 0) goto out; restore_origin_ns_hook(); if (ptrace_allowed && (rsti(init)->clone_flags & CLONE_PARENT)) { struct sigaction act; root_seized = 1; /* * Root task will be our sibling. This means, that * we will not notice when (if) it dies in SIGCHLD * handler, but we should. To do this -- attach to * the guy with ptrace (below) and (!) make the kernel * deliver us the signal when it will get stopped. * It will in case of e.g. segfault before handling * the signal. */ sigaction(SIGCHLD, NULL, &act); act.sa_flags &= ~SA_NOCLDSTOP; sigaction(SIGCHLD, &act, NULL); if (ptrace(PTRACE_SEIZE, init->pid->real, 0, 0)) { pr_warn("Can't seize root task, disabling ptrace\n"); ptrace_allowed = false; } } if (!root_ns_mask) goto skip_ns_bouncing; /* * uid_map and gid_map must be filled from a parent user namespace. * prepare_userns_creds() must be called after filling mappings. */ if ((root_ns_mask & CLONE_NEWUSER) && prepare_userns(init)) goto out_kill; pr_info("Wait until namespaces are created\n"); ret = restore_wait_inprogress_tasks(); if (ret) goto out_kill; ret = run_scripts(ACT_SETUP_NS); if (ret) goto out_kill; ret = restore_switch_stage(CR_STATE_PREPARE_NAMESPACES); if (ret) goto out_kill; if (root_ns_mask & CLONE_NEWNS) { mnt_ns_fd = open_proc(init->pid->real, "ns/mnt"); if (mnt_ns_fd < 0) goto out_kill; } if (root_ns_mask & opts.empty_ns & CLONE_NEWNET) { /* * Local TCP connections were locked by network_lock_internal() * on dump and normally should have been C/R-ed by respectively * dump_iptables() and restore_iptables() in net.c. 
However in * the '--empty-ns net' mode no iptables C/R is done and we * need to return these rules by hands. */ ret = network_lock_internal(); if (ret) goto out_kill; } ret = run_scripts(ACT_POST_SETUP_NS); if (ret) goto out_kill; __restore_switch_stage(CR_STATE_FORKING); skip_ns_bouncing: ret = restore_wait_inprogress_tasks(); if (ret < 0) goto out_kill; ret = apply_memfd_seals(); if (ret < 0) goto out_kill; /* * Zombies die after CR_STATE_RESTORE which is switched * by root task, not by us. See comment before CR_STATE_FORKING * in the header for details. */ for_each_pstree_item(item) { if (item->pid->state == TASK_DEAD) task_entries->nr_threads--; } ret = restore_switch_stage(CR_STATE_RESTORE_SIGCHLD); if (ret < 0) goto out_kill; ret = stop_usernsd(); if (ret < 0) goto out_kill; ret = stop_cgroupd(); if (ret < 0) goto out_kill; ret = move_veth_to_bridge(); if (ret < 0) goto out_kill; ret = prepare_cgroup_properties(); if (ret < 0) goto out_kill; if (fault_injected(FI_POST_RESTORE)) goto out_kill; ret = run_scripts(ACT_POST_RESTORE); if (ret != 0) { pr_err("Aborting restore due to post-restore script ret code %d\n", ret); timing_stop(TIME_RESTORE); write_stats(RESTORE_STATS); goto out_kill; } /* * There is no need to call try_clean_remaps() after this point, * as restore went OK and all ghosts were removed by the openers. */ if (depopulate_roots_yard(mnt_ns_fd, false)) goto out_kill; close_safe(&mnt_ns_fd); if (write_restored_pid()) goto out_kill; /* Unlock network before disabling repair mode on sockets */ network_unlock(); /* * Stop getting sigchld, after we resume the tasks they * may start to exit poking criu in vain. */ ignore_kids(); /* * ------------------------------------------------------------- * Network is unlocked. If something fails below - we lose data * or a connection. 
*/ if (ptrace_allowed && (attach_to_tasks(root_seized) < 0)) { pr_warn("Can't attach to all tasks, disabling ptrace\n"); ptrace_allowed = false; } if (restore_switch_stage(CR_STATE_RESTORE_CREDS)) goto out_kill_network_unlocked; timing_stop(TIME_RESTORE); if (ptrace_allowed && catch_tasks(root_seized)) { pr_warn("Can't catch all tasks, disabling ptrace\n"); ptrace_allowed = false; goto out_kill_network_unlocked; } if (lazy_pages_finish_restore()) goto out_kill_network_unlocked; __restore_switch_stage(CR_STATE_COMPLETE); if (ptrace_allowed) { ret = compel_stop_on_syscall(task_entries->nr_threads, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1)); if (ret) { pr_err("Can't stop all tasks on rt_sigreturn\n"); goto out_kill_network_unlocked; } finalize_restore(); } /* * Some external devices such as GPUs might need a very late * trigger to kick-off some events, memory notifiers and for * restarting the previously restored queues during criu restore * stage. This is needed since criu pie code may shuffle VMAs * around so things such as registering MMU notifiers (for GPU * mapped memory) could be done sanely once the pie code hands * over the control to master process. */ for_each_pstree_item(item) { pr_info("Run late stage hook from criu master for external devices\n"); ret = run_plugins(RESUME_DEVICES_LATE, item->pid->real); /* * This may not really be an error. Only certain plugin hooks * (if available) will return success such as amdgpu_plugin that * validates the pid of the resuming tasks in the kernel mode. * Most of the times, it'll be -ENOTSUP and in few cases, it * might actually be a true error code but that would be also * captured in the plugin so no need to print the error here. 
*/ if (ret < 0) pr_debug("restore late stage hook for external plugin failed\n"); } ret = run_scripts(ACT_PRE_RESUME); if (ret) pr_err("Pre-resume script ret code %d\n", ret); if (restore_freezer_state()) pr_err("Unable to restore freezer state\n"); if (ptrace_allowed) { /* just before releasing threads we have to restore rseq_cs */ if (restore_rseq_cs()) pr_err("Unable to restore rseq_cs state\n"); /* Detaches from processes and they continue run through sigreturn. */ if (finalize_restore_detach()) goto out_kill_network_unlocked; } pr_info("Restore finished successfully. Tasks resumed.\n"); write_stats(RESTORE_STATS); /* This has the effect of dismissing the image streamer */ close_image_dir(); ret = run_scripts(ACT_POST_RESUME); if (ret != 0) pr_err("Post-resume script ret code %d\n", ret); if (!opts.restore_detach && !opts.exec_cmd) { reap_zombies(); } return 0; out_kill_network_unlocked: pr_err("Killing processes because of failure on restore.\nThe Network was unlocked so some data or a connection may have been lost.\n"); out_kill: /* * The processes can be killed only when all of them have been created, * otherwise an external processes can be killed. 
*/
	if (vpid(root_item) == INIT_PID) {
		int status;

		/* Kill init */
		if (root_item->pid->real > 0)
			kill(root_item->pid->real, SIGKILL);

		if (waitpid(root_item->pid->real, &status, 0) < 0)
			pr_warn("Unable to wait %d: %s\n", root_item->pid->real, strerror(errno));
	} else {
		struct pstree_item *pi;

		/* No PID-ns init to reap the tree: kill every restored task individually. */
		for_each_pstree_item(pi)
			if (pi->pid->real > 0)
				kill(pi->pid->real, SIGKILL);
	}
out:
	depopulate_roots_yard(mnt_ns_fd, true);
	stop_usernsd();
	__restore_switch_stage(CR_STATE_FAIL);
	pr_err("Restoring FAILED.\n");
	return -1;
}

/*
 * Allocate the shared task_entries synchronization area in RM_SHREMAP
 * rst_mem and reset its counters.  The start futex begins in
 * CR_STATE_FAIL until restore explicitly switches stages.
 */
int prepare_task_entries(void)
{
	task_entries_pos = rst_mem_align_cpos(RM_SHREMAP);
	task_entries = rst_mem_alloc(sizeof(*task_entries), RM_SHREMAP);
	if (!task_entries) {
		pr_perror("Can't map shmem");
		return -1;
	}

	task_entries->nr_threads = 0;
	task_entries->nr_tasks = 0;
	task_entries->nr_helpers = 0;
	futex_set(&task_entries->start, CR_STATE_FAIL);
	mutex_init(&task_entries->userns_sync_lock);
	mutex_init(&task_entries->last_pid_mutex);

	return 0;
}

/* Read only the saved task state out of a task's core image into pi->pid->state. */
int prepare_dummy_task_state(struct pstree_item *pi)
{
	CoreEntry *core;

	if (open_core(vpid(pi), &core))
		return -1;

	pi->pid->state = core->tc->task_state;
	core_entry__free_unpacked(core, NULL);

	return 0;
}

/*
 * Top-level restore entry point: validate images and options, prepare
 * shared state (pstree, fdstore, cgroups, ...) and hand control to
 * restore_root_task().  Returns 0 on success, non-zero on failure.
 */
int cr_restore_tasks(void)
{
	int ret = -1;

	if (init_service_fd())
		return 1;

	if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
		return -1;

	if (check_img_inventory(/* restore = */ true) < 0)
		goto err;

	if (init_stats(RESTORE_STATS))
		goto err;

	if (lsm_check_opts())
		goto err;

	timing_start(TIME_RESTORE);

	// Automatically detect if decompression is needed
	if (0 == decompression_thread_start()) {
		opts.compress = 1;
	}

	if (cpu_init() < 0)
		goto err;

	if (vdso_init_restore())
		goto err;

	if (tty_init_restore())
		goto err;

	if (opts.cpu_cap & CPU_CAP_IMAGE) {
		if (cpu_validate_cpuinfo())
			goto err;
	}

	if (prepare_task_entries() < 0)
		goto err;

	if (prepare_pstree() < 0)
		goto err;

	if (fdstore_init())
		goto err;

	if (inherit_fd_move_to_fdstore())
		goto err;

	if (crtools_prepare_shared() < 0)
		goto err;

	if (prepare_cgroup())
		goto clean_cgroup;

	if
(criu_signals_setup() < 0)
		goto clean_cgroup;

	if (prepare_lazy_pages_socket() < 0)
		goto clean_cgroup;

	ret = restore_root_task(root_item);
clean_cgroup:
	fini_cgroup();
err:
	cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret);
	/* Drop the temporary decompressed image file, if decompression was used. */
	if (opts.compress) {
		decompression_unlink_tmpfile();
	}
	return ret;
}

/*
 * Find a vma_len-byte address range that is unused in BOTH the current
 * (criu) address space and the address space of the task being
 * restored, so the restorer blob can be placed there without clashing
 * with either set of mappings.  Both lists are sorted; the search walks
 * them in lockstep.  Returns the hint address, or -1 if no gap fits.
 */
static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_head *self_vma_list, long vma_len)
{
	struct vma_area *t_vma, *s_vma;
	long prev_vma_end = 0;
	struct vma_area end_vma;
	VmaEntry end_e;

	/* Sentinel VMA marking the end of the usable address space. */
	end_vma.e = &end_e;
	end_e.start = end_e.end = kdat.task_size;
	prev_vma_end = kdat.mmap_min_addr;

	s_vma = list_first_entry(self_vma_list, struct vma_area, list);
	t_vma = list_first_entry(tgt_vma_list, struct vma_area, list);

	while (1) {
		if (prev_vma_end + vma_len > s_vma->e->start) {
			if (s_vma->list.next == self_vma_list) {
				s_vma = &end_vma;
				continue;
			}
			if (s_vma == &end_vma)
				break;
			if (prev_vma_end < s_vma->e->end)
				prev_vma_end = s_vma->e->end;
			s_vma = vma_next(s_vma);
			continue;
		}

		if (prev_vma_end + vma_len > t_vma->e->start) {
			if (t_vma->list.next == tgt_vma_list) {
				t_vma = &end_vma;
				continue;
			}
			if (t_vma == &end_vma)
				break;
			if (prev_vma_end < t_vma->e->end)
				prev_vma_end = t_vma->e->end;
			t_vma = vma_next(t_vma);
			continue;
		}

		/* Candidate clears both lists' current VMAs -- use it. */
		return prev_vma_end;
	}

	return -1;
}

/* A timeval is valid when seconds are non-negative and usecs < 1 second. */
static inline int timeval_valid(struct timeval *tv)
{
	return (tv->tv_sec >= 0) && ((unsigned long)tv->tv_usec < USEC_PER_SEC);
}

/*
 * Convert an ItimerEntry image (@ie, named @n for logging) into a
 * validated struct itimerval.  A zero interval means the timer was
 * disarmed; returns 0 on success, -1 on invalid values.
 */
static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val)
{
	if (ie->isec == 0 && ie->iusec == 0) {
		memzero_p(val);
		return 0;
	}

	val->it_interval.tv_sec = ie->isec;
	val->it_interval.tv_usec = ie->iusec;

	if (!timeval_valid(&val->it_interval)) {
		pr_err("Invalid timer interval\n");
		return -1;
	}

	if (ie->vsec == 0 && ie->vusec == 0) {
		/*
		 * Remaining time was too short. Set it to
		 * interval to make the timer armed and work.
		 */
		val->it_value.tv_sec = ie->isec;
		val->it_value.tv_usec = ie->iusec;
	} else {
		val->it_value.tv_sec = ie->vsec;
		val->it_value.tv_usec = ie->vusec;
	}

	if (!timeval_valid(&val->it_value)) {
		pr_err("Invalid timer value\n");
		return -1;
	}

	pr_info("Restored %s timer to %ld.%ld -> %ld.%ld\n", n, val->it_value.tv_sec, val->it_value.tv_usec,
		val->it_interval.tv_sec, val->it_interval.tv_usec);

	return 0;
}

/*
 * Legacy itimers restore from CR_FD_ITIMERS
 */
static int prepare_itimers_from_fd(int pid, struct task_restore_args *args)
{
	int ret = -1;
	struct cr_img *img;
	ItimerEntry *ie;

	if (!deprecated_ok("Itimers"))
		return -1;

	img = open_image(CR_FD_ITIMERS, O_RSTR, pid);
	if (!img)
		return -1;

	/* The legacy image carries exactly three entries: real, virt, prof. */
	ret = pb_read_one(img, &ie, PB_ITIMER);
	if (ret < 0)
		goto out;
	ret = decode_itimer("real", ie, &args->itimers[0]);
	itimer_entry__free_unpacked(ie, NULL);
	if (ret < 0)
		goto out;

	ret = pb_read_one(img, &ie, PB_ITIMER);
	if (ret < 0)
		goto out;
	ret = decode_itimer("virt", ie, &args->itimers[1]);
	itimer_entry__free_unpacked(ie, NULL);
	if (ret < 0)
		goto out;

	ret = pb_read_one(img, &ie, PB_ITIMER);
	if (ret < 0)
		goto out;
	ret = decode_itimer("prof", ie, &args->itimers[2]);
	itimer_entry__free_unpacked(ie, NULL);
	if (ret < 0)
		goto out;
out:
	close_image(img);
	return ret;
}

/*
 * Fill args->itimers[] from the core image's timers; fall back to the
 * legacy per-task CR_FD_ITIMERS image when the core carries none.
 */
static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core)
{
	int ret = 0;
	TaskTimersEntry *tte = core->tc->timers;

	if (!tte)
		return prepare_itimers_from_fd(pid, args);

	ret |= decode_itimer("real", tte->real, &args->itimers[0]);
	ret |= decode_itimer("virt", tte->virt, &args->itimers[1]);
	ret |= decode_itimer("prof", tte->prof, &args->itimers[2]);

	return ret;
}

/* A timespec is valid when seconds are non-negative and nsecs < 1 second. */
static inline int timespec_valid(struct timespec *ts)
{
	return (ts->tv_sec >= 0) && ((unsigned long)ts->tv_nsec < NSEC_PER_SEC);
}

/*
 * Convert a PosixTimerEntry image into the restore_posix_timer form
 * consumed by the restorer blob, validating the decoded times.
 */
static inline int decode_posix_timer(PosixTimerEntry *pte, struct restore_posix_timer *pt)
{
	pt->val.it_interval.tv_sec = pte->isec;
	pt->val.it_interval.tv_nsec = pte->insec;

	if (!timespec_valid(&pt->val.it_interval)) {
		pr_err("Invalid timer interval(posix)\n");
		return -1;
	}

	if (pte->vsec == 0 && pte->vnsec == 0) {
		/*
		 * Remaining time was too short. Set it to
		 * interval to make the timer armed and work.
		 */
		pt->val.it_value.tv_sec = pte->isec;
		pt->val.it_value.tv_nsec = pte->insec;
	} else {
		pt->val.it_value.tv_sec = pte->vsec;
		pt->val.it_value.tv_nsec = pte->vnsec;
	}

	if (!timespec_valid(&pt->val.it_value)) {
		pr_err("Invalid timer value(posix)\n");
		return -1;
	}

	/* Copy through the identification/notification fields verbatim. */
	pt->spt.it_id = pte->it_id;
	pt->spt.clock_id = pte->clock_id;
	pt->spt.si_signo = pte->si_signo;
	pt->spt.it_sigev_notify = pte->it_sigev_notify;
	pt->spt.sival_ptr = decode_pointer(pte->sival_ptr);
	pt->spt.notify_thread_id = pte->notify_thread_id;
	pt->overrun = pte->overrun;

	return 0;
}

/* qsort() comparator: order restore_posix_timer entries by timer ID. */
static int cmp_posix_timer_proc_id(const void *p1, const void *p2)
{
	return ((struct restore_posix_timer *)p1)->spt.it_id - ((struct restore_posix_timer *)p2)->spt.it_id;
}

/* Sort the private rst_mem array of posix timers by their kernel timer IDs. */
static void sort_posix_timers(struct task_restore_args *ta)
{
	void *tmem;

	/*
	 * This is required for restorer's create_posix_timers(),
	 * it will probe them one-by-one for the desired ID, since
	 * kernel doesn't provide another API for timer creation
	 * with given ID.
*/ if (ta->posix_timers_n > 0) { tmem = rst_mem_remap_ptr((unsigned long)ta->posix_timers, RM_PRIVATE); qsort(tmem, ta->posix_timers_n, sizeof(struct restore_posix_timer), cmp_posix_timer_proc_id); } } /* * Legacy posix timers restoration from CR_FD_POSIX_TIMERS */ static int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) { struct cr_img *img; int ret = -1; struct restore_posix_timer *t; if (!deprecated_ok("Posix timers")) return -1; img = open_image(CR_FD_POSIX_TIMERS, O_RSTR, pid); if (!img) return -1; ta->posix_timers_n = 0; while (1) { PosixTimerEntry *pte; ret = pb_read_one_eof(img, &pte, PB_POSIX_TIMER); if (ret <= 0) break; t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); if (!t) break; ret = decode_posix_timer(pte, t); if (ret < 0) break; posix_timer_entry__free_unpacked(pte, NULL); ta->posix_timers_n++; } close_image(img); if (!ret) sort_posix_timers(ta); return ret; } static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) { int i, ret = -1; TaskTimersEntry *tte = core->tc->timers; struct restore_posix_timer *t; ta->posix_timers = (struct restore_posix_timer *)rst_mem_align_cpos(RM_PRIVATE); if (!tte) return prepare_posix_timers_from_fd(pid, ta); ta->posix_timers_n = tte->n_posix; for (i = 0; i < ta->posix_timers_n; i++) { t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); if (!t) goto out; if (decode_posix_timer(tte->posix[i], t)) goto out; } ret = 0; sort_posix_timers(ta); out: return ret; } static int prepare_mm(pid_t pid, struct task_restore_args *args) { int exe_fd, i, ret = -1; MmEntry *mm = rsti(current)->mm; args->mm = *mm; args->mm.n_mm_saved_auxv = 0; args->mm.mm_saved_auxv = NULL; if (mm->n_mm_saved_auxv > AT_VECTOR_SIZE) { pr_err("Image corrupted on pid %d\n", pid); goto out; } args->mm_saved_auxv_size = mm->n_mm_saved_auxv * sizeof(auxv_t); for (i = 0; i < mm->n_mm_saved_auxv; ++i) { args->mm_saved_auxv[i] = (auxv_t)mm->mm_saved_auxv[i]; } exe_fd = 
open_reg_by_id(mm->exe_file_id);
	if (exe_fd < 0)
		goto out;

	args->fd_exe_link = exe_fd;
	args->thp_disabled = mm->has_thp_disabled && mm->thp_disabled;
	ret = 0;
out:
	return ret;
}

/* Anonymous copy of the restorer blob and its page-rounded length. */
static void *restorer;
static unsigned long restorer_len;

static int prepare_restorer_blob(void)
{
	/*
	 * We map anonymous mapping, not mremap the restorer itself later.
	 * Otherwise the restorer vma would be tied to criu binary which
	 * in turn will lead to set-exe-file prctl to fail with EBUSY.
	 */
	struct parasite_blob_desc pbd;

	/*
	 * We pass native=true, which is then used to set the value of
	 * pbd.parasite_ip_off. We don't use parasite_ip_off, so the value we
	 * pass as native argument is not relevant.
	 */
	restorer_setup_c_header_desc(&pbd, true);

	/*
	 * args_off is the offset where the binary blob with its GOT table
	 * ends. As we don't do RPC, parasite sections after args_off can be
	 * ignored. See compel_infect() for a description of the parasite
	 * memory layout.
	 */
	restorer_len = round_up(pbd.hdr.args_off, page_size());

	restorer = mmap(NULL, restorer_len, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
	if (restorer == MAP_FAILED) {
		pr_perror("Can't map restorer code");
		return -1;
	}

	memcpy(restorer, pbd.hdr.mem, pbd.hdr.bsize);
	return 0;
}

/* Move the prepared restorer blob to its final address @addr and relocate it. */
static int remap_restorer_blob(void *addr)
{
	struct parasite_blob_desc pbd;
	void *mem;

	mem = mremap(restorer, restorer_len, restorer_len, MREMAP_FIXED | MREMAP_MAYMOVE, addr);
	if (mem != addr) {
		pr_perror("Can't remap restorer blob");
		return -1;
	}

	/*
	 * Pass native=true, which is then used to set the value of
	 * pbd.parasite_ip_off. parasite_ip_off is unused in restorer
	 * as compat (ia32) tasks are restored from native (x86_64)
	 * mode, so the value we pass as native argument is not relevant.
	 */
	restorer_setup_c_header_desc(&pbd, true);
	compel_relocs_apply(addr, addr, &pbd);

	return 0;
}

/*
 * Sanity-check scheduler parameters from the image: nice in [-20, 19],
 * RT policies need prio in (0, 100), the non-RT policies need prio 0.
 * Returns non-zero when the parameters are acceptable.
 */
static int validate_sched_parm(struct rst_sched_param *sp)
{
	if ((sp->nice < -20) || (sp->nice > 19))
		return 0;

	switch (sp->policy) {
	case SCHED_RR:
	case SCHED_FIFO:
		return ((sp->prio > 0) && (sp->prio < 100));
	case SCHED_IDLE:
	case SCHED_OTHER:
	case SCHED_BATCH:
		return sp->prio == 0;
	}

	return 0;
}

/* Extract and validate scheduling policy/nice/prio from a thread core image. */
static int prep_sched_info(struct rst_sched_param *sp, ThreadCoreEntry *tc)
{
	if (!tc->has_sched_policy) {
		sp->policy = SCHED_OTHER;
		sp->nice = 0;
		return 0;
	}

	sp->policy = tc->sched_policy;
	sp->nice = tc->sched_nice;
	sp->prio = tc->sched_prio;

	if (!validate_sched_parm(sp)) {
		pr_err("Inconsistent sched params received (%d.%d.%d)\n", sp->policy, sp->nice, sp->prio);
		return -1;
	}

	return 0;
}

/* Extract a thread's dumped rseq registration from its core image. */
static int prep_rseq(struct rst_rseq_param *rseq, ThreadCoreEntry *tc)
{
	/* compatibility with older CRIU versions */
	if (!tc->rseq_entry)
		return 0;

	rseq->rseq_abi_pointer = tc->rseq_entry->rseq_abi_pointer;
	rseq->rseq_abi_size = tc->rseq_entry->rseq_abi_size;
	rseq->signature = tc->rseq_entry->signature;

	if (rseq->rseq_abi_pointer && !kdat.has_rseq) {
		pr_err("rseq: can't restore as kernel doesn't support it\n");
		return -1;
	}

	return 0;
}

#if defined(__GLIBC__) && defined(RSEQ_SIG)
/* Glibc exports __rseq_offset/__rseq_size directly: use them. */
static void prep_libc_rseq_info(struct rst_rseq_param *rseq)
{
	if (!kdat.has_rseq) {
		rseq->rseq_abi_pointer = 0;
		return;
	}

	if (!kdat.has_ptrace_get_rseq_conf) {
		rseq->rseq_abi_pointer = encode_pointer(__criu_thread_pointer() + __rseq_offset);
		rseq->rseq_abi_size = __rseq_size;
		rseq->signature = RSEQ_SIG;
		return;
	}

	/* Prefer what the kernel reported via ptrace(PTRACE_GET_RSEQ_CONFIGURATION). */
	rseq->rseq_abi_pointer = kdat.libc_rseq_conf.rseq_abi_pointer;
	rseq->rseq_abi_size = kdat.libc_rseq_conf.rseq_abi_size;
	rseq->signature = kdat.libc_rseq_conf.signature;
}
#elif defined(__GLIBC__)
/* Glibc headers without RSEQ_SIG: resolve the rseq symbols at runtime. */
#define NO_RSEQ_OFFSET ((void*)(-1))
#if defined(__x86_64__)
#define CRIU_RSEQ_SIG 0x53053053
#elif defined(__aarch64__)
#ifdef __AARCH64EB__
#define CRIU_RSEQ_SIG 0x00bc28d4
#else // __AARCH64EB__
#define CRIU_RSEQ_SIG 0xd428bc00
#endif // __AARCH64EB__
#elif defined(__PPC64__)
#define CRIU_RSEQ_SIG 0x0fe5000b
#elif defined(__s390x__)
#define CRIU_RSEQ_SIG 0xB2FF0FFF
#else
#error "Unknown arch for RSEQ_SIG"
#endif
static void prep_libc_rseq_info(struct rst_rseq_param *rseq)
{
	/* Cached dlsym() lookups; NO_RSEQ_OFFSET marks "looked up, absent". */
	static ptrdiff_t *rseq_offset_p;
	static unsigned int *rseq_size_p;

	if (!kdat.has_rseq) {
		rseq->rseq_abi_pointer = 0;
		return;
	}

	if (!rseq_offset_p) {
		rseq_offset_p = dlsym(RTLD_DEFAULT, "__rseq_offset");
		rseq_size_p = dlsym(RTLD_DEFAULT, "__rseq_size");
		if (!(rseq_offset_p && rseq_size_p)) {
			rseq_offset_p = NO_RSEQ_OFFSET;
		}
	}

	if (rseq_offset_p == NO_RSEQ_OFFSET) {
		// This glibc version does not use rseq
		rseq->rseq_abi_pointer = 0;
		return;
	}

	rseq->rseq_abi_pointer = encode_pointer(__criu_thread_pointer() + *rseq_offset_p);
	rseq->rseq_abi_size = *rseq_size_p;
	rseq->signature = CRIU_RSEQ_SIG;
}
#else // __GLIBC__
static void prep_libc_rseq_info(struct rst_rseq_param *rseq)
{
	/*
	 * TODO: handle built-in rseq on other libc'ies like musl
	 * We can do that using get_rseq_conf kernel feature.
	 *
	 * For now we just assume that other libc libraries are
	 * not registering rseq by default.
	 */
	rseq->rseq_abi_pointer = 0;
}
#endif // __GLIBC__

/* Image value -1 encodes an unlimited rlimit. */
static rlim_t decode_rlim(rlim_t ival)
{
	return ival == -1 ? RLIM_INFINITY : ival;
}

/*
 * Legacy rlimits restore from CR_FD_RLIMIT
 */
static int prepare_rlimits_from_fd(int pid, struct task_restore_args *ta)
{
	struct rlimit *r;
	int ret;
	struct cr_img *img;

	if (!deprecated_ok("Rlimits"))
		return -1;

	/*
	 * Old image -- read from the file.
*/ img = open_image(CR_FD_RLIMIT, O_RSTR, pid); if (!img) return -1; ta->rlims_n = 0; while (1) { RlimitEntry *re; ret = pb_read_one_eof(img, &re, PB_RLIMIT); if (ret <= 0) break; r = rst_mem_alloc(sizeof(*r), RM_PRIVATE); if (!r) { pr_err("Can't allocate memory for resource %d\n", ta->rlims_n); return -1; } r->rlim_cur = decode_rlim(re->cur); r->rlim_max = decode_rlim(re->max); if (r->rlim_cur > r->rlim_max) { pr_err("Can't restore cur > max for %d.%d\n", pid, ta->rlims_n); r->rlim_cur = r->rlim_max; } rlimit_entry__free_unpacked(re, NULL); ta->rlims_n++; } close_image(img); return 0; } static int prepare_rlimits(int pid, struct task_restore_args *ta, CoreEntry *core) { int i; TaskRlimitsEntry *rls = core->tc->rlimits; struct rlimit64 *r; ta->rlims = (struct rlimit64 *)rst_mem_align_cpos(RM_PRIVATE); if (!rls) return prepare_rlimits_from_fd(pid, ta); for (i = 0; i < rls->n_rlimits; i++) { r = rst_mem_alloc(sizeof(*r), RM_PRIVATE); if (!r) { pr_err("Can't allocate memory for resource %d\n", i); return -1; } r->rlim_cur = decode_rlim(rls->rlimits[i]->cur); r->rlim_max = decode_rlim(rls->rlimits[i]->max); if (r->rlim_cur > r->rlim_max) { pr_warn("Can't restore cur > max for %d.%d\n", pid, i); r->rlim_cur = r->rlim_max; } } ta->rlims_n = rls->n_rlimits; return 0; } static int signal_to_mem(SiginfoEntry *se) { siginfo_t *info, *t; info = (siginfo_t *)se->siginfo.data; t = rst_mem_alloc(sizeof(siginfo_t), RM_PRIVATE); if (!t) return -1; memcpy(t, info, sizeof(*info)); return 0; } static int open_signal_image(int type, pid_t pid, unsigned int *nr) { int ret; struct cr_img *img; img = open_image(type, O_RSTR, pid); if (!img) return -1; *nr = 0; while (1) { SiginfoEntry *se; ret = pb_read_one_eof(img, &se, PB_SIGINFO); if (ret <= 0) break; if (se->siginfo.len != sizeof(siginfo_t)) { pr_err("Unknown image format\n"); ret = -1; break; } ret = signal_to_mem(se); if (ret) break; (*nr)++; siginfo_entry__free_unpacked(se, NULL); } close_image(img); return ret ?: 0; } static int 
prepare_one_signal_queue(SignalQueueEntry *sqe, unsigned int *nr)
{
	int i;

	for (i = 0; i < sqe->n_signals; i++)
		if (signal_to_mem(sqe->signals[i]))
			return -1;

	*nr = sqe->n_signals;

	return 0;
}

static unsigned int *siginfo_priv_nr; /* FIXME -- put directly on thread_args */

/*
 * Queue the shared and per-thread pending signals into rst_mem.  The
 * shared queue is laid out first; per-thread counts are stashed in
 * siginfo_priv_nr[] until the thread args are laid out later.
 */
static int prepare_signals(int pid, struct task_restore_args *ta, CoreEntry *leader_core)
{
	int ret = -1, i;

	ta->siginfo = (siginfo_t *)rst_mem_align_cpos(RM_PRIVATE);
	siginfo_priv_nr = xmalloc(sizeof(int) * current->nr_threads);
	if (siginfo_priv_nr == NULL)
		goto out;

	/* Prepare shared signals */
	if (!leader_core->tc->signals_s) /*backward compatibility*/
		ret = open_signal_image(CR_FD_SIGNAL, pid, &ta->siginfo_n);
	else
		ret = prepare_one_signal_queue(leader_core->tc->signals_s, &ta->siginfo_n);

	if (ret < 0)
		goto out;

	for (i = 0; i < current->nr_threads; i++) {
		if (!current->core[i]->thread_core->signals_p) /*backward compatibility*/
			ret = open_signal_image(CR_FD_PSIGNAL, current->threads[i].ns[0].virt, &siginfo_priv_nr[i]);
		else
			ret = prepare_one_signal_queue(current->core[i]->thread_core->signals_p, &siginfo_priv_nr[i]);
		if (ret < 0)
			goto out;
	}
out:
	return ret;
}

/* Weak no-op stub so non-gcov builds link; real symbol wins when instrumented. */
extern void __gcov_flush(void) __attribute__((weak));
void __gcov_flush(void)
{
}

/*
 * Rebase one thread's creds args from rst_mem positions to real
 * pointers after the final remap, and advance *creds_pos_next to the
 * next thread's blob in the chain.
 */
static void rst_reloc_creds(struct thread_restore_args *thread_args, unsigned long *creds_pos_next)
{
	struct thread_creds_args *args;

	if (unlikely(!*creds_pos_next))
		return;

	args = rst_mem_remap_ptr(*creds_pos_next, RM_PRIVATE);

	if (args->lsm_profile)
		args->lsm_profile = rst_mem_remap_ptr(args->mem_lsm_profile_pos, RM_PRIVATE);
	if (args->lsm_sockcreate)
		args->lsm_sockcreate = rst_mem_remap_ptr(args->mem_lsm_sockcreate_pos, RM_PRIVATE);
	if (args->groups)
		args->groups = rst_mem_remap_ptr(args->mem_groups_pos, RM_PRIVATE);

	*creds_pos_next = args->mem_pos_next;
	thread_args->creds_args = args;
}

/*
 * Return true when the current supplementary groups already equal the
 * @n_groups gids in @groups (so no setgroups() call will be needed).
 */
static bool groups_match(gid_t *groups, int n_groups)
{
	int n, len;
	bool ret;
	gid_t *gids;

	n = getgroups(0, NULL);
	if (n == -1) {
		pr_perror("Failed to get number of supplementary groups");
		return false;
	}
	if (n != n_groups)
		return false;
	if (n == 0)
		return true;

	len = n * sizeof(gid_t);
	gids = xmalloc(len);
	if (gids == NULL)
		return false;
	n = getgroups(n, gids);
	if (n == -1) {
		pr_perror("Failed to get supplementary groups");
		ret = false;
	} else {
		/*
		 * getgroups sorts gids, so it is safe to memcmp gid arrays
		 * NOTE(review): POSIX doesn't document any ordering for
		 * getgroups(); verify this assumption holds, otherwise a
		 * permutation would be treated as a mismatch.
		 */
		ret = !memcmp(gids, groups, len);
	}

	xfree(gids);
	return ret;
}

/*
 * Copy capability words from the image, warning about and clearing any
 * bits above the kernel's highest supported capability (kdat.last_cap).
 */
static void copy_caps(u32 *out_caps, u32 *in_caps, int n_words)
{
	int i, cap_end;

	for (i = kdat.last_cap + 1; i < 32 * n_words; ++i) {
		if (~in_caps[i / 32] & (1 << (i % 32)))
			continue;
		pr_warn("Dropping unsupported capability %d > %d)\n", i, kdat.last_cap);
		/* extra caps will be cleared below */
	}

	n_words = min(n_words, (kdat.last_cap + 31) / 32);
	cap_end = (kdat.last_cap & 31) + 1;
	memcpy(out_caps, in_caps, sizeof(*out_caps) * n_words);
	/* Mask the partial last word, then zero any remaining words. */
	if ((cap_end & 31) && n_words)
		out_caps[n_words - 1] &= (1 << cap_end) - 1;
	memset(out_caps + n_words, 0, sizeof(*out_caps) * (CR_CAP_SIZE - n_words));
}

/*
 * Build a thread_creds_args blob in RM_PRIVATE rst_mem for one thread's
 * CredsEntry (caps, groups, LSM profiles) and chain it after *prev_pos.
 * Returns the blob pointer or an ERR_PTR() code.  NB: every
 * rst_mem_alloc() may remap the area, so @args is re-derived from
 * this_pos after each allocation.
 */
static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos)
{
	unsigned long this_pos;
	struct thread_creds_args *args;

	this_pos = rst_mem_align_cpos(RM_PRIVATE);

	args = rst_mem_alloc(sizeof(*args), RM_PRIVATE);
	if (!args)
		return ERR_PTR(-ENOMEM);

	args->cap_last_cap = kdat.last_cap;
	memcpy(&args->creds, ce, sizeof(args->creds));

	if (ce->lsm_profile || opts.lsm_supplied) {
		char *rendered = NULL, *profile;

		profile = ce->lsm_profile;

		if (validate_lsm(profile) < 0)
			return ERR_PTR(-EINVAL);

		if (profile && render_lsm_profile(profile, &rendered)) {
			return ERR_PTR(-EINVAL);
		}

		if (rendered) {
			size_t lsm_profile_len;
			char *lsm_profile;

			args->mem_lsm_profile_pos = rst_mem_align_cpos(RM_PRIVATE);
			lsm_profile_len = strlen(rendered);
			lsm_profile = rst_mem_alloc(lsm_profile_len + 1, RM_PRIVATE);
			if (!lsm_profile) {
				xfree(rendered);
				return ERR_PTR(-ENOMEM);
			}

			/* rst_mem_alloc() may have remapped the area: refresh @args. */
			args = rst_mem_remap_ptr(this_pos, RM_PRIVATE);
			args->lsm_profile = lsm_profile;
__strlcpy(args->lsm_profile, rendered, lsm_profile_len + 1);
			xfree(rendered);
		}
	} else {
		args->lsm_profile = NULL;
		args->mem_lsm_profile_pos = 0;
	}

	if (ce->lsm_sockcreate) {
		char *rendered = NULL;
		char *profile;

		profile = ce->lsm_sockcreate;

		if (validate_lsm(profile) < 0)
			return ERR_PTR(-EINVAL);

		if (profile && render_lsm_profile(profile, &rendered)) {
			return ERR_PTR(-EINVAL);
		}
		if (rendered) {
			size_t lsm_sockcreate_len;
			char *lsm_sockcreate;

			args->mem_lsm_sockcreate_pos = rst_mem_align_cpos(RM_PRIVATE);
			lsm_sockcreate_len = strlen(rendered);
			lsm_sockcreate = rst_mem_alloc(lsm_sockcreate_len + 1, RM_PRIVATE);
			if (!lsm_sockcreate) {
				xfree(rendered);
				return ERR_PTR(-ENOMEM);
			}

			/* rst_mem_alloc() may have remapped the area: refresh @args. */
			args = rst_mem_remap_ptr(this_pos, RM_PRIVATE);
			args->lsm_sockcreate = lsm_sockcreate;
			__strlcpy(args->lsm_sockcreate, rendered, lsm_sockcreate_len + 1);
			xfree(rendered);
		}
	} else {
		args->lsm_sockcreate = NULL;
		args->mem_lsm_sockcreate_pos = 0;
	}

	/*
	 * Zap fields which we can't use.
	 */
	args->creds.cap_inh = NULL;
	args->creds.cap_eff = NULL;
	args->creds.cap_prm = NULL;
	args->creds.cap_bnd = NULL;
	args->creds.groups = NULL;
	args->creds.lsm_profile = NULL;

	copy_caps(args->cap_inh, ce->cap_inh, ce->n_cap_inh);
	copy_caps(args->cap_eff, ce->cap_eff, ce->n_cap_eff);
	copy_caps(args->cap_prm, ce->cap_prm, ce->n_cap_prm);
	copy_caps(args->cap_bnd, ce->cap_bnd, ce->n_cap_bnd);

	/* Only carry a groups array when the current set doesn't match already. */
	if (ce->n_groups && !groups_match(ce->groups, ce->n_groups)) {
		unsigned int *groups;

		args->mem_groups_pos = rst_mem_align_cpos(RM_PRIVATE);
		groups = rst_mem_alloc(ce->n_groups * sizeof(u32), RM_PRIVATE);
		if (!groups)
			return ERR_PTR(-ENOMEM);
		/* rst_mem_alloc() may have remapped the area: refresh @args. */
		args = rst_mem_remap_ptr(this_pos, RM_PRIVATE);
		args->groups = groups;
		memcpy(args->groups, ce->groups, ce->n_groups * sizeof(u32));
	} else {
		args->groups = NULL;
		args->mem_groups_pos = 0;
	}

	args->mem_pos_next = 0;

	/* Chain this blob after the previous thread's one. */
	if (prev_pos) {
		if (*prev_pos) {
			struct thread_creds_args *prev;

			prev = rst_mem_remap_ptr(*prev_pos, RM_PRIVATE);
			prev->mem_pos_next = this_pos;
		}
		*prev_pos = this_pos;
	}
	return args;
}
/*
 * Legacy path: read a single CredsEntry from the per-task CR_FD_CREDS
 * image and turn it into a thread_creds_args blob in rst_mem.
 */
static int rst_prep_creds_from_img(pid_t pid)
{
	CredsEntry *ce = NULL;
	struct cr_img *img;
	int ret;

	img = open_image(CR_FD_CREDS, O_RSTR, pid);
	if (!img)
		return -ENOENT;

	ret = pb_read_one(img, &ce, PB_CREDS);
	close_image(img);

	if (ret > 0) {
		struct thread_creds_args *args;

		args = rst_prep_creds_args(ce, NULL);
		if (IS_ERR(args))
			ret = PTR_ERR(args);
		else
			ret = 0;
	}
	/* ce stays NULL on read failure; *__free_unpacked(NULL, ...) is a no-op. */
	creds_entry__free_unpacked(ce, NULL);

	return ret;
}

/*
 * Lay out creds args for every thread of the current task in rst_mem,
 * recording the chain's start position in *creds_pos (0 means "none").
 */
static int rst_prep_creds(pid_t pid, CoreEntry *core, unsigned long *creds_pos)
{
	struct thread_creds_args *args = NULL;
	unsigned long this_pos = 0;
	size_t i;

	/*
	 * This is _really_ very old image
	 * format where @thread_core were not
	 * present. It means we don't have
	 * creds either, just ignore and exit
	 * early.
	 */
	if (unlikely(!core->thread_core)) {
		*creds_pos = 0;
		return 0;
	}

	*creds_pos = rst_mem_align_cpos(RM_PRIVATE);

	/*
	 * Old format: one Creds per task carried in own image file.
	 */
	if (!core->thread_core->creds)
		return rst_prep_creds_from_img(pid);

	for (i = 0; i < current->nr_threads; i++) {
		CredsEntry *ce = current->core[i]->thread_core->creds;

		args = rst_prep_creds_args(ce, &this_pos);
		if (IS_ERR(args))
			return PTR_ERR(args);
	}

	return 0;
}

/* Pick the arch-appropriate (native vs compat) unmap trampoline in the blob. */
static void *restorer_munmap_addr(CoreEntry *core, void *restorer_blob)
{
#ifdef CONFIG_COMPAT
	if (core_is_compat(core))
		return restorer_sym(restorer_blob, arch_export_unmap_compat);
#endif
	return restorer_sym(restorer_blob, arch_export_unmap);
}

static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, unsigned long alen, CoreEntry *core)
{
	void *mem = MAP_FAILED;
	void *restore_task_exec_start;

	long new_sp;
	long ret;

	long rst_mem_size;
	long memzone_size;

	struct thread_restore_args *thread_args;
	struct restore_mem_zone *mz;

	struct vdso_maps vdso_maps_rt;
	unsigned long vdso_rt_size = 0;

	struct vm_area_list self_vmas;
	struct vm_area_list *vmas = &rsti(current)->vmas;
	int i, siginfo_n;

	unsigned long creds_pos = 0;
	unsigned long creds_pos_next;

	sigset_t blockmask;

	pr_info("Restore via sigreturn\n");
/* pr_info_vma_list(&self_vma_list); */ BUILD_BUG_ON(sizeof(struct task_restore_args) & 1); BUILD_BUG_ON(sizeof(struct thread_restore_args) & 1); /* * Read creds info for every thread and allocate memory * needed so we can use this data inside restorer. */ if (rst_prep_creds(pid, core, &creds_pos)) goto err_nv; if (current->parent == NULL) { /* Wait when all tasks restored all files */ if (restore_wait_other_tasks()) goto err_nv; if (root_ns_mask & CLONE_NEWNS && remount_readonly_mounts()) goto err_nv; } /* * We're about to search for free VM area and inject the restorer blob * into it. No irrelevant mmaps/mremaps beyond this point, otherwise * this unwanted mapping might get overlapped by the restorer. */ ret = parse_self_maps_lite(&self_vmas); if (ret < 0) goto err; rst_mem_size = rst_mem_lock(); memzone_size = round_up(sizeof(struct restore_mem_zone) * current->nr_threads, page_size()); task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size; BUG_ON(task_args->bootstrap_len & (PAGE_SIZE - 1)); pr_info("%d threads require %ldK of memory\n", current->nr_threads, KBYTES(task_args->bootstrap_len)); if (core_is_compat(core)) vdso_maps_rt = vdso_maps_compat; else vdso_maps_rt = vdso_maps; /* * Figure out how much memory runtime vdso and vvar will need. * Check if vDSO or VVAR is not provided by kernel. */ if (vdso_maps_rt.sym.vdso_size != VDSO_BAD_SIZE) { vdso_rt_size = vdso_maps_rt.sym.vdso_size; if (vdso_maps_rt.sym.vvar_size != VVAR_BAD_SIZE) vdso_rt_size += vdso_maps_rt.sym.vvar_size; } task_args->bootstrap_len += vdso_rt_size; /* * Restorer is a blob (code + args) that will get mapped in some * place, that should _not_ intersect with both -- current mappings * and mappings of the task we're restoring here. The subsequent * call finds the start address for the restorer. * * After the start address is found we populate it with the restorer * parts one by one (some are remap-ed, some are mmap-ed and copied * or inited from scratch). 
*/ mem = (void *)restorer_get_vma_hint(&vmas->h, &self_vmas.h, task_args->bootstrap_len); if (mem == (void *)-1) { pr_err("No suitable area for task_restore bootstrap (%ldK)\n", task_args->bootstrap_len); goto err; } pr_info("Found bootstrap VMA hint at: %p (needs ~%ldK)\n", mem, KBYTES(task_args->bootstrap_len)); ret = remap_restorer_blob(mem); if (ret < 0) goto err; /* * Prepare a memory map for restorer. Note a thread space * might be completely unused so it's here just for convenience. */ task_args->clone_restore_fn = restorer_sym(mem, arch_export_restore_thread); restore_task_exec_start = restorer_sym(mem, arch_export_restore_task); rsti(current)->munmap_restorer = restorer_munmap_addr(core, mem); task_args->bootstrap_start = mem; mem += restorer_len; /* VMA we need for stacks and sigframes for threads */ if (mmap(mem, memzone_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, 0, 0) != mem) { pr_perror("Can't mmap section for restore code"); goto err; } memzero(mem, memzone_size); mz = mem; mem += memzone_size; /* New home for task_restore_args and thread_restore_args */ task_args = mremap(task_args, alen, alen, MREMAP_MAYMOVE | MREMAP_FIXED, mem); if (task_args != mem) { pr_perror("Can't move task args"); goto err; } task_args->rst_mem = mem; task_args->rst_mem_size = rst_mem_size + alen; thread_args = (struct thread_restore_args *)(task_args + 1); /* * And finally -- the rest arguments referenced by task_ and * thread_restore_args. Pointers will get remapped below. */ mem += alen; if (rst_mem_remap(mem)) goto err; /* * At this point we've found a gap in VM that fits in both -- current * and target tasks' mappings -- and its structure is * * | restorer code | memzone (stacks and sigframes) | arguments | * * Arguments is task_restore_args, thread_restore_args-s and all * the bunch of objects allocated with rst_mem_alloc(). * Note, that the task_args itself is inside the 3rd section and (!) 
* it gets unmapped at the very end of __export_restore_task */ task_args->proc_fd = dup(get_service_fd(PROC_FD_OFF)); if (task_args->proc_fd < 0) { pr_perror("can't dup proc fd"); goto err; } task_args->breakpoint = &rsti(current)->breakpoint; task_args->fault_strategy = fi_strategy; sigemptyset(&blockmask); sigaddset(&blockmask, SIGCHLD); if (sigprocmask(SIG_BLOCK, &blockmask, NULL) == -1) { pr_perror("Can not set mask of blocked signals"); return -1; } task_args->task_entries = rst_mem_remap_ptr(task_entries_pos, RM_SHREMAP); task_args->premmapped_addr = (unsigned long)rsti(current)->premmapped_addr; task_args->premmapped_len = rsti(current)->premmapped_len; task_args->task_size = kdat.task_size; #ifdef ARCH_HAS_LONG_PAGES task_args->page_size = PAGE_SIZE; #endif RST_MEM_FIXUP_PPTR(task_args->vmas); RST_MEM_FIXUP_PPTR(task_args->rings); RST_MEM_FIXUP_PPTR(task_args->tcp_socks); RST_MEM_FIXUP_PPTR(task_args->timerfd); RST_MEM_FIXUP_PPTR(task_args->posix_timers); RST_MEM_FIXUP_PPTR(task_args->siginfo); RST_MEM_FIXUP_PPTR(task_args->rlims); RST_MEM_FIXUP_PPTR(task_args->helpers); RST_MEM_FIXUP_PPTR(task_args->zombies); RST_MEM_FIXUP_PPTR(task_args->vma_ios); RST_MEM_FIXUP_PPTR(task_args->inotify_fds); task_args->compatible_mode = core_is_compat(core); /* * Arguments for task restoration. */ BUG_ON(core->mtype != CORE_ENTRY__MARCH); task_args->logfd = log_get_fd(); task_args->loglevel = log_get_loglevel(); log_get_logstart(&task_args->logstart); task_args->sigchld_act = sigchld_act; strncpy(task_args->comm, core->tc->comm, TASK_COMM_LEN - 1); task_args->comm[TASK_COMM_LEN - 1] = 0; prep_libc_rseq_info(&task_args->libc_rseq); task_args->uid = opts.uid; for (i = 0; i < CR_CAP_SIZE; i++) task_args->cap_eff[i] = opts.cap_eff[i]; /* * Fill up per-thread data. 
*/ creds_pos_next = creds_pos; siginfo_n = task_args->siginfo_n; for (i = 0; i < current->nr_threads; i++) { CoreEntry *tcore; struct rt_sigframe *sigframe; #ifdef CONFIG_MIPS k_rtsigset_t mips_blkset; #else k_rtsigset_t *blkset = NULL; #endif thread_args[i].pid = current->threads[i].ns[0].virt; thread_args[i].siginfo_n = siginfo_priv_nr[i]; thread_args[i].siginfo = task_args->siginfo; thread_args[i].siginfo += siginfo_n; siginfo_n += thread_args[i].siginfo_n; /* skip self */ if (thread_args[i].pid == pid) { task_args->t = thread_args + i; tcore = core; #ifdef CONFIG_MIPS mips_blkset.sig[0] = tcore->tc->blk_sigset; mips_blkset.sig[1] = tcore->tc->blk_sigset_extended; #else blkset = (void *)&tcore->tc->blk_sigset; #endif } else { tcore = current->core[i]; if (tcore->thread_core->has_blk_sigset) { #ifdef CONFIG_MIPS mips_blkset.sig[0] = tcore->thread_core->blk_sigset; mips_blkset.sig[1] = tcore->thread_core->blk_sigset_extended; #else blkset = (void *)&tcore->thread_core->blk_sigset; #endif } } if ((tcore->tc || tcore->ids) && thread_args[i].pid != pid) { pr_err("Thread has optional fields present %d\n", thread_args[i].pid); ret = -1; } if (ret < 0) { pr_err("Can't read core data for thread %d\n", thread_args[i].pid); goto err; } thread_args[i].ta = task_args; thread_args[i].gpregs = *CORE_THREAD_ARCH_INFO(tcore)->gpregs; thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr; core_get_tls(tcore, &thread_args[i].tls); if (tcore->thread_core->has_cg_set && rsti(current)->cg_set != tcore->thread_core->cg_set) { thread_args[i].cg_set = tcore->thread_core->cg_set; thread_args[i].cgroupd_sk = dup(get_service_fd(CGROUPD_SK)); } else { thread_args[i].cg_set = -1; } ret = prep_rseq(&thread_args[i].rseq, tcore->thread_core); if (ret) goto err; rst_reloc_creds(&thread_args[i], &creds_pos_next); thread_args[i].futex_rla = tcore->thread_core->futex_rla; thread_args[i].futex_rla_len = tcore->thread_core->futex_rla_len; thread_args[i].pdeath_sig = 
tcore->thread_core->pdeath_sig; if (tcore->thread_core->pdeath_sig > _KNSIG) { pr_err("Pdeath signal is too big\n"); goto err; } ret = prep_sched_info(&thread_args[i].sp, tcore->thread_core); if (ret) goto err; seccomp_rst_reloc(&thread_args[i]); thread_args[i].seccomp_force_tsync = rsti(current)->has_old_seccomp_filter; thread_args[i].mz = mz + i; sigframe = (struct rt_sigframe *)&mz[i].rt_sigframe; #ifdef CONFIG_MIPS if (construct_sigframe(sigframe, sigframe, &mips_blkset, tcore)) #else if (construct_sigframe(sigframe, sigframe, blkset, tcore)) #endif goto err; if (tcore->thread_core->comm) strncpy(thread_args[i].comm, tcore->thread_core->comm, TASK_COMM_LEN - 1); else strncpy(thread_args[i].comm, core->tc->comm, TASK_COMM_LEN - 1); thread_args[i].comm[TASK_COMM_LEN - 1] = 0; if (thread_args[i].pid != pid) core_entry__free_unpacked(tcore, NULL); pr_info("Thread %4d stack %8p rt_sigframe %8p\n", i, mz[i].stack, mz[i].rt_sigframe); } /* * Restorer needs own copy of vdso parameters. Runtime * vdso must be kept non intersecting with anything else, * since we need it being accessible even when own * self-vmas are unmaped. */ mem += rst_mem_size; task_args->vdso_rt_parked_at = (unsigned long)mem; task_args->vdso_maps_rt = vdso_maps_rt; task_args->vdso_rt_size = vdso_rt_size; task_args->can_map_vdso = kdat.can_map_vdso; task_args->has_clone3_set_tid = kdat.has_clone3_set_tid; new_sp = restorer_stack(task_args->t->mz); /* No longer need it */ core_entry__free_unpacked(core, NULL); xfree(current->core); /* * Now prepare run-time data for threads restore. */ task_args->nr_threads = current->nr_threads; task_args->thread_args = thread_args; task_args->auto_dedup = opts.auto_dedup; task_args->mmap_page_image = opts.mmap_page_image; /* * In the restorer we need to know if it is SELinux or not. For SELinux * we must change the process context before creating threads. For * Apparmor we can change each thread after they have been created. 
*/ task_args->lsm_type = kdat.lsm; /* * Make root and cwd restore _that_ late not to break any * attempts to open files by paths above (e.g. /proc). */ if (restore_fs(current)) goto err; sfds_protected = false; close_image_dir(); close_proc(); close_service_fd(TRANSPORT_FD_OFF); close_service_fd(CR_PROC_FD_OFF); close_service_fd(ROOT_FD_OFF); close_service_fd(USERNSD_SK); close_service_fd(FDSTORE_SK_OFF); close_service_fd(RPC_SK_OFF); close_service_fd(CGROUPD_SK); __gcov_flush(); pr_info("task_args: %p\n" "task_args->pid: %d\n" "task_args->nr_threads: %d\n" "task_args->clone_restore_fn: %p\n" "task_args->thread_args: %p\n", task_args, task_args->t->pid, task_args->nr_threads, task_args->clone_restore_fn, task_args->thread_args); /* * An indirect call to task_restore, note it never returns * and restoring core is extremely destructive. */ JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, task_args); err: free_mappings(&self_vmas); err_nv: /* Just to be sure */ exit(1); return -1; } crac-criu-1.5.0/criu/cr-service.c000066400000000000000000001025141471504326700165510ustar00rootroot00000000000000#ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include "version.h" #include "crtools.h" #include "cr_options.h" #include "external.h" #include "util.h" #include "criu-log.h" #include "cpu.h" #include "files.h" #include "pstree.h" #include "cr-service.h" #include "cr-service-const.h" #include "page-xfer.h" #include "protobuf.h" #include "net.h" #include "mount.h" #include "filesystems.h" #include "cgroup.h" #include "cgroup-props.h" #include "action-scripts.h" #include "sockets.h" #include "irmap.h" #include "kerndat.h" #include "proc_parse.h" #include "common/scm.h" #include "uffd.h" #include "pidfd-store.h" #include "setproctitle.h" #include "cr-errno.h" #include "namespaces.h" unsigned int service_sk_ino = -1; static int recv_criu_msg(int socket_fd, 
CriuReq **req) { u8 local[PB_PKOBJ_LOCAL_SIZE]; void *buf = (void *)&local; int len, exit_code = -1; len = recv(socket_fd, NULL, 0, MSG_TRUNC | MSG_PEEK); if (len == -1) { pr_perror("Can't read request"); goto err; } if (len > sizeof(local)) { buf = xmalloc(len); if (!buf) return -ENOMEM; } len = recv(socket_fd, buf, len, MSG_TRUNC); if (len == -1) { pr_perror("Can't read request"); goto err; } if (len == 0) { pr_info("Client exited unexpectedly\n"); errno = ECONNRESET; goto err; } *req = criu_req__unpack(NULL, len, buf); if (!*req) { pr_perror("Failed unpacking request"); goto err; } exit_code = 0; err: if (buf != (void *)&local) xfree(buf); return exit_code; } static int send_criu_msg_with_fd(int socket_fd, CriuResp *msg, int fd) { u8 local[PB_PKOBJ_LOCAL_SIZE]; void *buf = (void *)&local; int len, exit_code = -1; len = criu_resp__get_packed_size(msg); if (len > sizeof(local)) { buf = xmalloc(len); if (!buf) return -ENOMEM; } if (criu_resp__pack(msg, buf) != len) { pr_perror("Failed packing response"); goto err; } if (fd >= 0) exit_code = send_fds(socket_fd, NULL, 0, &fd, 1, buf, len); else exit_code = write(socket_fd, buf, len); if (exit_code < 0) { pr_perror("Can't send response"); goto err; } exit_code = 0; err: if (buf != (void *)&local) xfree(buf); return exit_code; } static int send_criu_msg(int socket_fd, CriuResp *msg) { return send_criu_msg_with_fd(socket_fd, msg, -1); } static void set_resp_err(CriuResp *resp) { resp->cr_errno = get_cr_errno(); resp->has_cr_errno = resp->cr_errno ? 
true : false; resp->cr_errmsg = log_first_err(); } static void send_criu_err(int sk, char *msg) { CriuResp resp = CRIU_RESP__INIT; pr_perror("RPC error: %s", msg); resp.type = CRIU_REQ_TYPE__EMPTY; resp.success = false; set_resp_err(&resp); send_criu_msg(sk, &resp); } int send_criu_dump_resp(int socket_fd, bool success, bool restored) { CriuResp msg = CRIU_RESP__INIT; CriuDumpResp resp = CRIU_DUMP_RESP__INIT; msg.type = CRIU_REQ_TYPE__DUMP; msg.success = success; set_resp_err(&msg); msg.dump = &resp; resp.has_restored = true; resp.restored = restored; return send_criu_msg(socket_fd, &msg); } static int send_criu_pre_dump_resp(int socket_fd, bool success, bool single) { CriuResp msg = CRIU_RESP__INIT; msg.type = single ? CRIU_REQ_TYPE__SINGLE_PRE_DUMP : CRIU_REQ_TYPE__PRE_DUMP; msg.success = success; set_resp_err(&msg); return send_criu_msg(socket_fd, &msg); } int send_criu_restore_resp(int socket_fd, bool success, int pid) { CriuResp msg = CRIU_RESP__INIT; CriuRestoreResp resp = CRIU_RESTORE_RESP__INIT; msg.type = CRIU_REQ_TYPE__RESTORE; msg.success = success; set_resp_err(&msg); msg.restore = &resp; resp.pid = pid; return send_criu_msg(socket_fd, &msg); } int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd) { int ret; CriuResp msg = CRIU_RESP__INIT; CriuReq *req; CriuNotify cn = CRIU_NOTIFY__INIT; msg.type = CRIU_REQ_TYPE__NOTIFY; msg.success = true; msg.notify = &cn; cn.script = name; switch (act) { case ACT_SETUP_NS: case ACT_POST_RESTORE: /* * FIXME pid is required only once on * restore. Need some more sane way of * checking this. 
*/ cn.has_pid = true; cn.pid = root_item->pid->real; break; default: break; } ret = send_criu_msg_with_fd(sk, &msg, fd); if (ret < 0) return ret; ret = recv_criu_msg(sk, &req); if (ret < 0) return ret; if (req->type != CRIU_REQ_TYPE__NOTIFY || !req->notify_success) { pr_err("RPC client reported script error\n"); return -1; } criu_req__free_unpacked(req, NULL); return 0; } int exec_rpc_query_external_files(char *name, int sk) { int i, ret; CriuNotify cn = CRIU_NOTIFY__INIT; CriuResp msg = CRIU_RESP__INIT; CriuReq *req; cn.script = name; msg.type = CRIU_REQ_TYPE__NOTIFY; msg.success = true; msg.notify = &cn; ret = send_criu_msg_with_fd(sk, &msg, -1); if (ret < 0) return ret; ret = recv_criu_msg(sk, &req); if (ret < 0) return ret; if (req->type != CRIU_REQ_TYPE__NOTIFY || !req->notify_success) { pr_err("RPC client reported script error\n"); return -1; } ret = 0; if (req->opts) for (i = 0; i < req->opts->n_external; i++) { char *key = req->opts->external[i]; pr_info("Adding external object: %s\n", key); if (add_external(key)) { pr_err("Failed to add external object: %s\n", key); ret = -1; } } else pr_info("RPC NOTIFY %s: no `opts` returned.\n", name); criu_req__free_unpacked(req, NULL); return ret; } static char images_dir[PATH_MAX]; static int setup_opts_from_req(int sk, CriuOpts *req) { struct ucred ids; struct stat st; socklen_t ids_len = sizeof(struct ucred); char images_dir_path[PATH_MAX]; char work_dir_path[PATH_MAX]; char status_fd[PATH_MAX]; bool output_changed_by_rpc_conf = false; bool work_changed_by_rpc_conf = false; bool imgs_changed_by_rpc_conf = false; int i; bool dummy = false; if (getsockopt(sk, SOL_SOCKET, SO_PEERCRED, &ids, &ids_len)) { pr_perror("Can't get socket options"); goto err; } if (fstat(sk, &st)) { pr_perror("Can't get socket stat"); goto err; } BUG_ON(st.st_ino == -1); service_sk_ino = st.st_ino; /* * Evaluate an additional configuration file if specified. 
* This needs to happen twice, because it is needed early to detect * things like work_dir, imgs_dir and logfile. The second parsing * of the optional RPC configuration file happens at the end and * overwrites all options set via RPC. */ if (req->config_file) { char *tmp_output = opts.output; char *tmp_work = opts.work_dir; char *tmp_imgs = opts.imgs_dir; opts.output = NULL; opts.work_dir = NULL; opts.imgs_dir = NULL; rpc_cfg_file = req->config_file; i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); if (i) { xfree(tmp_output); xfree(tmp_work); xfree(tmp_imgs); goto err; } /* If this is non-NULL, the RPC configuration file had a value, use it.*/ if (opts.output) output_changed_by_rpc_conf = true; /* If this is NULL, use the old value if it was set. */ if (!opts.output && tmp_output) { opts.output = tmp_output; tmp_output = NULL; } if (opts.work_dir) work_changed_by_rpc_conf = true; if (!opts.work_dir && tmp_work) { opts.work_dir = tmp_work; tmp_work = NULL; } if (opts.imgs_dir) imgs_changed_by_rpc_conf = true; /* * As the images directory is a required RPC setting, it is not * necessary to use the value from other configuration files. * Either it is set in the RPC configuration file or it is set * via RPC. */ xfree(tmp_output); xfree(tmp_work); xfree(tmp_imgs); } /* * open images_dir - images_dir_fd is a required RPC parameter * * This assumes that if opts.imgs_dir is set we have a value * from the configuration file parser. The test to see that * imgs_changed_by_rpc_conf is true is used to make sure the value * is from the RPC configuration file. 
* The idea is that only the RPC configuration file is able to * overwrite RPC settings: * * apply_config(global_conf) * * apply_config(user_conf) * * apply_config(environment variable) * * apply_rpc_options() * * apply_config(rpc_conf) */ if (imgs_changed_by_rpc_conf) strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); else if (req->images_dir_fd != -1) sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); else if (req->images_dir) strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); else { pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); goto err; } if (req->parent_img) SET_CHAR_OPTS(img_parent, req->parent_img); /* * Image streaming is not supported with CRIU's service feature as * the streamer must be started for each dump/restore operation. * It is unclear how to do that with RPC, so we punt for now. * This explains why we provide the argument mode=-1 instead of * O_RSTR or O_DUMP. */ if (open_image_dir(images_dir_path, -1) < 0) { pr_perror("Can't open images directory"); goto err; } /* get full path to images_dir to use in process title */ if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { pr_perror("Can't readlink %s", images_dir_path); goto err; } /* chdir to work dir */ if (work_changed_by_rpc_conf) /* Use the value from the RPC configuration file first. */ strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); else if (req->has_work_dir_fd) /* Use the value set via RPC. */ sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); else if (opts.work_dir) /* Use the value from one of the other configuration files. */ strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); else /* Use the images directory a work directory. 
*/ strcpy(work_dir_path, images_dir_path); if (chdir(work_dir_path)) { pr_perror("Can't chdir to work_dir"); goto err; } /* initiate log file in work dir */ if (req->log_file && !output_changed_by_rpc_conf) { /* * If RPC sets a log file and if there nothing from the * RPC configuration file, use the RPC value. */ if (strchr(req->log_file, '/')) { pr_perror("No subdirs are allowed in log_file name"); goto err; } SET_CHAR_OPTS(output, req->log_file); } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { xfree(opts.output); opts.output = NULL; } else if (!opts.output) { SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); } /* This is needed later to correctly set the log_level */ opts.log_level = req->log_level; log_set_loglevel(req->log_level); if (log_init(opts.output) == -1) { pr_perror("Can't initiate log"); goto err; } if (req->config_file) { pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); } if (req->has_unprivileged) opts.unprivileged = req->unprivileged; if (check_caps()) return 1; if (kerndat_init()) return 1; if (log_keep_err()) { pr_perror("Can't tune log"); goto err; } /* checking flags from client */ if (req->has_leave_running && req->leave_running) opts.final_state = TASK_ALIVE; if (req->has_leave_stopped && req->leave_stopped) opts.final_state = TASK_STOPPED; if (!req->has_pid) { req->has_pid = true; req->pid = ids.pid; } if (req->has_ext_unix_sk) { opts.ext_unix_sk = req->ext_unix_sk; for (i = 0; i < req->n_unix_sk_ino; i++) { if (unix_sk_id_add((unsigned int)req->unix_sk_ino[i]->inode) < 0) goto err; } } if (req->root) SET_CHAR_OPTS(root, req->root); if (req->has_rst_sibling) { if (!opts.swrk_restore) { pr_err("rst_sibling is not allowed in standalone service\n"); goto err; } opts.restore_sibling = req->rst_sibling; } if (req->has_tcp_established) opts.tcp_established_ok = req->tcp_established; if (req->has_tcp_skip_in_flight) opts.tcp_skip_in_flight = req->tcp_skip_in_flight; if 
(req->has_tcp_close) opts.tcp_close = req->tcp_close; if (req->has_weak_sysctls) opts.weak_sysctls = req->weak_sysctls; if (req->has_evasive_devices) opts.evasive_devices = req->evasive_devices; if (req->has_shell_job) opts.shell_job = req->shell_job; if (req->has_skip_file_rwx_check) opts.skip_file_rwx_check = req->skip_file_rwx_check; if (req->has_file_locks) opts.handle_file_locks = req->file_locks; if (req->has_track_mem) opts.track_mem = req->track_mem; if (req->has_link_remap) opts.link_remap_ok = req->link_remap; if (req->has_auto_dedup) opts.auto_dedup = req->auto_dedup; if (req->has_force_irmap) opts.force_irmap = req->force_irmap; if (req->n_exec_cmd > 0) { opts.exec_cmd = xmalloc((req->n_exec_cmd + 1) * sizeof(char *)); memcpy(opts.exec_cmd, req->exec_cmd, req->n_exec_cmd * sizeof(char *)); opts.exec_cmd[req->n_exec_cmd] = NULL; } if (req->has_lazy_pages) { opts.lazy_pages = req->lazy_pages; } if (req->has_pre_dump_mode) { switch (req->pre_dump_mode) { case CRIU_PRE_DUMP_MODE__SPLICE: opts.pre_dump_mode = PRE_DUMP_SPLICE; break; case CRIU_PRE_DUMP_MODE__VM_READ: opts.pre_dump_mode = PRE_DUMP_READ; break; default: goto err; } } if (req->has_network_lock) { switch (req->network_lock) { case CRIU_NETWORK_LOCK_METHOD__IPTABLES: opts.network_lock_method = NETWORK_LOCK_IPTABLES; break; case CRIU_NETWORK_LOCK_METHOD__NFTABLES: opts.network_lock_method = NETWORK_LOCK_NFTABLES; break; case CRIU_NETWORK_LOCK_METHOD__SKIP: opts.network_lock_method = NETWORK_LOCK_SKIP; break; default: goto err; } } if (req->ps) { opts.port = (short)req->ps->port; if (!opts.lazy_pages) { opts.use_page_server = true; if (req->ps->address) SET_CHAR_OPTS(addr, req->ps->address); else opts.addr = NULL; if (req->ps->has_fd) { if (!opts.swrk_restore) goto err; opts.ps_socket = req->ps->fd; } } } if (req->notify_scripts && add_rpc_notify(sk)) goto err; for (i = 0; i < req->n_veths; i++) { if (veth_pair_add(req->veths[i]->if_in, req->veths[i]->if_out)) goto err; } for (i = 0; i < 
req->n_ext_mnt; i++) { if (ext_mount_add(req->ext_mnt[i]->key, req->ext_mnt[i]->val)) goto err; } for (i = 0; i < req->n_join_ns; i++) { if (join_ns_add(req->join_ns[i]->ns, req->join_ns[i]->ns_file, req->join_ns[i]->extra_opt)) goto err; } if (req->n_inherit_fd && !opts.swrk_restore) { pr_err("inherit_fd is not allowed in standalone service\n"); goto err; } for (i = 0; i < req->n_inherit_fd; i++) { if (inherit_fd_add(req->inherit_fd[i]->fd, req->inherit_fd[i]->key)) goto err; } for (i = 0; i < req->n_external; i++) if (add_external(req->external[i])) goto err; for (i = 0; i < req->n_cg_root; i++) { if (new_cg_root_add(req->cg_root[i]->ctrl, req->cg_root[i]->path)) goto err; } for (i = 0; i < req->n_enable_fs; i++) { if (!add_fsname_auto(req->enable_fs[i])) goto err; } for (i = 0; i < req->n_skip_mnt; i++) { if (!add_skip_mount(req->skip_mnt[i])) goto err; } if (req->has_cpu_cap) { opts.cpu_cap = req->cpu_cap; opts.cpu_cap |= CPU_CAP_IMAGE; } /* * FIXME: For backward compatibility we setup * soft mode here, need to enhance to support * other modes as well via separate option * probably. */ if (req->has_manage_cgroups) opts.manage_cgroups = req->manage_cgroups ? 
CG_MODE_SOFT : CG_MODE_IGNORE; /* Override the manage_cgroup if mode is set explicitly */ if (req->has_manage_cgroups_mode) { unsigned int mode; switch (req->manage_cgroups_mode) { case CRIU_CG_MODE__IGNORE: mode = CG_MODE_IGNORE; break; case CRIU_CG_MODE__CG_NONE: mode = CG_MODE_NONE; break; case CRIU_CG_MODE__PROPS: mode = CG_MODE_PROPS; break; case CRIU_CG_MODE__SOFT: mode = CG_MODE_SOFT; break; case CRIU_CG_MODE__FULL: mode = CG_MODE_FULL; break; case CRIU_CG_MODE__STRICT: mode = CG_MODE_STRICT; break; case CRIU_CG_MODE__DEFAULT: mode = CG_MODE_DEFAULT; break; default: goto err; } opts.manage_cgroups = mode; } if (req->freeze_cgroup) SET_CHAR_OPTS(freeze_cgroup, req->freeze_cgroup); if (req->lsm_profile) { opts.lsm_supplied = true; SET_CHAR_OPTS(lsm_profile, req->lsm_profile); } if (req->lsm_mount_context) SET_CHAR_OPTS(lsm_mount_context, req->lsm_mount_context); if (req->has_timeout) opts.timeout = req->timeout; if (req->cgroup_props) SET_CHAR_OPTS(cgroup_props, req->cgroup_props); if (req->cgroup_props_file) SET_CHAR_OPTS(cgroup_props_file, req->cgroup_props_file); for (i = 0; i < req->n_cgroup_dump_controller; i++) { if (!cgp_add_dump_controller(req->cgroup_dump_controller[i])) goto err; } if (req->cgroup_yard) SET_CHAR_OPTS(cgroup_yard, req->cgroup_yard); if (req->tls_cacert) SET_CHAR_OPTS(tls_cacert, req->tls_cacert); if (req->tls_cacrl) SET_CHAR_OPTS(tls_cacrl, req->tls_cacrl); if (req->tls_cert) SET_CHAR_OPTS(tls_cert, req->tls_cert); if (req->tls_key) SET_CHAR_OPTS(tls_key, req->tls_key); if (req->tls) opts.tls = req->tls; if (req->tls_no_cn_verify) opts.tls_no_cn_verify = req->tls_no_cn_verify; if (req->has_auto_ext_mnt) opts.autodetect_ext_mounts = req->auto_ext_mnt; if (req->has_ext_sharing) opts.enable_external_sharing = req->ext_sharing; if (req->has_ext_masters) opts.enable_external_masters = req->ext_masters; if (req->has_ghost_limit) opts.ghost_limit = req->ghost_limit; if (req->has_empty_ns) { opts.empty_ns = req->empty_ns; if (req->empty_ns & 
~(CLONE_NEWNET)) goto err; } if (req->n_irmap_scan_paths) { for (i = 0; i < req->n_irmap_scan_paths; i++) { if (irmap_scan_path_add(req->irmap_scan_paths[i])) goto err; } } if (req->has_status_fd) { pr_warn("status_fd is obsoleted; use status-ready notification instead\n"); sprintf(status_fd, "/proc/%d/fd/%d", ids.pid, req->status_fd); opts.status_fd = open(status_fd, O_WRONLY); if (opts.status_fd < 0) { pr_perror("Can't reopen status fd %s", status_fd); goto err; } } if (req->has_pidfd_store_sk && init_pidfd_store_sk(ids.pid, req->pidfd_store_sk)) goto err; if (req->orphan_pts_master) opts.orphan_pts_master = true; if (req->has_display_stats) opts.display_stats = req->display_stats; /* Evaluate additional configuration file a second time to overwrite * all RPC settings. */ if (req->config_file) { rpc_cfg_file = req->config_file; i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); if (i) goto err; } if (req->mntns_compat_mode) opts.mntns_compat_mode = true; log_set_loglevel(opts.log_level); if (check_options()) goto err; return 0; err: set_cr_errno(EBADRQC); return -1; } static int dump_using_req(int sk, CriuOpts *req) { bool success = false; bool self_dump = !req->pid; opts.mode = CR_DUMP; if (setup_opts_from_req(sk, req)) goto exit; __setproctitle("dump --rpc -t %d -D %s", req->pid, images_dir); if (init_pidfd_store_hash()) goto pidfd_store_err; /* * FIXME -- cr_dump_tasks() may return code from custom * scripts, that can be positive. However, right now we * don't have ability to push scripts via RPC, so positive * ret values are impossible here. */ if (cr_dump_tasks(req->pid)) goto exit; success = true; exit: free_pidfd_store(); pidfd_store_err: if (req->leave_running || !self_dump || !success) { if (send_criu_dump_resp(sk, success, false) == -1) { pr_perror("Can't send response"); success = false; } } return success ? 
0 : 1; } static int restore_using_req(int sk, CriuOpts *req) { bool success = false; /* * We can't restore processes under arbitrary task yet. * Thus for now we force the detached restore under the * cr service task. */ opts.restore_detach = true; opts.mode = CR_RESTORE; if (setup_opts_from_req(sk, req)) goto exit; __setproctitle("restore --rpc -D %s", images_dir); if (cr_restore_tasks()) goto exit; success = true; exit: if (send_criu_restore_resp(sk, success, root_item ? root_item->pid->real : -1) == -1) { pr_perror("Can't send response"); success = false; } if (success && opts.exec_cmd) { int logfd; logfd = log_get_fd(); if (dup2(logfd, STDOUT_FILENO) == -1 || dup2(logfd, STDERR_FILENO) == -1) { pr_perror("Failed to redirect stdout and stderr to the logfile"); return 1; } close_pid_proc(); close(sk); execvp(opts.exec_cmd[0], opts.exec_cmd); pr_perror("Failed to exec cmd %s", opts.exec_cmd[0]); success = false; } return success ? 0 : 1; } static int check(int sk, CriuOpts *req) { int pid, status; CriuResp resp = CRIU_RESP__INIT; resp.type = CRIU_REQ_TYPE__CHECK; pid = fork(); if (pid < 0) { pr_perror("Can't fork"); goto out; } if (pid == 0) { __setproctitle("check --rpc"); opts.mode = CR_CHECK; if (setup_opts_from_req(sk, req)) exit(1); exit(!!cr_check()); } if (waitpid(pid, &status, 0) != pid) { pr_perror("Unable to wait %d", pid); goto out; } if (status) goto out; resp.success = true; out: return send_criu_msg(sk, &resp); } static int pre_dump_using_req(int sk, CriuOpts *req, bool single) { int pid, status; bool success = false; pid = fork(); if (pid < 0) { pr_perror("Can't fork"); goto out; } if (pid == 0) { int ret = 1; opts.mode = CR_PRE_DUMP; if (setup_opts_from_req(sk, req)) goto cout; __setproctitle("pre-dump --rpc -t %d -D %s", req->pid, images_dir); if (init_pidfd_store_hash()) goto pidfd_store_err; if (cr_pre_dump_tasks(req->pid)) goto cout; ret = 0; cout: free_pidfd_store(); pidfd_store_err: exit(ret); } if (waitpid(pid, &status, 0) != pid) { 
pr_perror("Unable to wait %d", pid); goto out; } if (status != 0) goto out; success = true; out: if (send_criu_pre_dump_resp(sk, success, single) == -1) { pr_perror("Can't send pre-dump resp"); success = false; } return success ? 0 : -1; } static int pre_dump_loop(int sk, CriuReq *msg) { int ret; do { ret = pre_dump_using_req(sk, msg->opts, false); if (ret < 0) return ret; criu_req__free_unpacked(msg, NULL); if (recv_criu_msg(sk, &msg) == -1) { pr_perror("Can't recv request"); return -1; } } while (msg->type == CRIU_REQ_TYPE__PRE_DUMP); if (msg->type != CRIU_REQ_TYPE__DUMP) { send_criu_err(sk, "Bad req seq"); return -1; } return dump_using_req(sk, msg->opts); } static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) { int ret = -1, pid, start_pipe[2]; ssize_t count; bool success = false; CriuResp resp = CRIU_RESP__INIT; CriuPageServerInfo ps = CRIU_PAGE_SERVER_INFO__INIT; struct ps_info info; if (pipe(start_pipe)) { pr_perror("No start pipe"); goto out; } pid = fork(); if (pid == 0) { close(start_pipe[0]); opts.mode = CR_PAGE_SERVER; if (setup_opts_from_req(sk, req)) goto out_ch; __setproctitle("page-server --rpc --address %s --port %hu", opts.addr, opts.port); pr_debug("Starting page server\n"); pid = cr_page_server(daemon_mode, false, start_pipe[1]); if (pid < 0) goto out_ch; if (daemon_mode) { info.pid = pid; info.port = opts.port; count = write(start_pipe[1], &info, sizeof(info)); if (count != sizeof(info)) goto out_ch; } ret = 0; out_ch: if (daemon_mode && ret < 0 && pid > 0) kill(pid, SIGKILL); close(start_pipe[1]); exit(ret); } close(start_pipe[1]); if (daemon_mode) { if (waitpid(pid, &ret, 0) != pid) { pr_perror("Unable to wait %d", pid); goto out; } if (WIFEXITED(ret)) { if (WEXITSTATUS(ret)) { pr_err("Child exited with an error\n"); goto out; } } else { pr_err("Child wasn't terminated normally\n"); goto out; } } count = read(start_pipe[0], &info, sizeof(info)); close(start_pipe[0]); if (count != sizeof(info)) goto out; ps.pid = info.pid; 
ps.has_port = true; ps.port = info.port; success = true; ps.has_pid = true; resp.ps = &ps; pr_debug("Page server started\n"); out: resp.type = CRIU_REQ_TYPE__PAGE_SERVER; resp.success = success; return send_criu_msg(sk, &resp); } static int chk_keepopen_req(CriuReq *msg) { if (!msg->keep_open) return 0; /* * Service may (well, it will) leave some * resources leaked after processing e.g. * dump or restore requests. Before we audit * the code for this, let's first enable * mreq RPCs for those requests we know do * good work */ if (msg->type == CRIU_REQ_TYPE__PAGE_SERVER) /* This just fork()-s so no leaks */ return 0; if (msg->type == CRIU_REQ_TYPE__PAGE_SERVER_CHLD) /* This just fork()-s so no leaks */ return 0; else if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP || msg->type == CRIU_REQ_TYPE__CPUINFO_CHECK) return 0; else if (msg->type == CRIU_REQ_TYPE__FEATURE_CHECK) return 0; else if (msg->type == CRIU_REQ_TYPE__VERSION) return 0; return -1; } /* * Return the version information, depending on the information * available in version.h */ static int handle_version(int sk, CriuReq *msg) { CriuResp resp = CRIU_RESP__INIT; CriuVersion version = CRIU_VERSION__INIT; /* This assumes we will always have a major and minor version */ version.major_number = CRIU_VERSION_MAJOR; version.minor_number = CRIU_VERSION_MINOR; if (strcmp(CRIU_GITID, "0")) { version.gitid = CRIU_GITID; } #ifdef CRIU_VERSION_SUBLEVEL version.has_sublevel = 1; version.sublevel = CRIU_VERSION_SUBLEVEL; #endif #ifdef CRIU_VERSION_EXTRA version.has_extra = 1; version.extra = CRIU_VERSION_EXTRA; #endif #ifdef CRIU_VERSION_NAME /* This is not actually exported in version.h */ version.name = CRIU_VERSION_NAME; #endif resp.type = msg->type; resp.success = true; resp.version = &version; return send_criu_msg(sk, &resp); } /* * Generic function to handle CRIU_REQ_TYPE__FEATURE_CHECK. * * The function will have resp.success = true for most cases * and the actual result will be in resp.features. 
* * For each feature which has been requested in msg->features * the corresponding parameter will be set in resp.features. */ static int handle_feature_check(int sk, CriuReq *msg) { CriuResp resp = CRIU_RESP__INIT; CriuFeatures feat = CRIU_FEATURES__INIT; int pid, status; int ret; /* enable setting of an optional message */ feat.has_mem_track = 1; feat.mem_track = false; feat.has_lazy_pages = 1; feat.lazy_pages = false; feat.has_pidfd_store = 1; feat.pidfd_store = false; pid = fork(); if (pid < 0) { pr_perror("Can't fork"); goto out; } if (pid == 0) { if (kerndat_init()) exit(1); __setproctitle("feature-check --rpc"); if ((msg->features->has_mem_track == 1) && (msg->features->mem_track == true)) feat.mem_track = kdat.has_dirty_track; if ((msg->features->has_lazy_pages == 1) && (msg->features->lazy_pages == true)) feat.lazy_pages = kdat.has_uffd && uffd_noncooperative(); if ((msg->features->has_pidfd_store == 1) && (msg->features->pidfd_store == true)) feat.pidfd_store = kdat.has_pidfd_getfd && kdat.has_pidfd_open; resp.features = &feat; resp.type = msg->type; /* The feature check is working, actual results are in resp.features */ resp.success = true; /* * If this point is reached the information about the features * is transmitted from the forked CRIU process (here). * If an error occurred earlier, the feature check response will be * be send from the parent process. */ ret = send_criu_msg(sk, &resp); exit(!!ret); } if (waitpid(pid, &status, 0) != pid) { pr_perror("Unable to wait %d", pid); goto out; } if (status != 0) goto out; return 0; /* * The child process was not able to send an answer. Tell * the RPC client that something did not work as expected. 
*/ out: resp.type = msg->type; resp.success = false; return send_criu_msg(sk, &resp); } static int handle_wait_pid(int sk, int pid) { CriuResp resp = CRIU_RESP__INIT; bool success = false; int status; if (waitpid(pid, &status, 0) == -1) { resp.cr_errno = errno; pr_perror("Unable to wait %d", pid); goto out; } resp.status = status; resp.has_status = true; success = true; out: resp.type = CRIU_REQ_TYPE__WAIT_PID; resp.success = success; return send_criu_msg(sk, &resp); } static int handle_cpuinfo(int sk, CriuReq *msg) { CriuResp resp = CRIU_RESP__INIT; bool success = false; int pid, status; pid = fork(); if (pid < 0) { pr_perror("Can't fork"); goto out; } if (pid == 0) { int ret = 1; opts.mode = CR_CPUINFO; if (setup_opts_from_req(sk, msg->opts)) goto cout; __setproctitle("cpuinfo %s --rpc -D %s", msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ? "dump" : "check", images_dir); if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ret = cpuinfo_dump(); else ret = cpuinfo_check(); cout: exit(ret); } if (waitpid(pid, &status, 0) != pid) { pr_perror("Unable to wait %d", pid); goto out; } if (!WIFEXITED(status)) goto out; switch (WEXITSTATUS(status)) { case (-ENOTSUP & 0xff): resp.has_cr_errno = 1; /* * Let's return the actual error code and * not just (-ENOTSUP & 0xff) */ resp.cr_errno = ENOTSUP; break; case 0: success = true; break; default: break; } out: resp.type = msg->type; resp.success = success; return send_criu_msg(sk, &resp); } int cr_service_work(int sk) { int ret = -1; CriuReq *msg = 0; more: opts.mode = CR_SWRK; if (recv_criu_msg(sk, &msg) != 0) { pr_perror("Can't recv request"); goto err; } if (chk_keepopen_req(msg)) goto err; switch (msg->type) { case CRIU_REQ_TYPE__DUMP: ret = dump_using_req(sk, msg->opts); break; case CRIU_REQ_TYPE__RESTORE: ret = restore_using_req(sk, msg->opts); break; case CRIU_REQ_TYPE__CHECK: ret = check(sk, msg->opts); break; case CRIU_REQ_TYPE__PRE_DUMP: ret = pre_dump_loop(sk, msg); break; case CRIU_REQ_TYPE__PAGE_SERVER: ret = 
start_page_server_req(sk, msg->opts, true); break; case CRIU_REQ_TYPE__PAGE_SERVER_CHLD: ret = start_page_server_req(sk, msg->opts, false); break; case CRIU_REQ_TYPE__WAIT_PID: ret = handle_wait_pid(sk, msg->pid); break; case CRIU_REQ_TYPE__CPUINFO_DUMP: case CRIU_REQ_TYPE__CPUINFO_CHECK: ret = handle_cpuinfo(sk, msg); break; case CRIU_REQ_TYPE__FEATURE_CHECK: ret = handle_feature_check(sk, msg); break; case CRIU_REQ_TYPE__VERSION: ret = handle_version(sk, msg); break; case CRIU_REQ_TYPE__SINGLE_PRE_DUMP: ret = pre_dump_using_req(sk, msg->opts, true); break; default: send_criu_err(sk, "Invalid req"); break; } if (!ret && msg->keep_open) { criu_req__free_unpacked(msg, NULL); ret = -1; goto more; } err: return ret; } static void reap_worker(int signo) { int saved_errno; int status; pid_t pid; saved_errno = errno; /* * As we block SIGCHLD, lets wait for every child that has * already changed state. */ while (1) { pid = waitpid(-1, &status, WNOHANG); if (pid <= 0) { errno = saved_errno; return; } if (WIFEXITED(status)) pr_info("Worker(pid %d) exited with %d\n", pid, WEXITSTATUS(status)); else if (WIFSIGNALED(status)) pr_info("Worker(pid %d) was killed by %d: %s\n", pid, WTERMSIG(status), strsignal(WTERMSIG(status))); } } static int setup_sigchld_handler(void) { struct sigaction action; sigemptyset(&action.sa_mask); sigaddset(&action.sa_mask, SIGCHLD); action.sa_handler = reap_worker; action.sa_flags = SA_RESTART; if (sigaction(SIGCHLD, &action, NULL)) { pr_perror("Can't setup SIGCHLD handler"); return -1; } return 0; } static int restore_sigchld_handler(void) { struct sigaction action; sigemptyset(&action.sa_mask); sigaddset(&action.sa_mask, SIGCHLD); action.sa_handler = SIG_DFL; action.sa_flags = SA_RESTART; if (sigaction(SIGCHLD, &action, NULL)) { pr_perror("Can't restore SIGCHLD handler"); return -1; } return 0; } int cr_service(bool daemon_mode) { int server_fd = -1; int child_pid; struct sockaddr_un client_addr; socklen_t client_addr_len; { struct sockaddr_un 
server_addr; socklen_t server_addr_len; server_fd = socket(AF_LOCAL, SOCK_SEQPACKET, 0); if (server_fd == -1) { pr_perror("Can't initialize service socket"); goto err; } memset(&server_addr, 0, sizeof(server_addr)); memset(&client_addr, 0, sizeof(client_addr)); server_addr.sun_family = AF_LOCAL; if (opts.addr == NULL) { pr_warn("Binding to local dir address!\n"); SET_CHAR_OPTS(addr, CR_DEFAULT_SERVICE_ADDRESS); } strncpy(server_addr.sun_path, opts.addr, sizeof(server_addr.sun_path) - 1); server_addr_len = strlen(server_addr.sun_path) + sizeof(server_addr.sun_family); client_addr_len = sizeof(client_addr); unlink(server_addr.sun_path); if (bind(server_fd, (struct sockaddr *)&server_addr, server_addr_len) == -1) { pr_perror("Can't bind"); goto err; } pr_info("The service socket is bound to %s\n", server_addr.sun_path); /* change service socket permissions, so anyone can connect to it */ if (chmod(server_addr.sun_path, 0666)) { pr_perror("Can't change permissions of the service socket"); goto err; } if (listen(server_fd, 16) == -1) { pr_perror("Can't listen for socket connections"); goto err; } } if (daemon_mode) { if (daemon(1, 0) == -1) { pr_perror("Can't run service server in the background"); goto err; } } if (opts.pidfile) { if (write_pidfile(getpid()) == -1) { pr_perror("Can't write pidfile"); goto err; } } if (setup_sigchld_handler()) goto err; if (status_ready()) goto err; while (1) { int sk; pr_info("Waiting for connection...\n"); sk = accept(server_fd, (struct sockaddr *)&client_addr, &client_addr_len); if (sk == -1) { pr_perror("Can't accept connection"); goto err; } pr_info("Connected.\n"); child_pid = fork(); if (child_pid == 0) { int ret; if (restore_sigchld_handler()) exit(1); close(server_fd); init_opts(); ret = cr_service_work(sk); close(sk); exit(ret != 0); } if (child_pid < 0) pr_perror("Can't fork a child"); close(sk); } err: close_safe(&server_fd); return 1; } 
crac-criu-1.5.0/criu/crtools.c000066400000000000000000000533141471504326700161770ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "int.h" #include "page.h" #include "common/compiler.h" #include "crtools.h" #include "cr_options.h" #include "external.h" #include "files.h" #include "sk-inet.h" #include "net.h" #include "page-xfer.h" #include "tty.h" #include "file-lock.h" #include "cr-service.h" #include "plugin.h" #include "criu-log.h" #include "util.h" #include "protobuf-desc.h" #include "namespaces.h" #include "cgroup.h" #include "cpu.h" #include "fault-injection.h" #include "proc_parse.h" #include "kerndat.h" #include "setproctitle.h" #include "sysctl.h" void flush_early_log_to_stderr(void) __attribute__((destructor)); void flush_early_log_to_stderr(void) { flush_early_log_buffer(STDERR_FILENO); } static int image_dir_mode(char *argv[], int optind) { switch (opts.mode) { case CR_DUMP: /* fallthrough */ case CR_PRE_DUMP: return O_DUMP; case CR_RESTORE: return O_RSTR; case CR_CPUINFO: if (!strcmp(argv[optind + 1], "dump")) return O_DUMP; /* fallthrough */ default: return -1; } /* never reached */ BUG(); return -1; } static int parse_criu_mode(char *mode) { if (!strcmp(mode, "dump")) opts.mode = CR_DUMP; else if (!strcmp(mode, "pre-dump")) opts.mode = CR_PRE_DUMP; else if (!strcmp(mode, "restore")) opts.mode = CR_RESTORE; else if (!strcmp(mode, "lazy-pages")) opts.mode = CR_LAZY_PAGES; else if (!strcmp(mode, "check")) opts.mode = CR_CHECK; else if (!strcmp(mode, "page-server")) opts.mode = CR_PAGE_SERVER; else if (!strcmp(mode, "service")) opts.mode = CR_SERVICE; else if (!strcmp(mode, "swrk")) opts.mode = CR_SWRK; else if (!strcmp(mode, "dedup")) opts.mode = CR_DEDUP; else if (!strcmp(mode, "cpuinfo")) opts.mode = CR_CPUINFO; else if (!strcmp(mode, "exec")) opts.mode = CR_EXEC_DEPRECATED; else if (!strcmp(mode, "show")) opts.mode = 
CR_SHOW_DEPRECATED; else return -1; return 0; } int main(int argc, char *argv[], char *envp[]) { int ret = -1; bool usage_error = true; bool has_exec_cmd = false; bool has_sub_command; int state = PARSING_GLOBAL_CONF; BUILD_BUG_ON(CTL_32 != SYSCTL_TYPE__CTL_32); BUILD_BUG_ON(__CTL_STR != SYSCTL_TYPE__CTL_STR); /* We use it for fd overlap handling in clone_service_fd() */ BUG_ON(get_service_fd(SERVICE_FD_MIN + 1) < get_service_fd(SERVICE_FD_MAX - 1)); if (fault_injection_init()) { pr_err("Failed to initialize fault injection when initializing crtools.\n"); return 1; } cr_pb_init(); __setproctitle_init(argc, argv, envp); if (argc < 2) goto usage; init_opts(); ret = parse_options(argc, argv, &usage_error, &has_exec_cmd, state); if (ret == 1) return 1; if (ret == 2) goto usage; if (optind >= argc) { pr_err("command is required\n"); goto usage; } log_set_loglevel(opts.log_level); /* * There kernel might send us lethal signals in the following cases: * 1) Writing a pipe which reader has disappeared. * 2) Writing to a socket of type SOCK_STREAM which is no longer connected. * We deal with write()/Send() failures on our own, and prefer not to get killed. * So we ignore SIGPIPEs. * * Pipes are used in various places: * 1) Receiving application page data * 2) Transmitting data to the image streamer * 3) Emitting logs (potentially to a pipe). * Sockets are mainly used in transmitting memory data. */ if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { pr_perror("Failed to set a SIGPIPE signal ignore."); return 1; } if (parse_criu_mode(argv[optind])) { pr_err("unknown command: %s\n", argv[optind]); goto usage; } if (opts.mode == CR_SWRK) { if (argc != optind + 2) { fprintf(stderr, "Usage: criu swrk \n"); return 1; } /* * This is to start criu service worker from libcriu calls. * The usage is "criu swrk " and is not for CLI/scripts. * The arguments semantics can change at any time with the * corresponding lib call change. 
*/ opts.swrk_restore = true; return cr_service_work(atoi(argv[optind + 1])); } if (check_caps()) return 1; if (opts.imgs_dir == NULL) SET_CHAR_OPTS(imgs_dir, "."); if (opts.work_dir == NULL) SET_CHAR_OPTS(work_dir, opts.imgs_dir); has_sub_command = (argc - optind) > 1; if (has_exec_cmd) { if (!has_sub_command) { pr_err("--exec-cmd requires a command\n"); goto usage; } if (opts.mode != CR_RESTORE) { pr_err("--exec-cmd is available for the restore command only\n"); goto usage; } if (opts.restore_detach) { pr_err("--restore-detached and --exec-cmd cannot be used together\n"); goto usage; } opts.exec_cmd = xmalloc((argc - optind) * sizeof(char *)); if (!opts.exec_cmd) return 1; memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *)); opts.exec_cmd[argc - optind - 1] = NULL; } else { /* No subcommands except for cpuinfo and restore --exec-cmd */ if (opts.mode != CR_CPUINFO && has_sub_command) { pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? "s" : "", argv[optind]); goto usage; } else if (opts.mode == CR_CPUINFO && !has_sub_command) { pr_err("cpuinfo requires an action: dump or check\n"); goto usage; } } if (opts.stream && image_dir_mode(argv, optind) == -1) { pr_err("--stream cannot be used with the %s command\n", argv[optind]); goto usage; } /* We must not open imgs dir, if service is called */ if (opts.mode != CR_SERVICE) { ret = open_image_dir(opts.imgs_dir, image_dir_mode(argv, optind)); if (ret < 0) { pr_err("Couldn't open image dir %s\n", opts.imgs_dir); return 1; } } /* * When a process group becomes an orphan, * its processes are sent a SIGHUP signal */ if (opts.mode == CR_RESTORE && opts.restore_detach && opts.final_state == TASK_STOPPED && opts.shell_job) pr_warn("Stopped and detached shell job will get SIGHUP from OS.\n"); if (chdir(opts.work_dir)) { pr_perror("Can't change directory to %s", opts.work_dir); return 1; } util_init(); if (log_init(opts.output)) return 1; if (kerndat_init()) { pr_err("Could not 
initialize kernel features detection.\n"); return 1; } if (check_options()) return 1; if (fault_injected(FI_CANNOT_MAP_VDSO)) kdat.can_map_vdso = 0; if (!strcmp(argv[optind], "restore")) { if (inherit_fd_parse("fd[0]:fd[0]") < 0) return 1; if (inherit_fd_parse("fd[1]:fd[1]") < 0) return 1; if (inherit_fd_parse("fd[2]:fd[2]") < 0) return 1; } if (!list_empty(&opts.inherit_fds)) { if (opts.mode != CR_RESTORE) { pr_err("--inherit-fd is restore-only option\n"); return 1; } /* now that log file is set up, print inherit fd list */ inherit_fd_log(); } if (opts.img_parent) pr_info("Will do snapshot from %s\n", opts.img_parent); if (opts.mode == CR_DUMP) { if (!opts.tree_id) goto opt_pid_missing; return cr_dump_tasks(opts.tree_id); } if (opts.mode == CR_PRE_DUMP) { if (!opts.tree_id) goto opt_pid_missing; if (opts.lazy_pages) { pr_err("Cannot pre-dump with --lazy-pages\n"); return 1; } return cr_pre_dump_tasks(opts.tree_id) != 0; } if (opts.mode == CR_RESTORE) { if (opts.tree_id) pr_warn("Using -t with criu restore is obsoleted\n"); ret = cr_restore_tasks(); if (ret == 0 && opts.exec_cmd) { close_pid_proc(); execvp(opts.exec_cmd[0], opts.exec_cmd); pr_perror("Failed to exec command %s", opts.exec_cmd[0]); ret = 1; } return ret != 0; } if (opts.mode == CR_LAZY_PAGES) return cr_lazy_pages(opts.daemon_mode) != 0; if (opts.mode == CR_CHECK) return cr_check() != 0; if (opts.mode == CR_PAGE_SERVER) return cr_page_server(opts.daemon_mode, false, -1) != 0; if (opts.mode == CR_SERVICE) return cr_service(opts.daemon_mode); if (opts.mode == CR_DEDUP) return cr_dedup() != 0; if (opts.mode == CR_CPUINFO) { if (!argv[optind + 1]) { pr_err("cpuinfo requires an action: dump or check\n"); goto usage; } if (!strcmp(argv[optind + 1], "dump")) return cpuinfo_dump(); else if (!strcmp(argv[optind + 1], "check")) return cpuinfo_check(); } if (opts.mode == CR_EXEC_DEPRECATED) { pr_err("The \"exec\" action is deprecated by the Compel library.\n"); return -1; } if (opts.mode == CR_SHOW_DEPRECATED) { 
pr_err("The \"show\" action is deprecated by the CRIT utility.\n"); pr_err("To view an image use the \"crit decode -i $name --pretty\" command.\n"); return -1; } pr_err("unknown command: %s\n", argv[optind]); usage: pr_msg("\n" "Usage:\n" " criu dump|pre-dump -t PID []\n" " criu restore []\n" " criu check [--feature FEAT]\n" " criu page-server\n" " criu service []\n" " criu dedup\n" " criu lazy-pages -D DIR []\n" "\n" "Commands:\n" " dump checkpoint a process/tree identified by pid\n" " pre-dump pre-dump task(s) minimizing their frozen time\n" " restore restore a process/tree\n" " check checks whether the kernel support is up-to-date\n" " page-server launch page server\n" " service launch service\n" " dedup remove duplicates in memory dump\n" " cpuinfo dump writes cpu information into image file\n" " cpuinfo check validates cpu information read from image file\n"); if (usage_error) { pr_msg("\nTry -h|--help for more info\n"); return 1; } pr_msg("\n" "Most of the true / false long options (the ones without arguments) can be\n" "prefixed with --no- to negate the option (example: --display-stats and\n" "--no-display-stats).\n" "\n" "Dump/Restore options:\n" "\n" "* Generic:\n" " -t|--tree PID checkpoint a process tree identified by PID\n" " -d|--restore-detached detach after restore\n" " -S|--restore-sibling restore root task as sibling\n" " -s|--leave-stopped leave tasks in stopped state after checkpoint\n" " -R|--leave-running leave tasks in running state after checkpoint\n" " -D|--images-dir DIR directory for image files\n" " --pidfile FILE write root task, service or page-server pid to FILE\n" " -W|--work-dir DIR directory to cd and write logs/pidfiles/stats to\n" " (if not specified, value of --images-dir is used)\n" " --cpu-cap [CAP] CPU capabilities to write/check. CAP is comma-separated\n" " list of: cpu, fpu, all, ins, none. To disable\n" " a capability, use ^CAP. 
Empty argument implies all\n" " --exec-cmd execute the command specified after '--' on successful\n" " restore making it the parent of the restored process\n" " --freeze-cgroup use cgroup freezer to collect processes\n" " --weak-sysctls skip restoring sysctls that are not available\n" " --lazy-pages restore pages on demand\n" " this requires running a second instance of criu\n" " in lazy-pages mode: 'criu lazy-pages -D DIR'\n" " --lazy-pages and lazy-pages mode require userfaultfd\n" " --mmap-page-image mmap pages from image on restore\n" " --stream dump/restore images using criu-image-streamer\n" " --mntns-compat-mode Use mount engine in compatibility mode. By default criu\n" " tries to use mount-v2 mode with more reliable algorithm\n" " based on MOVE_MOUNT_SET_GROUP kernel feature\n" " --compress use compressed image files when possible\n" " --network-lock METHOD network locking/unlocking method; argument\n" " can be 'nftables' or 'iptables' (default).\n" " --unprivileged accept limitations when running as non-root\n" " consult documentation for further details\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" " Formats of RES on dump:\n" " tty[rdev:dev]\n" " file[mnt_id:inode]\n" " dev[major/minor]:NAME\n" " unix[ino]\n" " mnt[MOUNTPOINT]:COOKIE\n" " mnt[]{:AUTO_OPTIONS}\n" " Formats of RES on restore:\n" " dev[NAME]:DEVPATH\n" " veth[IFNAME]:OUTNAME{@BRIDGE}\n" " macvlan[IFNAME]:OUTNAME\n" " mnt[COOKIE]:ROOT\n" " netdev[IFNAME]:ORIGNAME\n" "\n" "* Special resources support:\n" " --" SK_EST_PARAM " checkpoint/restore established TCP connections\n" " --" SK_INFLIGHT_PARAM " skip (ignore) in-flight TCP connections\n" " --" SK_CLOSE_PARAM " don't dump the state of, or block, established tcp\n" " connections, and restore them in closed state.\n" " -r|--root PATH change the root filesystem (when run in mount namespace)\n" " --evasive-devices use any path to a device file if the original one\n" " is 
inaccessible\n" " --link-remap allow one to link unlinked files back when possible\n" " --ghost-limit size limit max size of deleted file contents inside image\n" " --ghost-fiemap enable dumping of deleted files using fiemap\n" " --action-script FILE add an external action script\n" " -j|--" OPT_SHELL_JOB " allow one to dump and restore shell jobs\n" " -l|--" OPT_FILE_LOCKS " handle file locks, for safety, only used for container\n" " -L|--libdir path to a plugin directory (by default " CR_PLUGIN_DEFAULT ")\n" " --timeout NUM a timeout (in seconds) on collecting tasks during dump\n" " (default 10 seconds)\n" " --force-irmap force resolving names for inotify/fsnotify watches\n" " --irmap-scan-path FILE\n" " add a path the irmap hints to scan\n" " --manage-cgroups [m] dump/restore process' cgroups; argument can be one of\n" " 'none', 'props', 'soft' (default), 'full', 'strict'\n" " or 'ignore'\n" " --cgroup-root [controller:]/newroot\n" " on dump: change the root for the controller that will\n" " be dumped. By default, only the paths with tasks in\n" " them and below will be dumped.\n" " on restore: change the root cgroup the controller will\n" " be installed into. 
No controller means that root is the\n" " default for all controllers not specified\n" " --cgroup-props STRING\n" " define cgroup controllers and properties\n" " to be checkpointed, which are described\n" " via STRING using simplified YAML format\n" " --cgroup-props-file FILE\n" " same as --cgroup-props, but taking description\n" " from the path specified\n" " --cgroup-dump-controller NAME\n" " define cgroup controller to be dumped\n" " and skip anything else present in system\n" " --cgroup-yard PATH\n" " instead of trying to mount cgroups in CRIU, provide\n" " a path to a directory with already created cgroup yard.\n" " Useful if you don't want to grant CAP_SYS_ADMIN to CRIU\n" " --lsm-profile TYPE:NAME\n" " Specify an LSM profile to be used during restore.\n" " The type can be either 'apparmor' or 'selinux'.\n" " --lsm-mount-context CTX\n" " Specify a mount context to be used during restore.\n" " Only mounts with an existing context will have their\n" " mount context replaced with CTX.\n" " --skip-mnt PATH ignore this mountpoint when dumping the mount namespace\n" " --enable-fs FSNAMES a comma separated list of filesystem names or \"all\"\n" " force criu to (try to) dump/restore these filesystem's\n" " mountpoints even if fs is not supported\n" " --inherit-fd fd[NUM]:RES\n" " Inherit file descriptors, treating fd NUM as being\n" " already opened via an existing RES, which can be:\n" " tty[rdev:dev]\n" " pipe[inode]\n" " socket[inode]\n" " file[mnt_id:inode]\n" " /memfd:name\n" " path/to/file\n" " --empty-ns net Create a namespace, but don't restore its properties\n" " (assuming it will be restored by action scripts)\n" " -J|--join-ns NS:{PID|NS_FILE}[,OPTIONS]\n" " Join existing namespace and restore process in it.\n" " Namespace can be specified as either pid or file path.\n" " OPTIONS can be used to specify parameters for userns:\n" " user:PID,UID,GID\n" " --file-validation METHOD\n" " pass the validation method to be used; argument\n" " can be 'filesize' or 
'buildid' (default).\n" " --skip-file-rwx-check\n" " Skip checking file permissions\n" " (r/w/x for u/g/o) on restore.\n" "\n" "Check options:\n" " Without options, \"criu check\" checks availability of absolutely required\n" " kernel features, critical for performing dump and restore.\n" " --extra add check for extra kernel features\n" " --experimental add check for experimental kernel features\n" " --all same as --extra --experimental\n" " --feature FEAT only check a particular feature, one of:"); pr_check_features(" ", ", ", 80); pr_msg("\n" "* Logging:\n" " -o|--log-file FILE log file name\n" " --log-pid enable per-process logging to separate FILE.pid files\n" " -v[v...]|--verbosity increase verbosity (can use multiple v)\n" " -vNUM|--verbosity=NUM set verbosity to NUM (higher level means more output):\n" " -v1 - only errors and messages\n" " -v2 - also warnings (default level)\n" " -v3 - also information messages and timestamps\n" " -v4 - lots of debug\n" " --display-stats print out dump/restore stats\n" "\n" "* Memory dumping options:\n" " --track-mem turn on memory changes tracker in kernel\n" " --prev-images-dir DIR path to images from previous dump (relative to -D)\n" " --page-server send pages to page server (see options below as well)\n" " --auto-dedup when used on dump it will deduplicate \"old\" data in\n" " pages images of previous dump\n" " when used on restore, as soon as page is restored, it\n" " will be punched from the image\n" " --pre-dump-mode splice - parasite based pre-dumping (default)\n" " read - process_vm_readv syscall based pre-dumping\n" "\n" "Page/Service server options:\n" " --address ADDR address of server or service\n" " --port PORT port of page server\n" " --ps-socket FD use specified FD as page server socket\n" " -d|--daemon run in the background after creating socket\n" " --status-fd FD write \\0 to the FD and close it once process is ready\n" " to handle requests\n" #ifdef CONFIG_GNUTLS " --tls-cacert FILE trust certificates 
signed only by this CA\n" " --tls-cacrl FILE path to CA certificate revocation list file\n" " --tls-cert FILE path to TLS certificate file\n" " --tls-key FILE path to TLS private key file\n" " --tls use TLS to secure remote connection\n" " --tls-no-cn-verify do not verify common name in server certificate\n" #endif "\n" "Configuration file options:\n" " --config FILEPATH pass a specific configuration file\n" " --no-default-config forbid usage of default configuration files\n" "\n" "Other options:\n" " -h|--help show this text\n" " -V|--version show version\n"); return 0; opt_pid_missing: pr_err("pid not specified\n"); return 1; } crac-criu-1.5.0/criu/eventfd.c000066400000000000000000000047611471504326700161470ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "common/compiler.h" #include "imgset.h" #include "eventfd.h" #include "fdinfo.h" #include "image.h" #include "util.h" #include "log.h" #include "protobuf.h" #include "images/eventfd.pb-c.h" #undef LOG_PREFIX #define LOG_PREFIX "eventfd: " struct eventfd_file_info { EventfdFileEntry *efe; struct file_desc d; }; /* Checks if file descriptor @lfd is eventfd */ int is_eventfd_link(char *link) { return is_anon_link_type(link, "[eventfd]"); } static void pr_info_eventfd(char *action, EventfdFileEntry *efe) { pr_info("%s: id %#08x flags %#04x counter %#016" PRIx64 "\n", action, efe->id, efe->flags, efe->counter); } static int dump_one_eventfd(int lfd, u32 id, const struct fd_parms *p) { EventfdFileEntry efd = EVENTFD_FILE_ENTRY__INIT; FileEntry fe = FILE_ENTRY__INIT; if (parse_fdinfo(lfd, FD_TYPES__EVENTFD, &efd)) return -1; efd.id = id; efd.flags = p->flags; efd.fown = (FownEntry *)&p->fown; fe.type = FD_TYPES__EVENTFD; fe.id = efd.id; fe.efd = &efd; pr_info_eventfd("Dumping ", &efd); return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); } const struct fdtype_ops eventfd_dump_ops = { .type = 
FD_TYPES__EVENTFD, .dump = dump_one_eventfd, }; static int eventfd_open(struct file_desc *d, int *new_fd) { struct eventfd_file_info *info; int tmp; info = container_of(d, struct eventfd_file_info, d); tmp = eventfd(info->efe->counter, 0); if (tmp < 0) { pr_perror("Can't create eventfd %#08x", info->efe->id); return -1; } if (rst_file_params(tmp, info->efe->fown, info->efe->flags)) { pr_perror("Can't restore params on eventfd %#08x", info->efe->id); goto err_close; } *new_fd = tmp; return 0; err_close: close(tmp); return -1; } static struct file_desc_ops eventfd_desc_ops = { .type = FD_TYPES__EVENTFD, .open = eventfd_open, }; static int collect_one_efd(void *obj, ProtobufCMessage *msg, struct cr_img *i) { struct eventfd_file_info *info = obj; info->efe = pb_msg(msg, EventfdFileEntry); pr_info_eventfd("Collected ", info->efe); return file_desc_add(&info->d, info->efe->id, &eventfd_desc_ops); } struct collect_image_info eventfd_cinfo = { .fd_type = CR_FD_EVENTFD_FILE, .pb_type = PB_EVENTFD_FILE, .priv_size = sizeof(struct eventfd_file_info), .collect = collect_one_efd, }; crac-criu-1.5.0/criu/eventpoll.c000066400000000000000000000257671471504326700165350ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "types.h" #include "crtools.h" #include "common/compiler.h" #include "imgset.h" #include "rst_info.h" #include "eventpoll.h" #include "fdinfo.h" #include "image.h" #include "util.h" #include "log.h" #include "pstree.h" #include "parasite.h" #include "kerndat.h" #include "file-ids.h" #include "kcmp-ids.h" #include "protobuf.h" #include "images/eventpoll.pb-c.h" #undef LOG_PREFIX #define LOG_PREFIX "epoll: " static LIST_HEAD(dinfo_list); typedef struct { uint32_t tfd; uint32_t off; uint32_t idx; } toff_t; struct eventpoll_dinfo { struct list_head list; FileEntry *fe; EventpollFileEntry *e; toff_t *toff; FownEntry fown; pid_t pid; int efd; }; struct eventpoll_file_info { 
EventpollFileEntry *efe; struct file_desc d; }; /* Checks if file descriptor @lfd is eventfd */ int is_eventpoll_link(char *link) { return is_anon_link_type(link, "[eventpoll]"); } static void pr_info_eventpoll_tfd(char *action, uint32_t id, EventpollTfdEntry *e) { pr_info("%seventpoll-tfd: id %#08x tfd %8d events %#08x data %#016" PRIx64 "\n", action, id, e->tfd, e->events, e->data); } static void pr_info_eventpoll(char *action, EventpollFileEntry *e) { pr_info("%seventpoll: id %#08x flags %#04x\n", action, e->id, e->flags); } static int queue_dinfo(FileEntry **fe, EventpollFileEntry **e, toff_t **toff, const struct fd_parms *p) { struct eventpoll_dinfo *dinfo; pr_info_eventpoll("Queueing ", *e); dinfo = xmalloc(sizeof(*dinfo)); if (!dinfo) return -ENOMEM; memcpy(&dinfo->fown, &p->fown, sizeof(dinfo->fown)); INIT_LIST_HEAD(&dinfo->list); dinfo->fe = *fe; dinfo->e = *e; dinfo->toff = *toff; dinfo->e->fown = &dinfo->fown; dinfo->pid = p->pid; dinfo->efd = p->fd; *fe = NULL; *e = NULL; *toff = NULL; list_add_tail(&dinfo->list, &dinfo_list); return 0; } static void dequeue_dinfo(struct eventpoll_dinfo *dinfo) { ssize_t i; for (i = 0; i < dinfo->e->n_tfd; i++) eventpoll_tfd_entry__free_unpacked(dinfo->e->tfd[i], NULL); xfree(dinfo->fe); xfree(dinfo->e->tfd); xfree(dinfo->e); xfree(dinfo->toff); list_del(&dinfo->list); xfree(dinfo); } int flush_eventpoll_dinfo_queue(void) { struct eventpoll_dinfo *dinfo, *t; ssize_t i; list_for_each_entry_safe(dinfo, t, &dinfo_list, list) { EventpollFileEntry *e = dinfo->e; for (i = 0; i < e->n_tfd; i++) { EventpollTfdEntry *tfde = e->tfd[i]; struct kid_elem ke = { .pid = dinfo->pid, .genid = make_gen_id(tfde->dev, tfde->inode, tfde->pos), .idx = tfde->tfd, }; kcmp_epoll_slot_t slot = { .efd = dinfo->efd, .tfd = tfde->tfd, .toff = dinfo->toff[i].off, }; struct kid_elem *t = kid_lookup_epoll_tfd(&fd_tree, &ke, &slot); if (!t) { pr_debug("kid_lookup_epoll: no match pid %d efd %d tfd %d toff %u\n", dinfo->pid, dinfo->efd, tfde->tfd, 
dinfo->toff[i].off); goto err; } pr_debug("kid_lookup_epoll: rbsearch match pid %d efd %d tfd %d toff %u -> %d\n", dinfo->pid, dinfo->efd, tfde->tfd, dinfo->toff[i].off, t->idx); /* Make sure the pid matches */ if (t->pid != dinfo->pid) { pr_debug("kid_lookup_epoll: pid mismatch %d %d efd %d tfd %d toff %u\n", dinfo->pid, t->pid, dinfo->efd, tfde->tfd, dinfo->toff[i].off); goto err; } tfde->tfd = t->idx; } pr_info_eventpoll("Dumping ", e); if (pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), dinfo->fe, PB_FILE)) goto err; for (i = 0; i < e->n_tfd; i++) pr_info_eventpoll_tfd("Dumping: ", e->id, e->tfd[i]); dequeue_dinfo(dinfo); } return 0; err: list_for_each_entry_safe(dinfo, t, &dinfo_list, list) dequeue_dinfo(dinfo); return -1; } static int tfd_cmp(const void *a, const void *b) { if (((int *)a)[0] > ((int *)b)[0]) return 1; if (((int *)a)[0] < ((int *)b)[0]) return -1; return 0; } static int toff_cmp(const void *a, const void *b) { if (((toff_t *)a)[0].tfd > ((toff_t *)b)[0].tfd) return 1; if (((toff_t *)a)[0].tfd < ((toff_t *)b)[0].tfd) return -1; if (((toff_t *)a)[0].idx > ((toff_t *)b)[0].idx) return 1; if (((toff_t *)a)[0].idx < ((toff_t *)b)[0].idx) return -1; return 0; } static int toff_cmp_idx(const void *a, const void *b) { if (((toff_t *)a)[0].idx > ((toff_t *)b)[0].idx) return 1; if (((toff_t *)a)[0].idx < ((toff_t *)b)[0].idx) return -1; return 0; } /* * fds in fd_parms are sorted so we can use binary search * for better performance. */ static int find_tfd_bsearch(pid_t pid, int efd, int fds[], size_t nr_fds, int tfd, unsigned int toff) { kcmp_epoll_slot_t slot = { .efd = efd, .tfd = tfd, .toff = toff, }; int *tfd_found; pr_debug("find_tfd_bsearch: pid %d efd %d tfd %d toff %u\n", pid, efd, tfd, toff); /* * Optimistic case: the target fd belongs to us * and wasn't dup'ed. 
*/ tfd_found = bsearch(&tfd, fds, nr_fds, sizeof(int), tfd_cmp); if (tfd_found) { if (kdat.has_kcmp_epoll_tfd) { if (syscall(SYS_kcmp, pid, pid, KCMP_EPOLL_TFD, tfd, &slot) == 0) { pr_debug("find_tfd_bsearch (kcmp-yes): bsearch match pid %d efd %d tfd %d toff %u\n", pid, efd, tfd, toff); return tfd; } } else { pr_debug("find_tfd_bsearch (kcmp-no): bsearch match pid %d efd %d tfd %d toff %u\n", pid, efd, tfd, toff); return tfd; } } pr_debug("find_tfd_bsearch: no match pid %d efd %d tfd %d toff %u\n", pid, efd, tfd, toff); return -1; } static int dump_one_eventpoll(int lfd, u32 id, const struct fd_parms *p) { toff_t *toff = NULL; EventpollFileEntry *e = NULL; FileEntry *fe = NULL; int ret = -1; ssize_t i; e = xmalloc(sizeof(*e)); if (!e) goto out; eventpoll_file_entry__init(e); fe = xmalloc(sizeof(*fe)); if (!fe) goto out; file_entry__init(fe); e->id = id; e->flags = p->flags; e->fown = (FownEntry *)&p->fown; if (parse_fdinfo(lfd, FD_TYPES__EVENTPOLL, e)) goto out; fe->type = FD_TYPES__EVENTPOLL; fe->id = e->id; fe->epfd = e; /* * In regular case there is no so many dup'ed * descriptors so instead of complex mappings * lets rather walk over members with O(n^2) */ if (p->dfds) { toff = xmalloc(sizeof(*toff) * e->n_tfd); if (!toff) goto out; for (i = 0; i < e->n_tfd; i++) { toff[i].idx = i; toff[i].tfd = e->tfd[i]->tfd; toff[i].off = 0; } qsort(toff, e->n_tfd, sizeof(*toff), toff_cmp); for (i = 1; i < e->n_tfd; i++) if (toff[i].tfd == toff[i - 1].tfd) toff[i].off = toff[i - 1].off + 1; qsort(toff, e->n_tfd, sizeof(*toff), toff_cmp_idx); } /* * Handling dup'ed or transferred target * files is tricky: we need to use kcmp * to find out where file came from. Until * it's implemented lets use simpler approach * just check the targets are belonging to the * pid's file set. 
 */
	if (p->dfds) {
		for (i = 0; i < e->n_tfd; i++) {
			/* Resolve each target fd against the task's (sorted) fd set. */
			int tfd = find_tfd_bsearch(p->pid, p->fd, p->dfds->fds, p->dfds->nr_fds, e->tfd[i]->tfd,
						   toff[i].off);
			if (tfd == -1) {
				if (kdat.has_kcmp_epoll_tfd) {
					/*
					 * Target fd not in this task's fd table; with kcmp
					 * support, defer the lookup to the global flush
					 * (flush_eventpoll_dinfo_queue) instead of failing.
					 * Ownership of fe/e/toff moves to the queue.
					 */
					ret = queue_dinfo(&fe, &e, &toff, p);
				} else {
					pr_err("Escaped/closed fd descriptor %d on pid %d\n", e->tfd[i]->tfd, p->pid);
				}
				goto out;
			}
		}
	} else
		pr_warn_once("Unix SCM files are not verified\n");

	pr_info_eventpoll("Dumping ", e);
	ret = pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), fe, PB_FILE);
	if (!ret) {
		for (i = 0; i < e->n_tfd; i++)
			pr_info_eventpoll_tfd("Dumping: ", e->id, e->tfd[i]);
	}

out:
	/* e may be NULL here when queue_dinfo() took ownership of it. */
	for (i = 0; e && i < e->n_tfd; i++)
		eventpoll_tfd_entry__free_unpacked(e->tfd[i], NULL);
	xfree(fe);
	if (e)
		xfree(e->tfd);
	xfree(e);
	xfree(toff);
	return ret;
}

const struct fdtype_ops eventpoll_dump_ops = {
	.type = FD_TYPES__EVENTPOLL,
	.dump = dump_one_eventpoll,
};

static int eventpoll_post_open(struct file_desc *d, int fd);

/*
 * First-stage open of a restored epoll fd. On the initial call it
 * creates the bare epoll instance and returns 1 (more work pending);
 * once the fle has reached FLE_OPEN it delegates to
 * eventpoll_post_open() to populate the target fds.
 */
static int eventpoll_open(struct file_desc *d, int *new_fd)
{
	struct fdinfo_list_entry *fle = file_master(d);
	struct eventpoll_file_info *info;
	int tmp;

	info = container_of(d, struct eventpoll_file_info, d);

	if (fle->stage >= FLE_OPEN)
		return eventpoll_post_open(d, fle->fe->fd);

	pr_info_eventpoll("Restore ", info->efe);

	tmp = epoll_create(1);
	if (tmp < 0) {
		pr_perror("Can't create epoll %#08x", info->efe->id);
		return -1;
	}

	if (rst_file_params(tmp, info->efe->fown, info->efe->flags)) {
		pr_perror("Can't restore file params on epoll %#08x", info->efe->id);
		goto err_close;
	}

	*new_fd = tmp;
	/* Return 1: the tfds still have to be re-added in post_open. */
	return 1;

err_close:
	close(tmp);
	return -1;
}

/*
 * Check whether the target fd referenced by @tdefe is not yet far
 * enough through restore to be added to an epoll set. Returns non-zero
 * if we have to wait; nested epolls only need to reach FLE_OPEN, other
 * fd types must be fully FLE_RESTORED.
 */
static int epoll_not_ready_tfd(EventpollTfdEntry *tdefe)
{
	struct fdinfo_list_entry *fle;

	list_for_each_entry(fle, &rsti(current)->fds, ps_list) {
		if (tdefe->tfd != fle->fe->fd)
			continue;

		if (fle->desc->ops->type == FD_TYPES__EVENTPOLL)
			return (fle->stage < FLE_OPEN);
		else
			return (fle->stage != FLE_RESTORED);
	}

	/*
	 * If tgt fle is not on the fds list, it's already
	 * restored (see open_fdinfos), so we're ready.
	 */
	return 0;
}

/* Re-add one saved target fd (events + cookie) to epoll fd @fd. */
static int eventpoll_retore_tfd(int fd, int id, EventpollTfdEntry *tdefe)
{
	struct epoll_event event;

	pr_info_eventpoll_tfd("Restore ", id, tdefe);

	event.events = tdefe->events;
	event.data.u64 = tdefe->data;

	if (epoll_ctl(fd, EPOLL_CTL_ADD, tdefe->tfd, &event)) {
		pr_perror("Can't add event on %#08x", id);
		return -1;
	}

	return 0;
}

/*
 * Second-stage open: once every referenced target fd is ready,
 * re-register them all with the epoll instance. Returns 1 to be
 * called again later if some target is still not ready.
 */
static int eventpoll_post_open(struct file_desc *d, int fd)
{
	struct eventpoll_file_info *info;
	int i;

	info = container_of(d, struct eventpoll_file_info, d);

	for (i = 0; i < info->efe->n_tfd; i++) {
		if (epoll_not_ready_tfd(info->efe->tfd[i]))
			return 1;
	}
	for (i = 0; i < info->efe->n_tfd; i++) {
		if (eventpoll_retore_tfd(fd, info->efe->id, info->efe->tfd[i]))
			return -1;
	}

	return 0;
}

static struct file_desc_ops desc_ops = {
	.type = FD_TYPES__EVENTPOLL,
	.open = eventpoll_open,
};

/*
 * Collect one entry from the deprecated standalone TFD image and
 * attach it to its owning epoll file entry (grows the tfd array).
 */
static int collect_one_epoll_tfd(void *o, ProtobufCMessage *msg, struct cr_img *i)
{
	EventpollTfdEntry *tfde;
	struct file_desc *d;
	struct eventpoll_file_info *ef;
	EventpollFileEntry *efe;
	int n_tfd;

	if (!deprecated_ok("Epoll TFD image"))
		return -1;

	tfde = pb_msg(msg, EventpollTfdEntry);
	d = find_file_desc_raw(FD_TYPES__EVENTPOLL, tfde->id);
	if (!d) {
		pr_err("No epoll FD for %u\n", tfde->id);
		return -1;
	}

	ef = container_of(d, struct eventpoll_file_info, d);
	efe = ef->efe;

	n_tfd = efe->n_tfd + 1;
	if (xrealloc_safe(&efe->tfd, n_tfd * sizeof(EventpollTfdEntry *)))
		return -1;

	efe->tfd[efe->n_tfd] = tfde;
	efe->n_tfd = n_tfd;

	return 0;
}

struct collect_image_info epoll_tfd_cinfo = {
	.fd_type = CR_FD_EVENTPOLL_TFD,
	.pb_type = PB_EVENTPOLL_TFD,
	.collect = collect_one_epoll_tfd,
	/* entries stay referenced from efe->tfd[], so don't free them */
	.flags = COLLECT_NOFREE,
};

/* Register one epoll file entry from the image with the fd engine. */
static int collect_one_epoll(void *o, ProtobufCMessage *msg, struct cr_img *i)
{
	struct eventpoll_file_info *info = o;

	info->efe = pb_msg(msg, EventpollFileEntry);
	pr_info_eventpoll("Collected ", info->efe);
	return file_desc_add(&info->d, info->efe->id, &desc_ops);
}

struct collect_image_info epoll_cinfo = {
	.fd_type = CR_FD_EVENTPOLL_FILE,
	.pb_type =
PB_EVENTPOLL_FILE, .priv_size = sizeof(struct eventpoll_file_info), .collect = collect_one_epoll, }; crac-criu-1.5.0/criu/external.c000066400000000000000000000032221471504326700163250ustar00rootroot00000000000000#include "common/err.h" #include "common/list.h" #include "cr_options.h" #include "xmalloc.h" #include "mount.h" #include "external.h" #include "util.h" #include "net.h" int add_external(char *key) { struct external *ext; if (strstartswith(key, "mnt[]")) return ext_mount_parse_auto(key + 5); ext = xmalloc(sizeof(*ext)); if (!ext) return -1; ext->id = xstrdup(key); if (!ext->id) goto err_id; if (strstartswith(key, "macvlan") && macvlan_ext_add(ext) < 0) goto err; list_add(&ext->node, &opts.external); return 0; err: xfree(ext->id); err_id: xfree(ext); return -1; } bool external_lookup_id(char *id) { struct external *ext; list_for_each_entry(ext, &opts.external, node) if (!strcmp(ext->id, id)) return true; return false; } void *external_lookup_data(char *key) { struct external *ext; int len = strlen(key); list_for_each_entry(ext, &opts.external, node) { if (strncmp(ext->id, key, len)) continue; return ext->data; } return ERR_PTR(-ENOENT); } char *external_lookup_by_key(char *key) { struct external *ext; int len = strlen(key); list_for_each_entry(ext, &opts.external, node) { if (strncmp(ext->id, key, len)) continue; if (ext->id[len] == ':') return ext->id + len + 1; else if (ext->id[len] == '\0') return NULL; } return ERR_PTR(-ENOENT); } int external_for_each_type(char *type, int (*cb)(struct external *, void *), void *arg) { struct external *ext; int ln = strlen(type); int ret = 0; list_for_each_entry(ext, &opts.external, node) { if (strncmp(ext->id, type, ln)) continue; if (ext->id[ln] != '[') continue; ret = cb(ext, arg); if (ret) break; } return ret; } crac-criu-1.5.0/criu/fault-injection.c000066400000000000000000000005601471504326700176000ustar00rootroot00000000000000#include #include "criu-log.h" #include "fault-injection.h" enum faults fi_strategy; int 
fault_injection_init(void) { char *val; int start; val = getenv("CRIU_FAULT"); if (val == NULL) return 0; start = atoi(val); if (start <= 0 || start >= FI_MAX) { pr_err("CRIU_FAULT out of bounds.\n"); return -1; } fi_strategy = start; return 0; } crac-criu-1.5.0/criu/fdstore.c000066400000000000000000000055021471504326700161540ustar00rootroot00000000000000#include #include #include #include #include #include #include "common/scm.h" #include "common/lock.h" #include "servicefd.h" #include "fdstore.h" #include "xmalloc.h" #include "rst-malloc.h" #include "log.h" #include "util.h" #include "cr_options.h" #include "util-caps.h" #include "sockets.h" /* clang-format off */ static struct fdstore_desc { int next_id; mutex_t lock; /* to protect a peek offset */ } *desc; /* clang-format on */ int fdstore_init(void) { /* In kernel a bufsize has type int and a value is doubled. */ uint32_t buf[2] = { INT_MAX / 2, INT_MAX / 2 }; struct sockaddr_un addr; unsigned int addrlen; struct stat st; int sk, ret; desc = shmalloc(sizeof(*desc)); if (!desc) return -1; desc->next_id = 0; mutex_init(&desc->lock); sk = socket(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0); if (sk < 0) { pr_perror("Unable to create a socket"); return -1; } if (fstat(sk, &st)) { pr_perror("Unable to stat a file descriptor"); close(sk); return -1; } if (sk_setbufs(sk, buf)) { close(sk); return -1; } addr.sun_family = AF_UNIX; addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-fdstore-%" PRIx64 "-%" PRIx64, st.st_ino, criu_run_id); addrlen += sizeof(addr.sun_family); addr.sun_path[0] = 0; /* * This socket is connected to itself, so all messages are queued to * its receive queue. Here we are going to use this socket to store * file descriptors. For that we need to send a file descriptor in * a queue and remember its sequence number. Then we can set SO_PEEK_OFF * to get a file descriptor without dequeuing it. 
*/ if (bind(sk, (struct sockaddr *)&addr, addrlen)) { pr_perror("Unable to bind a socket"); close(sk); return -1; } if (connect(sk, (struct sockaddr *)&addr, addrlen)) { pr_perror("Unable to connect a socket"); close(sk); return -1; } ret = install_service_fd(FDSTORE_SK_OFF, sk); if (ret < 0) return -1; return 0; } int fdstore_add(int fd) { int sk = get_service_fd(FDSTORE_SK_OFF); int id, ret; mutex_lock(&desc->lock); ret = send_fd(sk, NULL, 0, fd); if (ret) { pr_perror("Can't send fd %d into store", fd); mutex_unlock(&desc->lock); return -1; } id = desc->next_id++; mutex_unlock(&desc->lock); return id; } int fdstore_get(int id) { int sk, fd; sk = get_service_fd(FDSTORE_SK_OFF); if (sk < 0) { pr_err("Cannot get FDSTORE_SK_OFF fd\n"); return -1; } mutex_lock(&desc->lock); if (setsockopt(sk, SOL_SOCKET, SO_PEEK_OFF, &id, sizeof(id))) { mutex_unlock(&desc->lock); pr_perror("Unable to a peek offset"); return -1; } if (__recv_fds(sk, &fd, 1, NULL, 0, MSG_PEEK) < 0) { mutex_unlock(&desc->lock); pr_perror("Unable to get a file descriptor with the %d id", id); return -1; } mutex_unlock(&desc->lock); return fd; } crac-criu-1.5.0/criu/fifo.c000066400000000000000000000103741471504326700154340ustar00rootroot00000000000000#include #include #include #include #include #include "imgset.h" #include "image.h" #include "files.h" #include "files-reg.h" #include "file-ids.h" #include "pipes.h" #include "fifo.h" #include "protobuf.h" #include "images/regfile.pb-c.h" #include "images/fifo.pb-c.h" /* * FIFO checkpoint and restore is done in a bit unusual manner. * We use files-reg.c engine to save fifo path and flags, * thus regular files image will contain fifo descriptors which * are useless for reg-files engine itself but needed for our fifo * engine. * * In particular we dump fifo-entry automatically and appropriate * reg-file entry manually, thus on restore we need to ask reg-file * engine to restore fifo path and flags via direct call. 
*/ struct fifo_info { struct list_head list; struct file_desc d; FifoEntry *fe; bool restore_data; }; static LIST_HEAD(fifo_head); static struct pipe_data_dump pd_fifo = { .img_type = CR_FD_FIFO_DATA, }; static int dump_one_fifo(int lfd, u32 id, const struct fd_parms *p) { struct cr_img *img = img_from_set(glob_imgset, CR_FD_FILES); FileEntry fe = FILE_ENTRY__INIT; FifoEntry e = FIFO_ENTRY__INIT; u32 rf_id; fd_id_generate_special(NULL, &rf_id); /* * It's a trick here, we use regular files dumping * code to save path to a fifo, then we reuse it * on restore. */ if (dump_one_reg_file(lfd, rf_id, p)) return -1; pr_info("Dumping fifo %d with id %#x pipe_id %#x\n", lfd, id, pipe_id(p)); e.id = id; e.pipe_id = pipe_id(p); e.has_regf_id = true; e.regf_id = rf_id; fe.type = FD_TYPES__FIFO; fe.id = e.id; fe.fifo = &e; if (pb_write_one(img, &fe, PB_FILE)) return -1; return dump_one_pipe_data(&pd_fifo, lfd, p); } const struct fdtype_ops fifo_dump_ops = { .type = FD_TYPES__FIFO, .dump = dump_one_fifo, }; static struct pipe_data_rst *pd_hash_fifo[PIPE_DATA_HASH_SIZE]; static int do_open_fifo(int ns_root_fd, struct reg_file_info *rfi, void *arg) { struct fifo_info *info = arg; int new_fifo, fake_fifo = -1; /* * The fifos (except read-write fifos) do wait until * another pipe-end get connected, so to be able to * proceed the restoration procedure we open a fake * fifo here. 
*/ fake_fifo = openat(ns_root_fd, rfi->path, O_RDWR); if (fake_fifo < 0) { pr_perror("Can't open fake fifo %#x [%s]", info->fe->id, rfi->path); return -1; } new_fifo = openat(ns_root_fd, rfi->path, rfi->rfe->flags); if (new_fifo < 0) { pr_perror("Can't open fifo %#x [%s]", info->fe->id, rfi->path); goto out; } if (info->restore_data) if (restore_pipe_data(CR_FD_FIFO_DATA, fake_fifo, info->fe->pipe_id, pd_hash_fifo)) { close(new_fifo); new_fifo = -1; } out: close(fake_fifo); return new_fifo; } static int open_fifo_fd(struct file_desc *d, int *new_fd) { struct fifo_info *info = container_of(d, struct fifo_info, d); struct file_desc *reg_d; int fd; reg_d = collect_special_file(info->fe->has_regf_id ? info->fe->regf_id : info->fe->id); if (!reg_d) return -1; fd = open_path(reg_d, do_open_fifo, info); if (fd < 0) return -1; *new_fd = fd; return 0; } static struct file_desc_ops fifo_desc_ops = { .type = FD_TYPES__FIFO, .open = open_fifo_fd, }; static int collect_one_fifo(void *o, ProtobufCMessage *base, struct cr_img *i) { struct fifo_info *info = o, *f; info->fe = pb_msg(base, FifoEntry); pr_info("Collected fifo entry ID %#x PIPE ID %#x\n", info->fe->id, info->fe->pipe_id); /* check who will restore the fifo data */ list_for_each_entry(f, &fifo_head, list) if (f->fe->pipe_id == info->fe->pipe_id) break; if (&f->list == &fifo_head) { list_add(&info->list, &fifo_head); info->restore_data = true; } else { INIT_LIST_HEAD(&info->list); info->restore_data = false; } return file_desc_add(&info->d, info->fe->id, &fifo_desc_ops); } struct collect_image_info fifo_cinfo = { .fd_type = CR_FD_FIFO, .pb_type = PB_FIFO, .priv_size = sizeof(struct fifo_info), .collect = collect_one_fifo, }; static int collect_fifo_data(void *obj, ProtobufCMessage *msg, struct cr_img *img) { return do_collect_pipe_data(obj, msg, img, pd_hash_fifo); } struct collect_image_info fifo_data_cinfo = { .fd_type = CR_FD_FIFO_DATA, .pb_type = PB_PIPE_DATA, .priv_size = sizeof(struct pipe_data_rst), .collect = 
collect_fifo_data, }; crac-criu-1.5.0/criu/file-ids.c000066400000000000000000000041211471504326700161760ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "int.h" #include "file-ids.h" #include "rbtree.h" #include "kcmp-ids.h" #include "common/compiler.h" #include "image.h" #include "util.h" #include "irmap.h" #include "files.h" DECLARE_KCMP_TREE(fd_tree, KCMP_FILE); #define FDID_BITS 5 #define FDID_SIZE (1 << FDID_BITS) #define FDID_MASK (FDID_SIZE - 1) static inline int fdid_hashfn(unsigned int s_dev, unsigned long i_ino) { return (s_dev + i_ino) & FDID_MASK; } struct fd_id { int mnt_id; unsigned int dev; unsigned long ino; u32 id; struct fd_id *n; }; static struct fd_id *fd_id_cache[FDID_SIZE]; static void fd_id_cache_one(u32 id, struct fd_parms *p) { struct fd_id *fi; unsigned hv; fi = xmalloc(sizeof(*fi)); if (fi) { fi->dev = p->stat.st_dev; fi->ino = p->stat.st_ino; fi->mnt_id = p->mnt_id; fi->id = id; hv = fdid_hashfn(p->stat.st_dev, p->stat.st_ino); fi->n = fd_id_cache[hv]; fd_id_cache[hv] = fi; } } static struct fd_id *fd_id_cache_lookup(struct fd_parms *p) { struct stat *st = &p->stat; struct fd_id *fi; for (fi = fd_id_cache[fdid_hashfn(st->st_dev, st->st_ino)]; fi; fi = fi->n) if (fi->dev == st->st_dev && fi->ino == st->st_ino && fi->mnt_id == p->mnt_id) return fi; return NULL; } int fd_id_generate_special(struct fd_parms *p, u32 *id) { if (p) { struct fd_id *fi; fi = fd_id_cache_lookup(p); if (fi) { if (p->stat.st_mode & (S_IFCHR | S_IFBLK)) { /* Don't cache the id for mapped devices */ *id = fd_tree.subid++; return 1; } else { *id = fi->id; return 0; } } } *id = fd_tree.subid++; if (p) fd_id_cache_one(*id, p); return 1; } int fd_id_generate(pid_t pid, FdinfoEntry *fe, struct fd_parms *p) { u32 id; struct kid_elem e; int new_id = 0; e.pid = pid; e.genid = fe->id; e.idx = fe->fd; id = kid_generate_gen(&fd_tree, &e, &new_id); if (!id) return -ENOMEM; if (new_id) 
fd_id_cache_one(id, p); fe->id = id; return new_id; } crac-criu-1.5.0/criu/file-lock.c000066400000000000000000000373661471504326700163700ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "cr_options.h" #include "imgset.h" #include "files.h" #include "fs-magic.h" #include "kerndat.h" #include "image.h" #include "util.h" #include "mount.h" #include "proc_parse.h" #include "servicefd.h" #include "file-lock.h" #include "pstree.h" #include "files-reg.h" struct file_lock_rst { FileLockEntry *fle; struct list_head l; }; struct list_head file_lock_list = LIST_HEAD_INIT(file_lock_list); static int collect_one_file_lock(void *o, ProtobufCMessage *m, struct cr_img *i) { struct file_lock_rst *lr = o; lr->fle = pb_msg(m, FileLockEntry); list_add_tail(&lr->l, &file_lock_list); return 0; } struct collect_image_info file_locks_cinfo = { .fd_type = CR_FD_FILE_LOCKS, .pb_type = PB_FILE_LOCK, .priv_size = sizeof(struct file_lock_rst), .collect = collect_one_file_lock, }; struct file_lock *alloc_file_lock(void) { struct file_lock *flock; flock = xzalloc(sizeof(*flock)); if (!flock) return NULL; INIT_LIST_HEAD(&flock->list); flock->real_owner = -1; flock->owners_fd = -1; flock->fl_holder = -1; return flock; } void free_file_locks(void) { struct file_lock *flock, *tmp; list_for_each_entry_safe(flock, tmp, &file_lock_list, list) { xfree(flock); } INIT_LIST_HEAD(&file_lock_list); } static int dump_one_file_lock(FileLockEntry *fle) { pr_info("LOCK flag: %d,type: %d,pid: %d,fd: %d,start: %8" PRIx64 ",len: %8" PRIx64 "\n", fle->flag, fle->type, fle->pid, fle->fd, fle->start, fle->len); return pb_write_one(img_from_set(glob_imgset, CR_FD_FILE_LOCKS), fle, PB_FILE_LOCK); } static void fill_flock_entry(FileLockEntry *fle, int fl_kind, int fl_ltype) { fle->flag |= fl_kind; fle->type = fl_ltype; } int dump_file_locks(void) { FileLockEntry fle; struct file_lock *fl; int ret = 0; pr_info("Dumping file-locks\n"); 
list_for_each_entry(fl, &file_lock_list, list) { if (fl->real_owner == -1) { if (fl->fl_kind == FL_POSIX) { pr_err("Unresolved lock found pid %d ino %ld\n", fl->fl_owner, fl->i_no); return -1; } continue; } if (!opts.handle_file_locks) { pr_err("Some file locks are hold by dumping tasks! " "You can try --" OPT_FILE_LOCKS " to dump them.\n"); return -1; } file_lock_entry__init(&fle); fle.pid = fl->real_owner; fle.fd = fl->owners_fd; fill_flock_entry(&fle, fl->fl_kind, fl->fl_ltype); fle.start = fl->start; if (!strncmp(fl->end, "EOF", 3)) fle.len = 0; else fle.len = (atoll(fl->end) + 1) - fl->start; ret = dump_one_file_lock(&fle); if (ret) { pr_err("Dump file lock failed!\n"); goto err; } } err: return ret; } static int lock_btrfs_file_match(pid_t pid, int fd, struct file_lock *fl, struct fd_parms *p) { int phys_dev = MKKDEV(fl->maj, fl->min); char link[PATH_MAX], t[32]; struct ns_id *ns; int ret; snprintf(t, sizeof(t), "/proc/%d/fd/%d", pid, fd); ret = readlink(t, link, sizeof(link)) - 1; if (ret < 0) { pr_perror("Can't read link of fd %d", fd); return -1; } else if ((size_t)ret == sizeof(link)) { pr_err("Buffer for read link of fd %d is too small\n", fd); return -1; } link[ret] = 0; ns = lookup_nsid_by_mnt_id(p->mnt_id); return phys_stat_dev_match(p->stat.st_dev, phys_dev, ns, link); } static inline int lock_file_match(pid_t pid, int fd, struct file_lock *fl, struct fd_parms *p) { dev_t dev = p->stat.st_dev; if (fl->i_no != p->stat.st_ino) return 0; /* * Get the right devices for BTRFS. Look at phys_stat_resolve_dev() * for more details. 
*/ if (p->fs_type == BTRFS_SUPER_MAGIC) { if (p->mnt_id != -1) { struct mount_info *m; m = lookup_mnt_id(p->mnt_id); BUG_ON(m == NULL); dev = kdev_to_odev(m->s_dev); } else /* old kernel */ return lock_btrfs_file_match(pid, fd, fl, p); } return makedev(fl->maj, fl->min) == dev; } static int lock_check_fd(int lfd, struct file_lock *fl) { int ret; if (fl->fl_ltype & LOCK_MAND) ret = flock(lfd, LOCK_MAND | LOCK_RW); else ret = flock(lfd, LOCK_EX | LOCK_NB); pr_debug(" `- %d/%d\n", ret, errno); if (ret != 0) { if (errno != EAGAIN) { pr_err("Bogus lock test result %d\n", ret); return -1; } return 0; } else { /* * The ret == 0 means, that new lock doesn't conflict * with any others on the file. But since we do know, * that there should be some other one (file is found * in /proc/locks), it means that the lock is already * on file pointed by fd. */ pr_debug(" `- downgrading lock back\n"); if (fl->fl_ltype & LOCK_MAND) ret = flock(lfd, fl->fl_ltype); else if (fl->fl_ltype == F_RDLCK) ret = flock(lfd, LOCK_SH); if (ret) { pr_err("Can't downgrade lock back %d\n", ret); return -1; } } return 1; } static int lock_ofd_check_fd(int lfd, struct file_lock *fl) { int ret; struct flock lck = { .l_whence = SEEK_SET, .l_type = F_WRLCK, .l_start = fl->start }; if (strcmp(fl->end, "EOF")) { unsigned long end; ret = sscanf(fl->end, "%lu", &end); if (ret <= 0) { pr_err("Invalid lock entry\n"); return -1; } lck.l_len = end - fl->start + 1; } else { lck.l_len = 0; } ret = fcntl(lfd, F_OFD_SETLK, &lck); pr_debug(" `- %d/%d\n", ret, errno); if (ret != 0) { if (errno != EAGAIN) { pr_err("Bogus lock test result %d\n", ret); return -1; } return 0; } else { /* * The ret == 0 means, that new lock doesn't conflict * with any others on the file. But since we do know, * that there should be some other one (file is found * in /proc/locks), it means that the lock is already * on file pointed by fd. 
*/ pr_debug(" `- downgrading lock back\n"); if (fl->fl_ltype & LOCK_WRITE) lck.l_type = F_WRLCK; else lck.l_type = F_RDLCK; ret = fcntl(lfd, F_OFD_SETLK, &lck); if (ret) { pr_err("Can't downgrade lock back %d\n", ret); return -1; } } return 1; } static int lease_check_fd(int fd, int file_flags, struct file_lock *fl) { int file_lease_type, err; int lease_type = fl->fl_ltype & (~LEASE_BREAKING); if ((file_flags & O_ACCMODE) != O_RDONLY) { /* * Write OFD conflicts with any lease not associated * with it, therefore there is can't be other lease * or OFD for this file. */ return 1; } file_lease_type = fcntl(fd, F_GETLEASE); if (file_lease_type < 0) { pr_err("Can't get lease type\n"); return -1; } /* * Only read OFDs can be present for the file. If * read and write OFDs with at least one lease had * presented, it would have conflicted. */ if (fl->fl_ltype & LEASE_BREAKING) { /* * Only read leases are possible for read OFDs * and they all should be in breaking state, * because the current one is. */ int compatible_type = file_lease_type; if (compatible_type != F_UNLCK) { pr_err("Lease doesn't conflicts but breaks\n"); return -1; } /* * Due to activated breaking sequence we can't * get actual lease type with F_GETLEASE. * The err == 0 after lease upgrade means, that * there is already read lease on OFD. Otherwise * it would fail, because current read lease is * still set and breaking. */ err = fcntl(fd, F_SETLEASE, F_RDLCK); if (err < 0) { if (errno != EAGAIN) { pr_perror("Can't set lease (fd %i)", fd); return -1; } return 0; } return 1; } else { /* * The file can have only non-breaking read * leases, because otherwise the current one * also would have broke. 
*/ if (lease_type != F_RDLCK) { pr_err("Incorrect lease type\n"); return -1; } if (file_lease_type == F_UNLCK) return 0; if (file_lease_type == F_RDLCK) return 1; pr_err("Invalid file lease type\n"); return -1; } } int note_file_lock(struct pid *pid, int fd, int lfd, struct fd_parms *p) { struct file_lock *fl; int ret; if (kdat.has_fdinfo_lock) return 0; list_for_each_entry(fl, &file_lock_list, list) { ret = lock_file_match(pid->real, fd, fl, p); if (ret < 0) return -1; if (ret == 0) continue; if (!opts.handle_file_locks) { pr_err("Some file locks are hold by dumping tasks!" "You can try --" OPT_FILE_LOCKS " to dump them.\n"); return -1; } if (fl->fl_kind == FL_POSIX) { /* * POSIX locks cannot belong to anyone * but creator. */ if (fl->fl_owner != pid->real) continue; } else if (fl->fl_kind == FL_LEASE) { if (fl->owners_fd >= 0) continue; if (fl->fl_owner != pid->real && fl->real_owner != -1) continue; ret = lease_check_fd(lfd, p->flags, fl); if (ret < 0) return ret; if (ret == 0) continue; } else /* fl->fl_kind == FL_FLOCK || fl->fl_kind == FL_OFD */ { int ret; /* * OFD locks & FLOCKs can be inherited across fork, * thus we can have any task as lock * owner. But the creator is preferred * anyway. 
*/ if (fl->fl_owner != pid->real && fl->real_owner != -1) continue; pr_debug("Checking lock holder %d:%d\n", pid->real, fd); if (fl->fl_kind == FL_FLOCK) ret = lock_check_fd(lfd, fl); else ret = lock_ofd_check_fd(lfd, fl); if (ret < 0) return ret; if (ret == 0) continue; } fl->fl_holder = pid->real; fl->real_owner = pid->ns[0].virt; fl->owners_fd = fd; pr_info("Found lock entry %d.%d %d vs %d\n", pid->real, pid->ns[0].virt, fd, fl->fl_owner); } return 0; } void discard_dup_locks_tail(pid_t pid, int fd) { struct file_lock *fl, *p; list_for_each_entry_safe_reverse(fl, p, &file_lock_list, list) { if (fl->owners_fd != fd || pid != fl->fl_holder) break; list_del(&fl->list); xfree(fl); } } int correct_file_leases_type(struct pid *pid, int fd, int lfd) { struct file_lock *fl; int target_type; list_for_each_entry(fl, &file_lock_list, list) { /* owners_fd should be set before usage */ if (fl->fl_holder != pid->real || fl->owners_fd != fd) continue; if (fl->fl_kind == FL_LEASE && (fl->fl_ltype & LEASE_BREAKING)) { /* * Set lease type to actual 'target lease type' * instead of 'READ' returned by procfs. */ target_type = fcntl(lfd, F_GETLEASE); if (target_type < 0) { perror("Can't get lease type\n"); return -1; } fl->fl_ltype &= ~O_ACCMODE; fl->fl_ltype |= target_type; break; } } return 0; } static int open_break_cb(int ns_root_fd, struct reg_file_info *rfi, void *arg) { int fd, flags = *(int *)arg | O_NONBLOCK; fd = openat(ns_root_fd, rfi->path, flags); if (fd >= 0) { pr_err("Conflicting lease wasn't found\n"); close(fd); return -1; } else if (errno != EWOULDBLOCK) { pr_perror("Can't break lease"); return -1; } return 0; } static int break_lease(int lease_type, struct file_desc *desc) { int target_type = lease_type & (~LEASE_BREAKING); int break_flags; /* * Flags for open call chosen in a way to even * 'target lease type' returned by fcntl(F_GETLEASE) * and lease type from the image. 
*/ if (target_type == F_UNLCK) { break_flags = O_WRONLY; } else if (target_type == F_RDLCK) { break_flags = O_RDONLY; } else { pr_err("Incorrect target lease type\n"); return -1; } return open_path(desc, open_break_cb, (void *)&break_flags); } static int set_file_lease(int fd, int type) { int old_fsuid, ret; struct stat st; if (fstat(fd, &st)) { pr_perror("Can't get file stat (%i)", fd); return -1; } /* * An unprivileged process may take out a lease only if * uid of the file matches the fsuid of the process. */ old_fsuid = setfsuid(st.st_uid); ret = fcntl(fd, F_SETLEASE, type); if (ret < 0) pr_perror("Can't set lease"); setfsuid(old_fsuid); return ret; } static int restore_lease_prebreaking_state(int fd, int fd_type) { int access_flags = fd_type & O_ACCMODE; int lease_type = (access_flags == O_RDONLY) ? F_RDLCK : F_WRLCK; return set_file_lease(fd, lease_type); } static struct fdinfo_list_entry *find_fd_unordered(struct pstree_item *task, int fd) { struct list_head *head = &rsti(task)->fds; struct fdinfo_list_entry *fle; list_for_each_entry_reverse(fle, head, ps_list) { if (fle->fe->fd == fd) return fle; } return NULL; } static int restore_breaking_file_lease(FileLockEntry *fle) { struct fdinfo_list_entry *fdle; int ret; fdle = find_fd_unordered(current, fle->fd); if (fdle == NULL) { pr_err("Can't get file description\n"); return -1; } ret = restore_lease_prebreaking_state(fle->fd, fdle->desc->ops->type); if (ret) return ret; /* * It could be broken by 2 types of open call: * 1. non-blocking: It failed because of the lease. * 2. blocking: It had been blocked at the moment * of dumping, otherwise lease wouldn't be broken. * Thus, it was canceled by CRIU. * * There are no files or leases in image, which will * conflict with each other. Therefore we should explicitly * break leases. Restoring can be done in any order. 
*/ return break_lease(fle->type, fdle->desc); } static int restore_file_lease(FileLockEntry *fle) { sigset_t blockmask, oldmask; int signum_fcntl, signum, ret; if (fle->type & LEASE_BREAKING) { signum_fcntl = fcntl(fle->fd, F_GETSIG); signum = signum_fcntl ? signum_fcntl : SIGIO; if (signum_fcntl < 0) { pr_perror("Can't get file i/o signum"); return -1; } if (sigemptyset(&blockmask) || sigaddset(&blockmask, signum) || sigprocmask(SIG_BLOCK, &blockmask, &oldmask)) { pr_perror("Can't block file i/o signal"); return -1; } ret = restore_breaking_file_lease(fle); if (sigprocmask(SIG_SETMASK, &oldmask, NULL)) { pr_perror("Can't restore sigmask"); ret = -1; } return ret; } else { ret = set_file_lease(fle->fd, fle->type); if (ret < 0) pr_perror("Can't restore non breaking lease"); return ret; } } static int restore_file_lock(FileLockEntry *fle) { int ret = -1; unsigned int cmd; if (fle->flag & FL_FLOCK) { if (fle->type & LOCK_MAND) { cmd = fle->type; } else if (fle->type == F_RDLCK) { cmd = LOCK_SH; } else if (fle->type == F_WRLCK) { cmd = LOCK_EX; } else if (fle->type == F_UNLCK) { cmd = LOCK_UN; } else { pr_err("Unknown flock type!\n"); goto err; } pr_info("(flock)flag: %d, type: %d, cmd: %d, pid: %d, fd: %d\n", fle->flag, fle->type, cmd, fle->pid, fle->fd); ret = flock(fle->fd, cmd); if (ret < 0) { pr_err("Can not set flock!\n"); goto err; } } else if (fle->flag & FL_POSIX) { struct flock flk; memset(&flk, 0, sizeof(flk)); flk.l_whence = SEEK_SET; flk.l_start = fle->start; flk.l_len = fle->len; flk.l_pid = fle->pid; flk.l_type = fle->type; pr_info("(posix)flag: %d, type: %d, pid: %d, fd: %d, " "start: %8" PRIx64 ", len: %8" PRIx64 "\n", fle->flag, fle->type, fle->pid, fle->fd, fle->start, fle->len); ret = fcntl(fle->fd, F_SETLKW, &flk); if (ret < 0) { pr_err("Can not set posix lock!\n"); goto err; } } else if (fle->flag & FL_OFD) { struct flock flk = { .l_whence = SEEK_SET, .l_start = fle->start, .l_len = fle->len, .l_pid = 0, .l_type = fle->type }; pr_info("(ofd)flag: 
%d, type: %d, pid: %d, fd: %d, " "start: %8" PRIx64 ", len: %8" PRIx64 "\n", fle->flag, fle->type, fle->pid, fle->fd, fle->start, fle->len); ret = fcntl(fle->fd, F_OFD_SETLK, &flk); if (ret < 0) { pr_err("Can not set ofd lock!\n"); goto err; } } else if (fle->flag & FL_LEASE) { pr_info("(lease)flag: %d, type: %d, pid: %d, fd: %d, " "start: %8" PRIx64 ", len: %8" PRIx64 "\n", fle->flag, fle->type, fle->pid, fle->fd, fle->start, fle->len); ret = restore_file_lease(fle); if (ret < 0) goto err; } else { pr_err("Unknown file lock style!\n"); goto err; } return 0; err: return ret; } static int restore_file_locks(int pid) { int ret = 0; struct file_lock_rst *lr; list_for_each_entry(lr, &file_lock_list, l) { if (lr->fle->pid == pid) { ret = restore_file_lock(lr->fle); if (ret) break; } } return ret; } int prepare_file_locks(int pid) { if (!opts.handle_file_locks) return 0; return restore_file_locks(pid); } crac-criu-1.5.0/criu/files-ext.c000066400000000000000000000040041471504326700164020ustar00rootroot00000000000000/* An external file is a file, which is dumped with help a plugin */ #include #include "imgset.h" #include "files.h" #include "plugin.h" #include "protobuf.h" #include "images/ext-file.pb-c.h" static int dump_one_ext_file(int lfd, u32 id, const struct fd_parms *p) { int ret; struct cr_img *rimg; FileEntry fe = FILE_ENTRY__INIT; ExtFileEntry xfe = EXT_FILE_ENTRY__INIT; ret = run_plugins(DUMP_EXT_FILE, lfd, id); if (ret < 0) return ret; xfe.id = id; xfe.fown = (FownEntry *)&p->fown; fe.type = FD_TYPES__EXT; fe.id = xfe.id; fe.ext = &xfe; rimg = img_from_set(glob_imgset, CR_FD_FILES); return pb_write_one(rimg, &fe, PB_FILE); } const struct fdtype_ops ext_dump_ops = { .type = FD_TYPES__EXT, .dump = dump_one_ext_file, }; struct ext_file_info { struct file_desc d; ExtFileEntry *xfe; }; static int open_fd(struct file_desc *d, int *new_fd) { struct ext_file_info *xfi; int fd; xfi = container_of(d, struct ext_file_info, d); fd = run_plugins(RESTORE_EXT_FILE, 
xfi->xfe->id); if (fd < 0) { pr_err("Unable to restore %#x\n", xfi->xfe->id); return -1; } if (restore_fown(fd, xfi->xfe->fown)) return -1; *new_fd = fd; return 0; } static struct file_desc_ops ext_desc_ops = { .type = FD_TYPES__EXT, .open = open_fd, }; static int collect_one_ext(void *o, ProtobufCMessage *base, struct cr_img *i) { struct ext_file_info *xfi = o; xfi->xfe = pb_msg(base, ExtFileEntry); pr_info("Collected external file with ID %#x\n", xfi->xfe->id); return file_desc_add(&xfi->d, xfi->xfe->id, &ext_desc_ops); } struct collect_image_info ext_file_cinfo = { .fd_type = CR_FD_EXT_FILES, .pb_type = PB_EXT_FILE, .priv_size = sizeof(struct ext_file_info), .collect = collect_one_ext, }; int dump_unsupp_fd(struct fd_parms *p, int lfd, char *more, char *info, FdinfoEntry *e) { int ret; ret = do_dump_gen_file(p, lfd, &ext_dump_ops, e); if (ret == 0) return 0; if (ret == -ENOTSUP) pr_err("Can't dump file %d of that type [%o] (%s %s)\n", p->fd, p->stat.st_mode, more, info); return -1; } crac-criu-1.5.0/criu/files-reg.c000066400000000000000000002003401471504326700163600ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "tty.h" #include "stats.h" #ifndef SEEK_DATA #define SEEK_DATA 3 #define SEEK_HOLE 4 #endif /* Stolen from kernel/fs/nfs/unlink.c */ #define SILLYNAME_PREF ".nfs" #define SILLYNAME_SUFF_LEN (((unsigned)sizeof(u64) << 1) + ((unsigned)sizeof(unsigned int) << 1)) /* * If the build-id exists, then it will most likely be present in the * beginning of the file. Therefore only the first 1MB will be mapped * and checked. 
*/ #define BUILD_ID_MAP_SIZE 1048576 #define ST_UNIT 512 #define EXTENT_MAX_COUNT 512 #include "cr_options.h" #include "imgset.h" #include "file-ids.h" #include "mount.h" #include "files.h" #include "common/list.h" #include "rst-malloc.h" #include "fs-magic.h" #include "namespaces.h" #include "proc_parse.h" #include "pstree.h" #include "string.h" #include "fault-injection.h" #include "external.h" #include "memfd.h" #include "protobuf.h" #include "util.h" #include "images/regfile.pb-c.h" #include "images/remap-file-path.pb-c.h" #include "files-reg.h" #include "plugin.h" #include "string.h" int setfsuid(uid_t fsuid); int setfsgid(gid_t fsuid); /* * Ghost files are those not visible from the FS. Dumping them is * nasty and the only way we have -- just carry its contents with * us. Any brave soul to implement link unlinked file back? */ struct ghost_file { struct list_head list; u32 id; u32 dev; u32 ino; struct file_remap remap; }; static u32 ghost_file_ids = 1; static LIST_HEAD(ghost_files); /* * When opening remaps we first create a link on the remap * target, then open one, then unlink. In case the remap * source has more than one instance, these three steps * should be serialized with each other. */ static mutex_t *remap_open_lock; static inline int init_remap_lock(void) { remap_open_lock = shmalloc(sizeof(*remap_open_lock)); if (!remap_open_lock) return -1; mutex_init(remap_open_lock); return 0; } static LIST_HEAD(remaps); /* * Remember the name to delete it if needed on error or * rollback action. Note we don't expect that there will * be a HUGE number of link remaps, so in a sake of speed * we keep all data in memory. 
*/ struct link_remap_rlb { struct list_head list; struct ns_id *mnt_ns; char *path; }; static int note_link_remap(char *path, struct ns_id *nsid) { struct link_remap_rlb *rlb; rlb = xmalloc(sizeof(*rlb)); if (!rlb) goto err; rlb->path = xstrdup(path); if (!rlb->path) goto err2; rlb->mnt_ns = nsid; list_add(&rlb->list, &remaps); return 0; err2: xfree(rlb); err: pr_err("Can't note link remap for %s\n", path); return -1; } /* Trim "a/b/c/d" to "a/b/d" */ static int trim_last_parent(char *path) { char *fname, *p; p = strrchr(path, '/'); fname = p + 1; if (!p || *fname == '\0') return -1; while (p >= path && *p == '/') p--; if (p < path) return -1; while (p >= path && *p != '/') p--; p++; while (*fname != '\0') *p++ = *fname++; *p = '\0'; return 0; } #define BUFSIZE (4096) static int copy_chunk_from_file(int fd, int img, off_t off, size_t len) { int ret; while (len > 0) { ret = sendfile(img, fd, &off, len); if (ret <= 0) { pr_perror("Can't send ghost to image"); return -1; } len -= ret; } return 0; } static int copy_file_to_chunks(int fd, struct cr_img *img, size_t file_size) { GhostChunkEntry ce = GHOST_CHUNK_ENTRY__INIT; off_t data, hole = 0; while (hole < file_size) { data = lseek(fd, hole, SEEK_DATA); if (data < 0) { if (errno == ENXIO) /* No data */ break; else if (hole == 0) { /* No SEEK_HOLE/DATA by FS */ data = 0; hole = file_size; } else { pr_perror("Can't seek file data"); return -1; } } else { hole = lseek(fd, data, SEEK_HOLE); if (hole < 0) { pr_perror("Can't seek file hole"); return -1; } } ce.len = hole - data; ce.off = data; if (pb_write_one(img, &ce, PB_GHOST_CHUNK)) return -1; if (copy_chunk_from_file(fd, img_raw_fd(img), ce.off, ce.len)) return -1; } return 0; } static int skip_outstanding(struct fiemap_extent *fe, size_t file_size) { /* Skip outstanding extent */ if (fe->fe_logical > file_size) return 1; /* Skip outstanding part of the extent */ if (fe->fe_logical + fe->fe_length > file_size) fe->fe_length = file_size - fe->fe_logical; return 0; } 
/*
 * Like copy_file_to_chunks(), but discover data extents with the
 * FS_IOC_FIEMAP ioctl instead of SEEK_DATA/SEEK_HOLE.
 * Returns 0 on success, -EOPNOTSUPP if the FS lacks fiemap (caller
 * falls back to the lseek variant), -1 on any other error.
 */
static int copy_file_to_chunks_fiemap(int fd, struct cr_img *img, size_t file_size)
{
	GhostChunkEntry ce = GHOST_CHUNK_ENTRY__INIT;
	struct fiemap *fiemap_buf;
	struct fiemap_extent *ext_buf;
	int ext_buf_size, fie_buf_size;
	off_t pos = 0;
	unsigned int i;
	int ret = 0;
	int exit_code = 0;

	ext_buf_size = EXTENT_MAX_COUNT * sizeof(struct fiemap_extent);
	fie_buf_size = sizeof(struct fiemap) + ext_buf_size;

	/* Single allocation: fiemap header followed by the extent array */
	fiemap_buf = xzalloc(fie_buf_size);
	if (!fiemap_buf) {
		pr_perror("Out of memory when allocating fiemap");
		return -1;
	}

	ext_buf = fiemap_buf->fm_extents;
	fiemap_buf->fm_length = FIEMAP_MAX_OFFSET;
	/* FIEMAP_FLAG_SYNC: flush dirty data first so extents are accurate */
	fiemap_buf->fm_flags |= FIEMAP_FLAG_SYNC;
	fiemap_buf->fm_extent_count = EXTENT_MAX_COUNT;

	do {
		fiemap_buf->fm_start = pos;
		memzero(ext_buf, ext_buf_size);

		ret = ioctl(fd, FS_IOC_FIEMAP, fiemap_buf);
		if (ret < 0) {
			if (errno == EOPNOTSUPP) {
				exit_code = -EOPNOTSUPP;
			} else {
				exit_code = -1;
				pr_perror("fiemap ioctl() failed");
			}
			goto out;
		} else if (fiemap_buf->fm_mapped_extents == 0) {
			/* Nothing mapped at/after pos -- done */
			goto out;
		}

		for (i = 0; i < fiemap_buf->fm_mapped_extents; i++) {
			if (skip_outstanding(&fiemap_buf->fm_extents[i], file_size))
				continue;

			ce.len = fiemap_buf->fm_extents[i].fe_length;
			ce.off = fiemap_buf->fm_extents[i].fe_logical;

			if (pb_write_one(img, &ce, PB_GHOST_CHUNK)) {
				exit_code = -1;
				goto out;
			}

			if (copy_chunk_from_file(fd, img_raw_fd(img), ce.off, ce.len)) {
				exit_code = -1;
				goto out;
			}

			if (fiemap_buf->fm_extents[i].fe_flags & FIEMAP_EXTENT_LAST) {
				/* there are no extents left, break. */
				goto out;
			}
		}
		/* Record file's logical offset as pos */
		pos = ce.len + ce.off;

		/*
		 * A full batch means there may be more extents past pos;
		 * continue querying from there.
		 */
	} while (fiemap_buf->fm_mapped_extents == EXTENT_MAX_COUNT);
out:
	xfree(fiemap_buf);
	return exit_code;
}

/*
 * Restore one chunk: seek fd to off and pull len bytes from the image
 * fd (splice when streaming, sendfile otherwise). Returns 0 or -1.
 */
static int copy_chunk_to_file(int img, int fd, off_t off, size_t len)
{
	int ret;

	while (len > 0) {
		if (lseek(fd, off, SEEK_SET) < 0) {
			pr_perror("Can't seek file");
			return -1;
		}

		if (opts.stream)
			ret = splice(img, NULL, fd, NULL, len, SPLICE_F_MOVE);
		else
			ret = sendfile(fd, img, NULL, len);
		if (ret < 0) {
			pr_perror("Can't send data");
			return -1;
		}

		off += ret;
		len -= ret;
	}

	return 0;
}

/*
 * Restore a chunked ghost file: truncate fd to the final size (so holes
 * stay sparse) and replay every GhostChunkEntry from the image.
 * Returns 0 on clean EOF, negative on error.
 */
static int copy_file_from_chunks(struct cr_img *img, int fd, size_t file_size)
{
	if (ftruncate(fd, file_size) < 0) {
		pr_perror("Can't make file size");
		return -1;
	}

	while (1) {
		int ret;
		GhostChunkEntry *ce;

		ret = pb_read_one_eof(img, &ce, PB_GHOST_CHUNK);
		if (ret <= 0)
			return ret;

		/* NOTE(review): ce is leaked on this error path -- worth fixing upstream */
		if (copy_chunk_to_file(img_raw_fd(img), fd, ce->off, ce->len))
			return -1;

		ghost_chunk_entry__free_unpacked(ce, NULL);
	}
}

/*
 * Create a regular ghost file at path and fill it from the image,
 * either chunked or as a flat copy. Unlinks the file again if the
 * data copy fails. Returns the copy result (<0 on error).
 */
static int mkreg_ghost(char *path, GhostFileEntry *gfe, struct cr_img *img)
{
	int gfd, ret;

	gfd = open(path, O_WRONLY | O_CREAT | O_EXCL, gfe->mode);
	if (gfd < 0)
		return -1;

	if (gfe->chunks) {
		if (!gfe->has_size) {
			pr_err("Corrupted ghost image -> no size\n");
			close(gfd);
			return -1;
		}

		ret = copy_file_from_chunks(img, gfd, gfe->size);
	} else
		ret = copy_file(img_raw_fd(img), gfd, 0);
	if (ret < 0)
		unlink(path);
	close(gfd);

	return ret;
}

/*
 * Recreate a ghost symlink from the recorded target.
 * Returns 0 on success, -1 on error (ENOENT is expected and retried by
 * the caller via trim_last_parent(), hence not pr_perror'ed).
 */
static int mklnk_ghost(char *path, GhostFileEntry *gfe)
{
	if (!gfe->symlnk_target) {
		pr_err("Ghost symlink target is NULL for %s. Image from old CRIU?\n", path);
		return -1;
	}

	if (symlink(gfe->symlnk_target, path) < 0) {
		/*
		 * ENOENT case is OK
		 * Take a look closer on create_ghost() function
		 */
		if (errno != ENOENT)
			pr_perror("symlink(%s, %s) failed", gfe->symlnk_target, path);
		return -1;
	}

	return 0;
}

/*
 * Apply owner/mode and, when recorded, atime/mtime to a freshly
 * created ghost entry. AT_SYMLINK_NOFOLLOW + lutimes keep the
 * operations on the link itself for symlink ghosts.
 */
static int ghost_apply_metadata(const char *path, GhostFileEntry *gfe)
{
	struct timeval tv[2];

	if (cr_fchpermat(AT_FDCWD, path, gfe->uid, gfe->gid, gfe->mode, AT_SYMLINK_NOFOLLOW) < 0)
		return -1;

	/* Old images may lack timestamps -- nothing more to do then */
	if (!gfe->atim)
		return 0;

	tv[0].tv_sec = gfe->atim->tv_sec;
	tv[0].tv_usec = gfe->atim->tv_usec;
	tv[1].tv_sec = gfe->mtim->tv_sec;
	tv[1].tv_usec = gfe->mtim->tv_usec;

	if (lutimes(path, tv)) {
		pr_perror("Can't set access and modification times on ghost %s", path);
		return -1;
	}

	return 0;
}

/*
 * Create the ghost dentry of whatever type gfe->mode says (fifo,
 * device node, dir, symlink or regular file). If the parent directory
 * is itself gone (ENOENT), retry one level up via trim_last_parent().
 * Returns 0 on success, -1 on error.
 */
static int create_ghost_dentry(char *path, GhostFileEntry *gfe, struct cr_img *img)
{
	int ret = -1;
	char *msg;

again:
	if (S_ISFIFO(gfe->mode)) {
		if ((ret = mknod(path, gfe->mode, 0)) < 0)
			msg = "Can't create node for ghost file";
	} else if (S_ISCHR(gfe->mode) || S_ISBLK(gfe->mode)) {
		if (!gfe->has_rdev) {
			pr_err("No rdev for ghost device\n");
			goto err;
		}
		if ((ret = mknod(path, gfe->mode, gfe->rdev)) < 0)
			msg = "Can't create node for ghost dev";
	} else if (S_ISDIR(gfe->mode)) {
		if ((ret = mkdirpat(AT_FDCWD, path, gfe->mode)) < 0)
			msg = "Can't make ghost dir";
	} else if (S_ISLNK(gfe->mode)) {
		if ((ret = mklnk_ghost(path, gfe)) < 0)
			msg = "Can't create ghost symlink";
	} else {
		if ((ret = mkreg_ghost(path, gfe, img)) < 0)
			msg = "Can't create ghost regfile";
	}

	if (ret < 0) {
		/* Use grand parent, if parent directory does not exist */
		if (errno == ENOENT) {
			if (trim_last_parent(path) < 0) {
				pr_err("trim failed: @%s@\n", path);
				goto err;
			}
			goto again;
		}

		pr_perror("%s", msg);
		goto err;
	}

	ret = 0;
err:
	return ret;
}

/*
 * create_ghost() variant for restores without a mount namespace:
 * the rpath is used directly under "/", and rpath is rewritten to
 * whatever path the dentry actually ended up at.
 */
static int nomntns_create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_img *img)
{
	char path[PATH_MAX];

	snprintf(path, sizeof(path), "/%s", gf->remap.rpath);
	if (create_ghost_dentry(path, gfe, img))
		return -1;
	if (ghost_apply_metadata(path, gfe))
		return -1;

	/* create_ghost_dentry() may have trimmed the path -- store the result */
	__strlcpy(gf->remap.rpath, path + 1, PATH_MAX);
	pr_debug("Remap rpath is %s\n", gf->remap.rpath);
	return 0;
}

/*
 * Create the ghost file inside its original mount: translate the
 * mntns-relative rpath to a service-mountpoint path, remount the
 * target writable if needed, create the dentry + metadata, then
 * translate the (possibly trimmed) result back to mntns-relative
 * form in gf->remap.rpath.
 */
static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_img *img)
{
	struct mount_info *mi;
	char path[PATH_MAX], *rel_path, *rel_mp;

	if (!(root_ns_mask & CLONE_NEWNS))
		return nomntns_create_ghost(gf, gfe, img);

	mi = lookup_mnt_id(gf->remap.rmnt_id);
	if (!mi) {
		pr_err("The %d mount is not found for ghost\n", gf->remap.rmnt_id);
		return -1;
	}

	/* Get path relative to mountpoint from path relative to mntns */
	rel_path = get_relative_path(gf->remap.rpath, mi->ns_mountpoint);
	if (!rel_path) {
		pr_err("Can't get path %s relative to %s\n", gf->remap.rpath, mi->ns_mountpoint);
		return -1;
	}

	snprintf(path, sizeof(path), "%s%s%s", service_mountpoint(mi), rel_path[0] ? "/" : "", rel_path);
	pr_debug("Trying to create ghost on path %s\n", path);

	/* We get here while in service mntns */
	if (try_remount_writable(mi, false))
		return -1;

	if (create_ghost_dentry(path, gfe, img))
		return -1;
	if (ghost_apply_metadata(path, gfe))
		return -1;

	/*
	 * Convert the path back to mntns relative, as create_ghost_dentry
	 * might have changed it.
	 */
	rel_path = get_relative_path(path, service_mountpoint(mi));
	if (!rel_path) {
		pr_err("Can't get path %s relative to %s\n", path, service_mountpoint(mi));
		return -1;
	}

	rel_mp = get_relative_path(mi->ns_mountpoint, "/");
	if (!rel_mp) {
		pr_err("Can't get path %s relative to %s\n", mi->ns_mountpoint, "/");
		return -1;
	}

	snprintf(gf->remap.rpath, PATH_MAX, "%s%s%s", rel_mp, (rel_mp[0] && rel_path[0]) ? "/" : "", rel_path);
	pr_debug("Remap rpath is %s\n", gf->remap.rpath);
	return 0;
}

/* Build the on-disk name used for a (non-dir) ghost file next to rfi->path */
static inline void ghost_path(char *path, int plen, struct reg_file_info *rfi, RemapFilePathEntry *rpe)
{
	snprintf(path, plen, "%s.cr.%x.ghost", rfi->path, rpe->remap_id);
}

/*
 * Attach rfi to the ghost_file with rpe->remap_id, creating the
 * (shared-memory) ghost_file bookkeeping entry on first use.
 * The actual file is created later in open_remap_ghost().
 */
static int collect_remap_ghost(struct reg_file_info *rfi, RemapFilePathEntry *rpe)
{
	struct ghost_file *gf;

	list_for_each_entry(gf, &ghost_files, list)
		if (gf->id == rpe->remap_id)
			goto gf_found;

	/*
	 * Ghost not found. We will create one in the same dir
	 * as the very first client of it thus resolving any
	 * issues with cross-device links.
	 */
	pr_info("Opening ghost file %#x for %s\n", rpe->remap_id, rfi->path);

	gf = shmalloc(sizeof(*gf));
	if (!gf)
		return -1;

	/*
	 * The rpath is shmalloc-ed because we create the ghost
	 * file in root task context and generate its path there.
	 * However the path should be visible by the criu task
	 * in order to remove the ghost files from root FS (see
	 * try_clean_remaps()).
	 */
	gf->remap.rpath = shmalloc(PATH_MAX);
	if (!gf->remap.rpath)
		return -1;
	gf->remap.rpath[0] = 0;
	gf->id = rpe->remap_id;
	list_add_tail(&gf->list, &ghost_files);

gf_found:
	/*
	 * NOTE(review): for a freshly shmalloc-ed gf, remap.is_dir is read
	 * here before open_remap_ghost() sets it -- presumably shmalloc
	 * zeroes the memory; verify.
	 */
	rfi->is_dir = gf->remap.is_dir;
	rfi->remap = &gf->remap;
	return 0;
}

/*
 * Materialize a ghost remap on restore: read the GhostFileEntry image,
 * pick the on-disk path (the original path for dirs, a ".cr.<id>.ghost"
 * neighbour otherwise) and create the file via create_ghost().
 * Idempotent: a non-empty rpath means it was already done.
 */
static int open_remap_ghost(struct reg_file_info *rfi, RemapFilePathEntry *rpe)
{
	struct ghost_file *gf = container_of(rfi->remap, struct ghost_file, remap);
	GhostFileEntry *gfe = NULL;
	struct cr_img *img;

	if (rfi->remap->rpath[0])
		return 0;

	img = open_image(CR_FD_GHOST_FILE, O_RSTR, rpe->remap_id);
	if (!img)
		goto err;

	if (pb_read_one(img, &gfe, PB_GHOST_FILE) < 0)
		goto close_ifd;

	/*
	 * For old formats where optional has_[dev|ino] is
	 * not present we will have zeros here which is quite
	 * a sign for "absent" fields.
	 */
	gf->dev = gfe->dev;
	gf->ino = gfe->ino;
	gf->remap.rmnt_id = rfi->rfe->mnt_id;

	if (S_ISDIR(gfe->mode))
		__strlcpy(gf->remap.rpath, rfi->path, PATH_MAX);
	else
		ghost_path(gf->remap.rpath, PATH_MAX, rfi, rpe);

	if (create_ghost(gf, gfe, img))
		goto close_ifd;

	close_image(img);

	gf->remap.is_dir = S_ISDIR(gfe->mode);
	gf->remap.uid = gfe->uid;
	gf->remap.gid = gfe->gid;
	ghost_file_entry__free_unpacked(gfe, NULL);

	return 0;

close_ifd:
	close_image(img);
err:
	if (gfe)
		ghost_file_entry__free_unpacked(gfe, NULL);
	return -1;
}

/*
 * Attach rfi to a linked remap: the remap target is another regular
 * file already collected with id rpe->remap_id; borrow its path.
 */
static int collect_remap_linked(struct reg_file_info *rfi, RemapFilePathEntry *rpe)
{
	struct file_remap *rm;
	struct file_desc *rdesc;
	struct reg_file_info *rrfi;

	rdesc = find_file_desc_raw(FD_TYPES__REG, rpe->remap_id);
	if (!rdesc) {
		pr_err("Can't find target file %x\n", rpe->remap_id);
		return -1;
	}

	rm = xmalloc(sizeof(*rm));
	if (!rm)
		return -1;

	rrfi = container_of(rdesc, struct reg_file_info, d);
	pr_info("Remapped %s -> %s\n", rfi->path, rrfi->path);

	/* rpath is shared with the target rfi -- not owned by rm */
	rm->rpath = rrfi->path;
	rm->is_dir = false;
	rm->uid = -1;
	rm->gid = -1;
	rm->rmnt_id = rfi->rfe->mnt_id;
	rfi->remap = rm;
	return 0;
}

/*
 * In a user namespace restore, learn the real owner of the link-remap
 * target so permissions can be restored correctly; no-op otherwise.
 */
static int open_remap_linked(struct reg_file_info *rfi)
{
	if (root_ns_mask & CLONE_NEWUSER) {
		int rfd;
		struct stat st;

		rfd = mntns_get_root_by_mnt_id(rfi->rfe->mnt_id);
		if (fstatat(rfd, rfi->remap->rpath, &st, AT_SYMLINK_NOFOLLOW)) {
			pr_perror("Can't get owner of link remap %s", rfi->remap->rpath);
			return -1;
		}

		rfi->remap->uid = st.st_uid;
		rfi->remap->gid = st.st_gid;
	}

	return 0;
}

/*
 * A /proc/<pid> remap for a dead process: insert a helper task with
 * that pid into the pstree so the /proc entry exists during restore.
 * Skipped when a task with this pid is already being restored.
 */
static int collect_remap_dead_process(struct reg_file_info *rfi, RemapFilePathEntry *rfe)
{
	struct pstree_item *helper;

	helper = lookup_create_item(rfe->remap_id);
	if (!helper)
		return -1;

	if (helper->pid->state != TASK_UNDEF) {
		pr_info("Skipping helper for restoring /proc/%d; pid exists\n", rfe->remap_id);
		return 0;
	}

	/* Make the helper a child of root with root's session/group/ids */
	helper->sid = root_item->sid;
	helper->pgid = root_item->pgid;
	helper->pid->ns[0].virt = rfe->remap_id;
	helper->parent = root_item;
	helper->ids = root_item->ids;

	if (init_pstree_helper(helper)) {
		pr_err("Can't init helper\n");
		return -1;
	}

	list_add_tail(&helper->sibling, &root_item->children);

	pr_info("Added a helper for restoring /proc/%d\n", vpid(helper));

	return 0;
}

struct remap_info {
	struct list_head list;	   /* linkage in the "remaps" list */
	RemapFilePathEntry *rpe;   /* the image entry */
	struct reg_file_info *rfi; /* file the remap applies to */
};

/*
 * Image collector callback for CR_FD_REMAP_FPATH entries: resolve the
 * original file, normalize legacy entries (remap type encoded in the
 * id's REMAP_GHOST bit), and dispatch per-type collection.
 */
static int collect_one_remap(void *obj, ProtobufCMessage *msg, struct cr_img *i)
{
	struct remap_info *ri = obj;
	RemapFilePathEntry *rpe;
	struct file_desc *fdesc;

	ri->rpe = rpe = pb_msg(msg, RemapFilePathEntry);

	if (!rpe->has_remap_type) {
		rpe->has_remap_type = true;
		/* backward compatibility with images */
		if (rpe->remap_id & REMAP_GHOST) {
			rpe->remap_id &= ~REMAP_GHOST;
			rpe->remap_type = REMAP_TYPE__GHOST;
		} else
			rpe->remap_type = REMAP_TYPE__LINKED;
	}

	fdesc = find_file_desc_raw(FD_TYPES__REG, rpe->orig_id);
	if (fdesc == NULL) {
		pr_err("Remap for non existing file %#x\n", rpe->orig_id);
		return -1;
	}

	ri->rfi = container_of(fdesc, struct reg_file_info, d);

	switch (rpe->remap_type) {
	case REMAP_TYPE__GHOST:
		if (collect_remap_ghost(ri->rfi, ri->rpe))
			return -1;
		break;
	case REMAP_TYPE__LINKED:
		if (collect_remap_linked(ri->rfi, ri->rpe))
			return -1;
		break;
	case REMAP_TYPE__PROCFS:
		if (collect_remap_dead_process(ri->rfi, rpe) < 0)
			return -1;
		break;
	default:
		break;
	}

	list_add_tail(&ri->list, &remaps);

	return 0;
}

/*
 * Second restore phase for one remap: actually open/create whatever
 * the remap type needs (the PROCFS kind was fully handled at collect
 * time). Returns 0 on success, -1 on failure or unknown type.
 */
static int prepare_one_remap(struct remap_info *ri)
{
	int ret = -1;
	RemapFilePathEntry *rpe = ri->rpe;
	struct reg_file_info *rfi = ri->rfi;

	pr_info("Configuring remap %#x -> %#x\n", rfi->rfe->id, rpe->remap_id);

	switch (rpe->remap_type) {
	case REMAP_TYPE__LINKED:
		ret = open_remap_linked(rfi);
		break;
	case REMAP_TYPE__GHOST:
		ret = open_remap_ghost(rfi, rpe);
		break;
	case REMAP_TYPE__PROCFS:
		/* handled earlier by collect_remap_dead_process */
		ret = 0;
		break;
	default:
		pr_err("unknown remap type %u\n", rpe->remap_type);
		goto out;
	}

out:
	return ret;
}

/*
 * Prepare all collected remaps for restore: set up the shared
 * open-serialization lock, then run prepare_one_remap() on each,
 * stopping at the first failure.
 */
int prepare_remaps(void)
{
	struct remap_info *ri;
	int ret = 0;

	ret = init_remap_lock();
	if (ret)
		return ret;

	list_for_each_entry(ri, &remaps, list) {
		ret = prepare_one_remap(ri);
		if (ret)
			break;
	}

	return ret;
}

/*
 * Remove one restored remap file/dir from disk. Resolves the
 * mntns-relative rpath through the mount tree (unless there is no
 * mount namespace) and unlinks/rmdirs it; rpath is cleared on success
 * so the cleanup is idempotent.
 */
static int clean_one_remap(struct remap_info *ri)
{
	struct file_remap *remap = ri->rfi->remap;
	int mnt_id, ret;
	struct mount_info *mi;
	char path[PATH_MAX], *rel_path;

	if (remap->rpath[0] == 0)
		return 0;

	if (!(root_ns_mask & CLONE_NEWNS)) {
		snprintf(path, sizeof(path), "/%s", remap->rpath);
		goto nomntns;
	}

	mnt_id = ri->rfi->rfe->mnt_id; /* rirfirfe %) */
	mi = lookup_mnt_id(mnt_id);
	if (!mi) {
		pr_err("The %d mount is not found for ghost\n", mnt_id);
		return -1;
	}

	rel_path = get_relative_path(remap->rpath, mi->ns_mountpoint);
	if (!rel_path) {
		pr_err("Can't get path %s relative to %s\n", remap->rpath, mi->ns_mountpoint);
		return -1;
	}

	snprintf(path, sizeof(path), "%s%s%s", service_mountpoint(mi), strlen(rel_path) ? "/" : "", rel_path);

	/* We get here while in service mntns */
	if (try_remount_writable(mi, false))
		return -1;

nomntns:
	pr_info("Unlink remap %s\n", path);

	if (remap->is_dir)
		ret = rmdir(path);
	else
		ret = unlink(path);
	if (ret) {
		pr_perror("Couldn't unlink remap %s", path);
		return -1;
	}

	remap->rpath[0] = 0;

	return 0;
}

/*
 * Remove restored remap artifacts from disk: always ghosts, and link
 * remaps too unless only_ghosts is set. Failures are OR-ed together so
 * every entry gets an attempt; returns non-zero if any cleanup failed.
 */
int try_clean_remaps(bool only_ghosts)
{
	struct remap_info *ri;
	int ret = 0;

	list_for_each_entry(ri, &remaps, list) {
		if (ri->rpe->remap_type == REMAP_TYPE__GHOST)
			ret |= clean_one_remap(ri);
		else if (only_ghosts)
			continue;
		else if (ri->rpe->remap_type == REMAP_TYPE__LINKED)
			ret |= clean_one_remap(ri);
	}

	return ret;
}

/* Collector descriptor wiring collect_one_remap to CR_FD_REMAP_FPATH images */
static struct collect_image_info remap_cinfo = {
	.fd_type = CR_FD_REMAP_FPATH,
	.pb_type = PB_REMAP_FPATH,
	.priv_size = sizeof(struct remap_info),
	.collect = collect_one_remap,
};

/* Tiny files don't need to generate chunks in ghost image.
*/ #define GHOST_CHUNKS_THRESH (3 * 4096) static int dump_ghost_file(int _fd, u32 id, const struct stat *st, dev_t phys_dev) { struct cr_img *img; int exit_code = -1; GhostFileEntry gfe = GHOST_FILE_ENTRY__INIT; Timeval atim = TIMEVAL__INIT, mtim = TIMEVAL__INIT; char pathbuf[PATH_MAX]; pr_info("Dumping ghost file contents (id %#x)\n", id); img = open_image(CR_FD_GHOST_FILE, O_DUMP, id); if (!img) return -1; gfe.uid = userns_uid(st->st_uid); gfe.gid = userns_gid(st->st_gid); gfe.mode = st->st_mode; gfe.atim = &atim; gfe.mtim = &mtim; gfe.atim->tv_sec = st->st_atim.tv_sec; gfe.atim->tv_usec = st->st_atim.tv_nsec / 1000; gfe.mtim->tv_sec = st->st_mtim.tv_sec; gfe.mtim->tv_usec = st->st_mtim.tv_nsec / 1000; gfe.has_dev = gfe.has_ino = true; gfe.dev = phys_dev; gfe.ino = st->st_ino; if (S_ISCHR(st->st_mode) || S_ISBLK(st->st_mode)) { gfe.has_rdev = true; gfe.rdev = st->st_rdev; } if (S_ISREG(st->st_mode) && (st->st_size >= GHOST_CHUNKS_THRESH)) { gfe.has_chunks = gfe.chunks = true; gfe.has_size = true; gfe.size = st->st_size; } /* * We set gfe.symlnk_target only if we need to dump * symlink content, otherwise we leave it NULL. * It will be taken into account on restore in mklnk_ghost function. */ if (S_ISLNK(st->st_mode)) { ssize_t ret; /* * We assume that _fd opened with O_PATH | O_NOFOLLOW * flags because S_ISLNK(st->st_mode). With current kernel version, * it's looks like correct assumption in any case. 
*/ ret = readlinkat(_fd, "", pathbuf, sizeof(pathbuf) - 1); if (ret < 0) { pr_perror("Can't readlinkat"); goto err_out; } pathbuf[ret] = 0; if (ret != st->st_size) { pr_err("Buffer for readlinkat is too small: ret %zd, st_size %" PRId64 ", buf %u %s\n", ret, st->st_size, PATH_MAX, pathbuf); goto err_out; } gfe.symlnk_target = pathbuf; } if (pb_write_one(img, &gfe, PB_GHOST_FILE)) goto err_out; if (S_ISREG(st->st_mode)) { int fd, ret; /* * Reopen file locally since it may have no read * permissions when drained */ fd = open_proc(PROC_SELF, "fd/%d", _fd); if (fd < 0) { pr_perror("Can't open ghost original file"); goto err_out; } if (gfe.chunks) { if (opts.ghost_fiemap) { ret = copy_file_to_chunks_fiemap(fd, img, st->st_size); if (ret == -EOPNOTSUPP) { pr_debug("file system don't support fiemap\n"); ret = copy_file_to_chunks(fd, img, st->st_size); } } else { ret = copy_file_to_chunks(fd, img, st->st_size); } } else { ret = copy_file(fd, img_raw_fd(img), st->st_size); } close(fd); if (ret) goto err_out; } exit_code = 0; err_out: close_image(img); return exit_code; } struct file_remap *lookup_ghost_remap(u32 dev, u32 ino) { struct ghost_file *gf; list_for_each_entry(gf, &ghost_files, list) { if (gf->ino == ino && (gf->dev == dev)) { return &gf->remap; } } return NULL; } static int dump_ghost_remap(char *path, const struct stat *st, int lfd, u32 id, struct ns_id *nsid) { struct ghost_file *gf; RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT; dev_t phys_dev; pr_info("Dumping ghost file for fd %d id %#x\n", lfd, id); if (st->st_blocks * ST_UNIT > opts.ghost_limit) { pr_err("Can't dump ghost file %s of %" PRIu64 " size, increase limit\n", path, st->st_blocks * ST_UNIT); return -1; } phys_dev = phys_stat_resolve_dev(nsid, st->st_dev, path); list_for_each_entry(gf, &ghost_files, list) if ((gf->dev == phys_dev) && (gf->ino == st->st_ino)) goto dump_entry; gf = xmalloc(sizeof(*gf)); if (gf == NULL) return -1; gf->dev = phys_dev; gf->ino = st->st_ino; gf->id = 
ghost_file_ids++; if (dump_ghost_file(lfd, gf->id, st, phys_dev)) { xfree(gf); return -1; } list_add_tail(&gf->list, &ghost_files); dump_entry: rpe.orig_id = id; rpe.remap_id = gf->id; rpe.has_remap_type = true; rpe.remap_type = REMAP_TYPE__GHOST; return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH), &rpe, PB_REMAP_FPATH); } static void __rollback_link_remaps(bool do_unlink) { struct link_remap_rlb *rlb, *tmp; int mntns_root; list_for_each_entry_safe(rlb, tmp, &remaps, list) { if (do_unlink) { mntns_root = mntns_get_root_fd(rlb->mnt_ns); if (mntns_root >= 0) unlinkat(mntns_root, rlb->path, 0); else pr_err("Failed to clenaup %s link remap\n", rlb->path); } list_del(&rlb->list); xfree(rlb->path); xfree(rlb); } } void delete_link_remaps(void) { __rollback_link_remaps(true); } void free_link_remaps(void) { __rollback_link_remaps(false); } static int linkat_hard(int odir, char *opath, int ndir, char *npath, uid_t uid, gid_t gid, int flags); static void check_overlayfs_fallback(char *path, const struct fd_parms *parms, bool *fallback) { if (!fallback || parms->fs_type != OVERLAYFS_SUPER_MAGIC) return; /* * In overlayFS, linkat() fails with ENOENT if the removed file is * originated from lower layer. The cause of failure is that linkat() * sees the file has st_nlink=0, which is different than st_nlink=1 we * got from earlier fstat() on lfd. By setting *fb=true, we will fall * back to dump_ghost_remap() as it is what should have been done to * removed files with st_nlink=0. 
*/ pr_info("Unable to link-remap %s on overlayFS, fall back to dump_ghost_remap\n", path); *fallback = true; } static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_id *nsid, const struct fd_parms *parms, bool *fallback) { char link_name[PATH_MAX], *tmp; FileEntry fe = FILE_ENTRY__INIT; RegFileEntry rfe = REG_FILE_ENTRY__INIT; FownEntry fwn = FOWN_ENTRY__INIT; int mntns_root; const struct stat *ost = &parms->stat; if (!opts.link_remap_ok) { pr_err("Can't create link remap for %s. " "Use " LREMAP_PARAM " option.\n", path); return -1; } /* * Linked remapping -- we create a hard link on a removed file * in the directory original file used to sit. * * Bad news is than we can't easily open lfd's parent dir. Thus * we have to just generate an absolute path and use it. The linkat * will fail if we chose the bad one. */ link_name[0] = '.'; memcpy(link_name + 1, path, len); tmp = link_name + len; while (*tmp != '/') { BUG_ON(tmp == link_name); tmp--; } fd_id_generate_special(NULL, idp); rfe.id = *idp; rfe.flags = 0; rfe.pos = 0; rfe.fown = &fwn; rfe.name = link_name + 1; /* Any 'unique' name works here actually. Remap works by reg-file ids. */ snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name - 1), "link_remap.%d", rfe.id); mntns_root = mntns_get_root_fd(nsid); while (linkat_hard(lfd, "", mntns_root, link_name, ost->st_uid, ost->st_gid, AT_EMPTY_PATH) < 0) { if (errno != ENOENT) { pr_perror("Can't link remap to %s", path); return -1; } /* Use grand parent, if parent directory does not exist. 
*/ if (trim_last_parent(link_name) < 0) { pr_err("trim failed: @%s@\n", link_name); check_overlayfs_fallback(path, parms, fallback); return -1; } } if (note_link_remap(link_name, nsid)) return -1; fe.type = FD_TYPES__REG; fe.id = rfe.id; fe.reg = &rfe; return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); } static int dump_linked_remap(char *path, int len, const struct fd_parms *parms, int lfd, u32 id, struct ns_id *nsid, bool *fallback) { u32 lid; RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT; if (create_link_remap(path, len, lfd, &lid, nsid, parms, fallback)) return -1; rpe.orig_id = id; rpe.remap_id = lid; rpe.has_remap_type = true; rpe.remap_type = REMAP_TYPE__LINKED; return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH), &rpe, PB_REMAP_FPATH); } static pid_t *dead_pids; static int n_dead_pids; int dead_pid_conflict(void) { int i; for (i = 0; i < n_dead_pids; i++) { struct pid *node; pid_t pid = dead_pids[i]; node = pstree_pid_by_virt(pid); if (!node) continue; /* Main thread */ if (node->state != TASK_THREAD) continue; pr_err("Conflict with a dead task with the same PID as of this thread (virt %d, real %d).\n", node->ns[0].virt, node->real); return -1; } return 0; } static int have_seen_dead_pid(pid_t pid) { int i; for (i = 0; i < n_dead_pids; i++) { if (dead_pids[i] == pid) return 1; } if (xrealloc_safe(&dead_pids, sizeof(*dead_pids) * (n_dead_pids + 1))) return -1; dead_pids[n_dead_pids++] = pid; return 0; } static int dump_dead_process_remap(pid_t pid, u32 id) { RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT; int ret; ret = have_seen_dead_pid(pid); if (ret < 0) return -1; if (ret) { pr_info("Found dead pid %d already, skipping remap\n", pid); return 0; } rpe.orig_id = id; rpe.remap_id = pid; rpe.has_remap_type = true; rpe.remap_type = REMAP_TYPE__PROCFS; return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH), &rpe, PB_REMAP_FPATH); } static bool is_sillyrename_name(char *name) { int i; name = 
strrchr(name, '/'); BUG_ON(name == NULL); /* see check in dump_one_reg_file */ name++; /* * Strictly speaking this check is not bullet-proof. User * can create file with this name by hands and we have no * API to distinguish really-silly-renamed files from those * fake names :( * * But since NFS people expect .nfsXXX files to be unstable, * we treat them as such too. */ if (strncmp(name, SILLYNAME_PREF, sizeof(SILLYNAME_PREF) - 1)) return false; name += sizeof(SILLYNAME_PREF) - 1; for (i = 0; i < SILLYNAME_SUFF_LEN; i++) if (!isxdigit(name[i])) return false; return true; } static inline bool nfs_silly_rename(char *rpath, const struct fd_parms *parms) { return (parms->fs_type == NFS_SUPER_MAGIC) && is_sillyrename_name(rpath); } static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, int lfd, u32 id, struct ns_id *nsid) { char *rpath = link->name; int plen = link->len; int ret, mntns_root; struct stat pst; const struct stat *ost = &parms->stat; int flags = 0; bool fallback = false; if (parms->fs_type == PROC_SUPER_MAGIC) { /* The file points to /proc/pid/ where pid is a dead * process. We remap this file by adding this pid to be * fork()ed into a TASK_HELPER state so that we can point to it * on restore. */ pid_t pid; char *start, *end; /* skip "./proc/" */ start = strstr(rpath, "/"); if (!start) return -1; start = strstr(start + 1, "/"); if (!start) /* it's /proc */ return 0; pid = strtol(start + 1, &end, 10); /* If strtol didn't convert anything, then we are looking at * something like /proc/kmsg, which we shouldn't mess with. * Anything under /proc/ (including that directory itself) * can be c/r'd with a dead pid remap, so let's allow all such * cases. 
*/ if (pid != 0) { bool is_dead = link_strip_deleted(link); mntns_root = mntns_get_root_fd(nsid); if (mntns_root < 0) return -1; /* /proc/ will be "/proc/1 (deleted)" when it is * dead, but a path like /proc/1/mountinfo won't have * the suffix, since it isn't actually deleted (still * exists, but the parent dir is deleted). So, if we * have a path like /proc/1/mountinfo, test if /proc/1 * exists instead, since this is what CRIU will need to * open on restore. */ if (!is_dead) { *end = 0; is_dead = faccessat(mntns_root, rpath, F_OK, 0); *end = '/'; } if (is_dead) { pr_info("Dumping dead process remap of %d\n", pid); return dump_dead_process_remap(pid, id); } } return 0; } else if (parms->fs_type == DEVPTS_SUPER_MAGIC) { /* * It's safe to call stripping here because * file paths are having predefined format for * this FS and can't have a valid " (deleted)" * postfix as a part of not deleted filename. */ link_strip_deleted(link); /* * Devpts devices/files are generated by the * kernel itself so we should not try to generate * any kind of ghost files here even if file is * no longer exist. */ return 0; } if (ost->st_nlink == 0) { /* * Unpleasant, but easy case. File is completely invisible * from the FS. Just dump its contents and that's it. But * be careful whether anybody still has any of its hardlinks * also open. */ link_strip_deleted(link); return dump_ghost_remap(rpath + 1, ost, lfd, id, nsid); } if (nfs_silly_rename(rpath, parms)) { /* * If this is NFS silly-rename file the path we have at hands * will be accessible by fstat(), but once we kill the dumping * tasks it will disappear. So we just go ahead an dump it as * linked-remap file (NFS will allow us to create more hard * links on it) to have some persistent name at hands. 
*/ pr_debug("Dump silly-rename linked remap for %x\n", id); return dump_linked_remap(rpath + 1, plen - 1, parms, lfd, id, nsid, NULL); } mntns_root = mntns_get_root_fd(nsid); if (mntns_root < 0) return -1; if (S_ISLNK(parms->stat.st_mode)) flags = AT_SYMLINK_NOFOLLOW; ret = fstatat(mntns_root, rpath, &pst, flags); if (ret < 0) { /* * Linked file, but path is not accessible (unless any * other error occurred). We can create a temporary link to it * using linkat with AT_EMPTY_PATH flag and remap it to this * name. */ if (errno == ENOENT) { link_strip_deleted(link); ret = dump_linked_remap(rpath + 1, plen - 1, parms, lfd, id, nsid, &fallback); if (ret < 0 && fallback) { /* fallback is true only if following conditions are true: * 1. linkat() inside dump_linked_remap() failed with ENOENT * 2. parms->fs_type == overlayFS */ return dump_ghost_remap(rpath + 1, ost, lfd, id, nsid); } return ret; } pr_perror("Can't stat path"); return -1; } if ((pst.st_ino != ost->st_ino) || (pst.st_dev != ost->st_dev)) { if (opts.evasive_devices && (S_ISCHR(ost->st_mode) || S_ISBLK(ost->st_mode)) && pst.st_rdev == ost->st_rdev) return 0; /* * FIXME linked file, but the name we see it by is reused * by somebody else. We can dump it with linked remaps, but * we'll have difficulties on restore -- we will have to * move the existing file aside, then restore this one, * unlink, then move the original file back. It's fairly * easy to do, but we don't do it now, since unlinked files * have the "(deleted)" suffix in proc and name conflict * is unlikely :) */ pr_err("Unaccessible path opened %u:%u, need %u:%u\n", (int)pst.st_dev, (int)pst.st_ino, (int)ost->st_dev, (int)ost->st_ino); return -1; } /* * File is linked and visible by the name it is opened by * this task. Go ahead and dump it. */ return 0; } static bool should_check_size(int flags) { /* Skip size if file has O_APPEND and O_WRONLY flags (e.g. log file). 
*/ if (((flags & O_ACCMODE) == O_WRONLY) && (flags & O_APPEND)) return false; return true; } /* * Gets the build-id (If it exists) from 32-bit ELF files. * Returns the number of bytes of the build-id if it could * be obtained, else -1. */ static int get_build_id_32(Elf32_Ehdr *file_header, unsigned char **build_id, const int fd, size_t mapped_size) { int size, num_iterations; size_t file_header_end; Elf32_Phdr *program_header, *program_header_end; Elf32_Nhdr *note_header_end, *note_header = NULL; file_header_end = (size_t)file_header + mapped_size; if (sizeof(Elf32_Ehdr) > mapped_size) return -1; /* * If the file doesn't have at least 1 program header entry, it definitely can't * have a build-id. */ if (!file_header->e_phnum) { pr_warn("Couldn't find any program headers for file with fd %d\n", fd); return -1; } program_header = (Elf32_Phdr *)(file_header->e_phoff + (char *)file_header); if (program_header <= (Elf32_Phdr *)file_header) return -1; program_header_end = (Elf32_Phdr *)(file_header_end - sizeof(Elf32_Phdr)); /* * If the file has a build-id, it will be in the PT_NOTE program header * entry AKA the note sections. */ for (num_iterations = 0; num_iterations < file_header->e_phnum; num_iterations++, program_header++) { if (program_header > program_header_end) break; if (program_header->p_type != PT_NOTE) continue; note_header = (Elf32_Nhdr *)(program_header->p_offset + (char *)file_header); if (note_header <= (Elf32_Nhdr *)file_header) { note_header = NULL; continue; } note_header_end = (Elf32_Nhdr *)min_t(char *, (char *)note_header + program_header->p_filesz, (char *)(file_header_end - sizeof(Elf32_Nhdr))); /* The note type for the build-id is NT_GNU_BUILD_ID. 
 */
		/*
		 * Walk the notes in this PT_NOTE segment looking for the
		 * build-id note.  Each note record is a header followed by
		 * a 4-byte-aligned name and a 4-byte-aligned descriptor.
		 */
		while (note_header <= note_header_end && note_header->n_type != NT_GNU_BUILD_ID)
			note_header = (Elf32_Nhdr *)((char *)note_header + sizeof(Elf32_Nhdr) +
						     ALIGN(note_header->n_namesz, 4) + ALIGN(note_header->n_descsz, 4));

		/* Walked off the segment without a match: try the next PT_NOTE. */
		if (note_header > note_header_end) {
			note_header = NULL;
			continue;
		}
		break;
	}

	if (!note_header) {
		pr_debug("Couldn't find the build-id note for file with fd %d\n", fd);
		return -1;
	}

	/*
	 * If the size of the notes description is too large or is invalid
	 * then the build-id could not be obtained.
	 * NOTE(review): n_descsz is unsigned, so the "<= 0" half of this test
	 * only catches 0; 512 is an arbitrary sanity cap well above the 20/32
	 * bytes a real SHA-1/SHA-256 build-id occupies.
	 */
	if (note_header->n_descsz <= 0 || note_header->n_descsz > 512) {
		pr_warn("Invalid description size for build-id note for file with fd %d\n", fd);
		return -1;
	}

	size = note_header->n_descsz;
	/* Skip the note header and the (aligned) name to reach the descriptor. */
	note_header = (Elf32_Nhdr *)((char *)note_header + sizeof(Elf32_Nhdr) + ALIGN(note_header->n_namesz, 4));

	/* Re-check that the whole descriptor lies inside the mapped window. */
	note_header_end = (Elf32_Nhdr *)(file_header_end - size);
	if (note_header <= (Elf32_Nhdr *)file_header || note_header > note_header_end)
		return -1;

	*build_id = (unsigned char *)xmalloc(size);
	if (!*build_id)
		return -1;

	memcpy(*build_id, (void *)note_header, size);
	return size;
}

/*
 * Gets the build-id (If it exists) from 64-bit ELF files.
 * Returns the number of bytes of the build-id if it could
 * be obtained, else -1.
 *
 * @file_header  start of the read-only mapping of the candidate ELF file
 * @build_id     out: xmalloc()ed copy of the build-id bytes (caller frees)
 * @fd           used for log messages only
 * @mapped_size  number of bytes actually mapped at @file_header; every
 *               pointer derived below is validated against this window,
 *               since the note tables of a truncated/corrupt file may
 *               point outside it.
 */
static int get_build_id_64(Elf64_Ehdr *file_header, unsigned char **build_id, const int fd, size_t mapped_size)
{
	int size, num_iterations;
	size_t file_header_end;
	Elf64_Phdr *program_header, *program_header_end;
	Elf64_Nhdr *note_header_end, *note_header = NULL;

	/* One-past-the-end of the mapped window; all bounds checks use it. */
	file_header_end = (size_t)file_header + mapped_size;
	if (sizeof(Elf64_Ehdr) > mapped_size)
		return -1;

	/*
	 * If the file doesn't have at least 1 program header entry, it definitely can't
	 * have a build-id.
 */
	if (!file_header->e_phnum) {
		pr_warn("Couldn't find any program headers for file with fd %d\n", fd);
		return -1;
	}

	program_header = (Elf64_Phdr *)(file_header->e_phoff + (char *)file_header);
	/* e_phoff of 0 (or a wrapping offset) would alias the ELF header — reject. */
	if (program_header <= (Elf64_Phdr *)file_header)
		return -1;

	/* Last address at which a whole Elf64_Phdr still fits in the mapping. */
	program_header_end = (Elf64_Phdr *)(file_header_end - sizeof(Elf64_Phdr));

	/*
	 * If the file has a build-id, it will be in the PT_NOTE program header
	 * entry AKA the note sections.
	 */
	for (num_iterations = 0; num_iterations < file_header->e_phnum; num_iterations++, program_header++) {
		if (program_header > program_header_end)
			break;
		if (program_header->p_type != PT_NOTE)
			continue;

		note_header = (Elf64_Nhdr *)(program_header->p_offset + (char *)file_header);
		if (note_header <= (Elf64_Nhdr *)file_header) {
			note_header = NULL;
			continue;
		}

		/* Clamp the walk to both the segment size and the mapped window. */
		note_header_end = (Elf64_Nhdr *)min_t(char *, (char *)note_header + program_header->p_filesz,
						      (char *)(file_header_end - sizeof(Elf64_Nhdr)));

		/* The note type for the build-id is NT_GNU_BUILD_ID. */
		while (note_header <= note_header_end && note_header->n_type != NT_GNU_BUILD_ID)
			note_header = (Elf64_Nhdr *)((char *)note_header + sizeof(Elf64_Nhdr) +
						     ALIGN(note_header->n_namesz, 4) + ALIGN(note_header->n_descsz, 4));

		/* Walked off the segment without a match: try the next PT_NOTE. */
		if (note_header > note_header_end) {
			note_header = NULL;
			continue;
		}
		break;
	}

	if (!note_header) {
		pr_debug("Couldn't find the build-id note for file with fd %d\n", fd);
		return -1;
	}

	/*
	 * If the size of the notes description is too large or is invalid
	 * then the build-id could not be obtained.
 */
	if (note_header->n_descsz <= 0 || note_header->n_descsz > 512) {
		pr_warn("Invalid description size for build-id note for file with fd %d\n", fd);
		return -1;
	}

	size = note_header->n_descsz;
	/* Skip the note header and the (aligned) name to reach the descriptor. */
	note_header = (Elf64_Nhdr *)((char *)note_header + sizeof(Elf64_Nhdr) + ALIGN(note_header->n_namesz, 4));

	/* Re-check that the whole descriptor lies inside the mapped window. */
	note_header_end = (Elf64_Nhdr *)(file_header_end - size);
	if (note_header <= (Elf64_Nhdr *)file_header || note_header > note_header_end)
		return -1;

	*build_id = (unsigned char *)xmalloc(size);
	if (!*build_id)
		return -1;

	memcpy(*build_id, (void *)note_header, size);
	return size;
}

/*
 * Finds the build-id of the file by checking if the file is an ELF file
 * and then calling either the 32-bit or the 64-bit function as necessary.
 * Returns the number of bytes of the build-id if it could be
 * obtained, else -1.
 */
static int get_build_id(const int fd, const struct stat *fd_status, unsigned char **build_id)
{
	char *start_addr;
	size_t mapped_size;
	int ret = -1;

	/*
	 * If the build-id exists, then it will most likely be present in the
	 * beginning of the file. Therefore at most only the first 1 MB of the
	 * file is mapped.
	 */
	mapped_size = min_t(size_t, fd_status->st_size, BUILD_ID_MAP_SIZE);
	start_addr = mmap(0, mapped_size, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0);
	if ((void*)start_addr == MAP_FAILED) {
		pr_warn("Couldn't mmap file with fd %d\n", fd);
		return -1;
	}

	/*
	 * The first 4 bytes contain a magic number identifying the file as an
	 * ELF file. They should contain the characters ‘\x7f’, ‘E’, ‘L’, and
	 * ‘F’, respectively. These characters are together defined as ELFMAG.
 */
	/* Not an ELF file at all: no build-id, ret stays -1. */
	if (memcmp(start_addr, ELFMAG, SELFMAG))
		goto out;

	/*
	 * Dispatch on the ELF class byte.  These are two independent "if"s,
	 * so a corrupt EI_CLASS simply falls through with ret == -1.
	 */
	if (start_addr[EI_CLASS] == ELFCLASS32)
		ret = get_build_id_32((Elf32_Ehdr *)start_addr, build_id, fd, mapped_size);
	if (start_addr[EI_CLASS] == ELFCLASS64)
		ret = get_build_id_64((Elf64_Ehdr *)start_addr, build_id, fd, mapped_size);

out:
	munmap(start_addr, mapped_size);
	return ret;
}

/*
 * Finds and stores the build-id of a file, if it exists, so that it can be validated
 * while restoring.
 * Returns 1 if the build-id of the file could be stored, -1 if there was an error
 * or 0 if the build-id could not be obtained.
 *
 * @rfe  image entry that receives the build-id (rfe->build_id / rfe->n_build_id)
 * @lfd  local dup of the dumped task's fd; re-opened via /proc/self/fd so the
 *       mmap() in get_build_id() gets its own offset-independent descriptor
 * @p    fd parameters; only p->stat is consulted here
 */
static int store_validation_data_build_id(RegFileEntry *rfe, int lfd, const struct fd_parms *p)
{
	unsigned char *build_id = NULL;
	int build_id_size, allocated_size;
	int fd;

	/*
	 * Checks whether the file is at least big enough to try and read the first
	 * four (SELFMAG) bytes which should correspond to the ELF magic number
	 * and the next byte which indicates whether the file is 32-bit or 64-bit.
	 */
	if (p->stat.st_size < SELFMAG + 1)
		return 0;

	fd = open_proc(PROC_SELF, "fd/%d", lfd);
	if (fd < 0) {
		pr_err("Build-ID (For validation) could not be obtained for file %s because can't open the file\n",
		       rfe->name);
		return -1;
	}

	build_id_size = get_build_id(fd, &(p->stat), &build_id);
	close(fd);
	/* Failure to extract a build-id is not fatal — caller falls back to size-only. */
	if (!build_id || build_id_size == -1)
		return 0;

	/*
	 * The build-id is stored in the image as an array of uint32_t, so the
	 * byte count is rounded up and the tail is zero-padded (xzalloc).
	 */
	allocated_size = round_up(build_id_size, sizeof(uint32_t));
	rfe->build_id = xzalloc(allocated_size);
	if (!rfe->build_id) {
		pr_warn("Build-ID (For validation) could not be set for file %s\n", rfe->name);
		xfree(build_id);
		return -1;
	}

	rfe->n_build_id = allocated_size / sizeof(uint32_t);
	memcpy(rfe->build_id, (void *)build_id, build_id_size);
	xfree(build_id);
	return 1;
}

/*
 * This routine stores metadata about the open file (File size, build-id, CRC32C checksum)
 * so that validation can be done while restoring to make sure that the right file is
 * being restored.
 * Returns true if at least some metadata was stored, if there was an error it returns false.
*/ static bool store_validation_data(RegFileEntry *rfe, const struct fd_parms *p, int lfd) { int result = 1; rfe->has_size = true; rfe->size = p->stat.st_size; if (opts.file_validation_method == FILE_VALIDATION_BUILD_ID) result = store_validation_data_build_id(rfe, lfd, p); if (result == -1) return false; if (!result) pr_info("Only file size could be stored for validation for file %s\n", rfe->name); return true; } int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) { struct fd_link _link, *link; struct mount_info *mi; struct cr_img *rimg; char ext_id[64]; int ret; FileEntry fe = FILE_ENTRY__INIT; RegFileEntry rfe = REG_FILE_ENTRY__INIT; bool skip_for_shell_job = false; if (!p->link) { if (fill_fdlink(lfd, p, &_link)) return -1; link = &_link; } else link = p->link; snprintf(ext_id, sizeof(ext_id), "file[%x:%" PRIx64 "]", p->mnt_id, p->stat.st_ino); if (external_lookup_id(ext_id)) { /* the first symbol will be cut on restore to get an relative path*/ rfe.name = xstrdup(ext_id); rfe.ext = true; rfe.has_ext = true; goto ext; } mi = lookup_mnt_id(p->mnt_id); if (mi == NULL) { // workaround // https://github.com/checkpoint-restore/criu/issues/860 // https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1857257 // https://www.mail-archive.com/kernel-packages@lists.launchpad.net/msg447662.html pr_warn("Can't lookup mount=%d for fd=%d path=%s\n", p->mnt_id, p->fd, link->name + 1); } if (mi && mnt_is_overmounted(mi)) { if (opts.shell_job && is_tty(p->stat.st_rdev, p->stat.st_dev)) { skip_for_shell_job = true; } else { pr_err("Can't lookup mount=%d for fd=%d path=%s\n", p->mnt_id, p->fd, link->name + 1); return -1; } } if (!skip_for_shell_job && mnt_is_overmounted(mi)) { pr_err("Open files on overmounted mounts are not supported yet; mount=%d fd=%d path=%s\n", p->mnt_id, p->fd, link->name + 1); return -1; } if (p->mnt_id >= 0 && (root_ns_mask & CLONE_NEWNS)) { rfe.mnt_id = p->mnt_id; rfe.has_mnt_id = true; } pr_info("Dumping path for %d fd via self %d [%s]\n", 
p->fd, lfd, &link->name[1]); /* * The regular path we can handle should start with slash. */ if (link->name[1] != '/') { pr_err("The path [%s] is not supported\n", &link->name[1]); return -1; } if (mi && !skip_for_shell_job && check_path_remap(link, p, lfd, id, mi->nsid)) return -1; rfe.name = &link->name[1]; ext: rfe.id = id; rfe.flags = p->flags; rfe.pos = p->pos; rfe.fown = (FownEntry *)&p->fown; rfe.has_mode = true; rfe.mode = p->stat.st_mode; if (S_ISREG(p->stat.st_mode) && should_check_size(rfe.flags) && !store_validation_data(&rfe, p, lfd)) return -1; fe.type = FD_TYPES__REG; fe.id = rfe.id; fe.reg = &rfe; rimg = img_from_set(glob_imgset, CR_FD_FILES); ret = pb_write_one(rimg, &fe, PB_FILE); if (rfe.build_id) xfree(rfe.build_id); return ret; } const struct fdtype_ops regfile_dump_ops = { .type = FD_TYPES__REG, .dump = dump_one_reg_file, }; static void convert_path_from_another_mp(char *src, char *dst, int dlen, struct mount_info *smi, struct mount_info *dmi) { int off; /* * mi->mountpoint ./foo/bar * mi->ns_mountpoint /foo/bar * rfi->path foo/bar/baz */ off = strlen(smi->ns_mountpoint + 1); BUG_ON(strlen(smi->root) < strlen(dmi->root)); /* * Create paths relative to this mount. * Absolute path to the mount point + difference between source * and destination roots + path relative to the mountpoint. 
*/ snprintf(dst, dlen, "./%s/%s/%s", dmi->ns_mountpoint + 1, smi->root + strlen(dmi->root), src + off); } static int linkat_hard(int odir, char *opath, int ndir, char *npath, uid_t uid, gid_t gid, int flags) { struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3]; struct __user_cap_header_struct hdr; int ret, old_fsuid = -1, old_fsgid = -1; int errno_save; ret = linkat(odir, opath, ndir, npath, flags); if (ret == 0) return 0; if (!((errno == EPERM || errno == EOVERFLOW) && (root_ns_mask & CLONE_NEWUSER))) { errno_save = errno; pr_warn("Can't link %s -> %s\n", opath, npath); errno = errno_save; return ret; } /* * Kernel before 4.3 has strange security restrictions about * linkat. If the fsuid of the caller doesn't equals * the uid of the file and the file is not "safe" * one, then only global CAP_CHOWN will be allowed * to link(). * * Next, when we're in user namespace we're ns root, * but not global CAP_CHOWN. Thus, even though we * ARE ns root, we will not be allowed to link() at * files that belong to regular users %) * * Fortunately, the setfsuid() requires ns-level * CAP_SETUID which we have. * * Starting with 4.8 the kernel doesn't allow to create inodes * with a uid or gid unknown to an user namespace. 
* 036d523641c66 ("vfs: Don't create inodes with a uid or gid unknown to the vfs") */ old_fsuid = setfsuid(uid); old_fsgid = setfsgid(gid); /* AT_EMPTY_PATH requires CAP_DAC_READ_SEARCH */ if (flags & AT_EMPTY_PATH) { hdr.version = _LINUX_CAPABILITY_VERSION_3; hdr.pid = 0; if (capget(&hdr, data) < 0) { errno_save = errno; pr_perror("capget"); goto out; } data[0].effective = data[0].permitted; data[1].effective = data[1].permitted; if (capset(&hdr, data) < 0) { errno_save = errno; pr_perror("capset"); goto out; } } ret = linkat(odir, opath, ndir, npath, flags); errno_save = errno; if (ret < 0) pr_perror("Can't link %s -> %s", opath, npath); out: setfsuid(old_fsuid); setfsgid(old_fsgid); if (setfsuid(-1) != old_fsuid) { pr_warn("Failed to restore old fsuid!\n"); /* * Don't fail here. We still have chances to run till * the pie/restorer, and if _this_ guy fails to set * the proper fsuid, then we'll abort the restore. */ } /* * Restoring PR_SET_DUMPABLE flag is required after setfsuid, * as if it not set, proc inode will be created with root cred * (see proc_pid_make_inode), which will result in permission * check fail when trying to access files in /proc/self/ */ prctl(PR_SET_DUMPABLE, 1, 0); errno = errno_save; return ret; } int rm_parent_dirs(int mntns_root, char *path, int count) { char *p, *prev = NULL; int ret = -1; while (count-- > 0) { p = strrchr(path, '/'); if (p) { /* We don't handle "//" in path */ BUG_ON(prev && (prev - p == 1)); *p = '\0'; } else { /* Inconsistent path and count */ pr_perror("Can't strrchr \"/\" in \"%s\"/\"%s\"]" " left count=%d\n", path, prev ? 
prev + 1 : "", count + 1); goto err; } if (prev) *prev = '/'; prev = p; if (unlinkat(mntns_root, path, AT_REMOVEDIR)) { pr_perror("Can't remove %s AT %d", path, mntns_root); goto err; } pr_debug("Unlinked parent dir: %s AT %d\n", path, mntns_root); } ret = 0; err: if (prev) *prev = '/'; return ret; } /* Construct parent dir name and mkdir parent/grandparents if they're not exist */ int make_parent_dirs_if_need(int mntns_root, char *path) { char *p, *last_delim; int err, count = 0; struct stat st; p = last_delim = strrchr(path, '/'); if (!p) return 0; *p = '\0'; if (fstatat(mntns_root, path, &st, AT_EMPTY_PATH) == 0) goto out; if (errno != ENOENT) { pr_perror("Can't stat %s", path); count = -1; goto out; } p = path; do { p = strchr(p, '/'); if (p) *p = '\0'; err = mkdirat(mntns_root, path, 0777); if (err && errno != EEXIST) { pr_perror("Can't create dir: %s AT %d", path, mntns_root); /* Failing anyway -> no retcode check */ rm_parent_dirs(mntns_root, path, count); count = -1; goto out; } else if (!err) { pr_debug("Created parent dir: %s AT %d\n", path, mntns_root); count++; } if (p) *p++ = '/'; } while (p); out: *last_delim = '/'; return count; } /* * This routine properly resolves d's path handling ghost/link-remaps. * The open_cb is a routine that does actual open, it differs for * files, directories, fifos, etc. * * Return 0 on success, -1 on error and 1 to indicate soft error, which can be * retried. 
 */
/*
 * Hard-link the remap source (rfi->remap->rpath) to the file's original
 * path (rfi->path), creating missing parent directories on the way.
 *
 * @level  out: number of parent directories created by
 *         make_parent_dirs_if_need(), so the caller can undo them later
 *         with rm_parent_dirs().
 *
 * Returns 0 on success, -1 on error, and 1 as a soft error (the target
 * name already exists — EEXIST — and the caller may retry under a
 * different name).
 */
static int rfi_remap(struct reg_file_info *rfi, int *level)
{
	struct mount_info *mi, *rmi, *tmi;
	char _path[PATH_MAX], *path = _path;
	char _rpath[PATH_MAX], *rpath = _rpath;
	int mntns_root;

	if (rfi->rfe->mnt_id == -1) {
		/* Know nothing about mountpoints */
		mntns_root = mntns_get_root_by_mnt_id(-1);
		path = rfi->path;
		rpath = rfi->remap->rpath;
		goto out_root;
	}

	mi = lookup_mnt_id(rfi->rfe->mnt_id);
	if (mi == NULL)
		return -1;

	if (rfi->rfe->mnt_id == rfi->remap->rmnt_id) {
		/* Both links on the same mount point */
		tmi = mi;
		path = rfi->path;
		rpath = rfi->remap->rpath;
		goto out;
	}

	rmi = lookup_mnt_id(rfi->remap->rmnt_id);
	if (rmi == NULL)
		return -1;

	/*
	 * Find the common bind-mount. We know that one mount point was
	 * really mounted and all other were bind-mounted from it, so the
	 * lowest mount must contains all bind-mounts.
	 */
	for (tmi = mi; tmi->bind; tmi = tmi->bind)
		;

	/* All three mounts must sit on the same superblock (same device). */
	BUG_ON(tmi->s_dev != rmi->s_dev);
	BUG_ON(tmi->s_dev != mi->s_dev);

	/* Calculate paths on the device (root mount) */
	convert_path_from_another_mp(rfi->path, path, sizeof(_path), mi, tmi);
	convert_path_from_another_mp(rfi->remap->rpath, rpath, sizeof(_rpath), rmi, tmi);

out:
	mntns_root = mntns_get_root_fd(tmi->nsid);

	/* We get here while in task's mntns */
	if (try_remount_writable(tmi, true))
		return -1;

	pr_debug("%d: Link %s -> %s\n", tmi->mnt_id, rpath, path);
out_root:
	*level = make_parent_dirs_if_need(mntns_root, path);
	if (*level < 0)
		return -1;

	if (linkat_hard(mntns_root, rpath, mntns_root, path, rfi->remap->uid, rfi->remap->gid, 0) < 0) {
		/* rm_parent_dirs() may clobber errno, so latch it first. */
		int errno_saved = errno;

		if (!rm_parent_dirs(mntns_root, path, *level) && errno_saved == EEXIST) {
			/* Soft error: name collision, caller can retry. */
			errno = errno_saved;
			return 1;
		}
		return -1;
	}

	return 0;
}

/*
 * Compares the file's build-id with the stored value.
 * Returns 1 if the build-id of the file matches the build-id that was stored
 * while dumping, -1 if there is a mismatch or 0 if the build-id has not been
 * stored or could not be obtained.
 */
static int validate_with_build_id(const int fd, const struct stat *fd_status, const struct reg_file_info *rfi)
{
	unsigned char *build_id;
	int build_id_size;

	/* No size stored at all — nothing was recorded at dump time, accept. */
	if (!rfi->rfe->has_size)
		return 1;

	/* Size was stored but no build-id — fall back to size-only validation. */
	if (!rfi->rfe->n_build_id)
		return 0;

	build_id = NULL;
	build_id_size = get_build_id(fd, fd_status, &build_id);
	/* Can't extract a build-id from the file on disk: not a mismatch per se. */
	if (!build_id || build_id_size == -1)
		return 0;

	/*
	 * The stored build-id was rounded up to whole uint32_t words at dump
	 * time, so compare the rounded length, not the raw byte count.
	 */
	if (round_up(build_id_size, sizeof(uint32_t)) != rfi->rfe->n_build_id * sizeof(uint32_t)) {
		pr_err("File %s has bad build-ID length %d (expect %d)\n", rfi->path,
		       round_up(build_id_size, sizeof(uint32_t)), (int)(rfi->rfe->n_build_id * sizeof(uint32_t)));
		xfree(build_id);
		return -1;
	}

	/* Lengths agree — compare only the real build_id_size bytes (tail is padding). */
	if (memcmp(build_id, rfi->rfe->build_id, build_id_size)) {
		pr_err("File %s has bad build-ID\n", rfi->path);
		xfree(build_id);
		return -1;
	}

	xfree(build_id);
	return 1;
}

/*
 * This function determines whether it was the same file that was open during dump
 * by checking the file's size, build-id and/or checksum with the same metadata
 * that was stored before dumping.
 * Checksum is calculated with CRC32C.
 * Returns true if the metadata of the file matches the metadata stored while
 * dumping else returns false.
*/ static bool validate_file(const int fd, const struct stat *fd_status, const struct reg_file_info *rfi) { int result = 1; if (rfi->rfe->has_size && (fd_status->st_size != rfi->rfe->size)) { pr_err("File %s has bad size %" PRIu64 " (expect %" PRIu64 ")\n", rfi->path, fd_status->st_size, rfi->rfe->size); return false; } if (opts.file_validation_method == FILE_VALIDATION_BUILD_ID) result = validate_with_build_id(fd, fd_status, rfi); if (result == -1) return false; if (!result) pr_info("File %s could only be validated with file size\n", rfi->path); return true; } int open_path(struct file_desc *d, int (*open_cb)(int mntns_root, struct reg_file_info *, void *), void *arg) { int tmp = -1, mntns_root, level = 0; struct reg_file_info *rfi; char *orig_path = NULL; char path[PATH_MAX]; int inh_fd = -1; int ret; if (inherited_fd(d, &tmp)) return tmp; rfi = container_of(d, struct reg_file_info, d); if (rfi->rfe->ext) { tmp = inherit_fd_lookup_id(rfi->rfe->name); if (tmp >= 0) { inh_fd = tmp; /* * PROC_SELF isn't used, because only service * descriptors can be used here. */ mntns_root = open_pid_proc(getpid()); snprintf(path, sizeof(path), "fd/%d", tmp); orig_path = rfi->path; rfi->path = path; goto ext; } } if (rfi->remap) { if (fault_injected(FI_RESTORE_OPEN_LINK_REMAP)) { pr_info("fault: Open link-remap failure!\n"); kill(getpid(), SIGKILL); } mutex_lock(remap_open_lock); if (rfi->remap->is_dir) { /* * FIXME Can't make directory under new name. * Will have to open it under the ghost one :( */ orig_path = rfi->path; rfi->path = rfi->remap->rpath; } else if ((ret = rfi_remap(rfi, &level)) == 1) { static char tmp_path[PATH_MAX]; /* * The file whose name we're trying to create * exists. Need to pick some other one, we're * going to remove it anyway. * * Strictly speaking, this is cheating, file * name shouldn't change. But since NFS with * its silly-rename doesn't care, why should we? 
*/ orig_path = rfi->path; rfi->path = tmp_path; snprintf(tmp_path, sizeof(tmp_path), "%s.cr_link", orig_path); pr_debug("Fake %s -> %s link\n", rfi->remap->rpath, rfi->path); if (rfi_remap(rfi, &level)) { pr_perror("Can't create even fake link!"); goto err; } } else if (ret < 0) { pr_perror("Can't link %s -> %s", rfi->remap->rpath, rfi->path); goto err; } } mntns_root = mntns_get_root_by_mnt_id(rfi->rfe->mnt_id); ext: tmp = open_cb(mntns_root, rfi, arg); if (tmp < 0) { pr_perror("Can't open file %s", rfi->path); close_safe(&inh_fd); goto err; } close_safe(&inh_fd); if ((rfi->rfe->has_size || rfi->rfe->has_mode) && !rfi->size_mode_checked) { struct stat st; if (fstat(tmp, &st) < 0) { pr_perror("Can't fstat opened file"); goto err; } if (!validate_file(tmp, &st, rfi)) goto err; if (rfi->rfe->has_mode) { mode_t curr_mode = st.st_mode; mode_t saved_mode = rfi->rfe->mode; if (opts.skip_file_rwx_check) { curr_mode &= ~(S_IRWXU | S_IRWXG | S_IRWXO); saved_mode &= ~(S_IRWXU | S_IRWXG | S_IRWXO); } if (curr_mode != saved_mode) { pr_err("File %s has bad mode 0%o (expect 0%o)\n" "File r/w/x checks can be skipped with the --skip-file-rwx-check option\n", rfi->path, (int)curr_mode, saved_mode); goto err; } } /* * This is only visible in the current process, so * change w/o locks. Other tasks sharing the same * file will get one via unix sockets. 
*/ rfi->size_mode_checked = true; } if (rfi->remap) { if (!rfi->remap->is_dir) { struct mount_info *mi = lookup_mnt_id(rfi->rfe->mnt_id); if (mi && try_remount_writable(mi, true)) goto err; pr_debug("Unlink: %d:%s\n", rfi->rfe->mnt_id, rfi->path); if (unlinkat(mntns_root, rfi->path, 0)) { pr_perror("Failed to unlink the remap file"); goto err; } if (rm_parent_dirs(mntns_root, rfi->path, level)) goto err; } mutex_unlock(remap_open_lock); } if (orig_path) rfi->path = orig_path; if (restore_fown(tmp, rfi->rfe->fown)) { close(tmp); return -1; } return tmp; err: if (rfi->remap) mutex_unlock(remap_open_lock); close_safe(&tmp); return -1; } int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg) { u32 flags = *(u32 *)arg; int fd; /* unnamed temporary files are restored as ghost files */ flags &= ~O_TMPFILE; fd = openat(ns_root_fd, rfi->path, flags); if (fd < 0) { pr_perror("Can't open file %s on restore", rfi->path); return fd; } return fd; } static int do_open_reg_noseek(int ns_root_fd, struct reg_file_info *rfi, void *arg) { return do_open_reg_noseek_flags(ns_root_fd, rfi, &rfi->rfe->flags); } static int do_open_reg(int ns_root_fd, struct reg_file_info *rfi, void *arg) { int fd; fd = do_open_reg_noseek(ns_root_fd, rfi, arg); if (fd < 0) return fd; /* * O_PATH opened files carry empty fops in kernel, * just ignore positioning at all. */ if (!(rfi->rfe->flags & O_PATH)) { if (rfi->rfe->pos != -1ULL && lseek(fd, rfi->rfe->pos, SEEK_SET) < 0) { pr_perror("Can't restore file pos"); close(fd); return -1; } } return fd; } int open_reg_fd(struct file_desc *fd) { return open_path(fd, do_open_reg_noseek, NULL); } int open_reg_by_id(u32 id) { struct file_desc *fd; /* * This one gets called by exe link, chroot and cwd * restoring code. No need in calling lseek on either * of them. 
*/ fd = find_file_desc_raw(FD_TYPES__REG, id); if (fd == NULL) { pr_err("Can't find regfile for %#x\n", id); return -1; } return open_reg_fd(fd); } struct filemap_ctx { u32 flags; struct file_desc *desc; int fd; /* * Whether or not to close the fd when we're about to * put a new one into ctx. * * True is used by premap, so that it just calls vm_open * in sequence, immediately mmap()s the file, then it * can be closed. * * False is used by open_vmas() which pre-opens the files * for restorer, and the latter mmap()s them and closes. * * ... */ bool close; /* ... * * but closing all vmas won't work, as some of them share * the descriptor, so only the ones that terminate the * fd-sharing chain are marked with VMA_CLOSE flag, saying * restorer to close the vma's fd. * * Said that, this vma pointer references the previously * seen vma, so that once fd changes, this one gets the * closing flag. */ struct vma_area *vma; }; static struct filemap_ctx ctx; void filemap_ctx_init(bool auto_close) { ctx.desc = NULL; /* to fail the first comparison in open_ */ ctx.fd = -1; /* not to close random fd in _fini */ ctx.vma = NULL; /* not to put spurious VMA_CLOSE in _fini */ /* flags may remain any */ ctx.close = auto_close; } void filemap_ctx_fini(void) { if (ctx.close) { if (ctx.fd >= 0) close(ctx.fd); } else { if (ctx.vma) ctx.vma->e->status |= VMA_CLOSE; } } static int open_filemap(int pid, struct vma_area *vma) { u32 flags; int ret; int plugin_fd = -1; /* * The vma->fd should have been assigned in collect_filemap * * We open file w/o lseek, as mappings don't care about it */ BUG_ON((vma->vmfd == NULL) || !vma->e->has_fdflags); flags = vma->e->fdflags; /* update the new device file page offsets and file paths set during restore */ if (vma->e->status & VMA_EXT_PLUGIN) { uint64_t new_pgoff; int ret; struct reg_file_info *rfi = container_of(vma->vmfd, struct reg_file_info, d); ret = run_plugins(UPDATE_VMA_MAP, rfi->rfe->name, vma->e->start, vma->e->pgoff, &new_pgoff, &plugin_fd); if 
(ret == 1) { pr_info("New mmap %#016" PRIx64 ":%#016" PRIx64 "->%#016" PRIx64 " fd %d\n", vma->e->start, vma->e->pgoff, new_pgoff, plugin_fd); vma->e->pgoff = new_pgoff; } /* Device plugin will restore vma contents, so no need for write permission */ vma->e->status |= VMA_NO_PROT_WRITE; } if (ctx.flags != flags || ctx.desc != vma->vmfd) { if (plugin_fd >= 0) { /* * Vma handled by device plugin. * Some device drivers (e.g DRM) only allow the file descriptor that was used to create vma to * be used when calling mmap. In this case, use the FD returned by plugin. FD can be copied * using dup because dup returns a reference to the same struct file inside kernel, but we * cannot open a new FD. */ ret = plugin_fd; } else if (vma->e->status & VMA_AREA_MEMFD) { if (!inherited_fd(vma->vmfd, &ret)) ret = memfd_open(vma->vmfd, &flags, true); } else { ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); } if (ret < 0) return ret; filemap_ctx_fini(); ctx.flags = flags; ctx.desc = vma->vmfd; ctx.fd = ret; } ctx.vma = vma; vma->e->fd = ctx.fd; return 0; } int collect_filemap(struct vma_area *vma) { struct file_desc *fd; if (!vma->e->has_fdflags) { /* Make a wild guess for the fdflags */ vma->e->has_fdflags = true; if ((vma->e->prot & PROT_WRITE) && vma_area_is(vma, VMA_FILE_SHARED)) vma->e->fdflags = O_RDWR; else vma->e->fdflags = O_RDONLY; } if (vma->e->status & VMA_AREA_MEMFD) fd = collect_memfd(vma->e->shmid); else fd = collect_special_file(vma->e->shmid); if (!fd) return -1; vma->vmfd = fd; vma->vm_open = open_filemap; return 0; } static int open_fe_fd(struct file_desc *fd, int *new_fd) { int tmp; tmp = open_path(fd, do_open_reg, NULL); if (tmp < 0) return -1; *new_fd = tmp; return 0; } static char *reg_file_path(struct file_desc *d, char *buf, size_t s) { struct reg_file_info *rfi; rfi = container_of(d, struct reg_file_info, d); return rfi->path; } static struct file_desc_ops reg_desc_ops = { .type = FD_TYPES__REG, .open = open_fe_fd, .name = reg_file_path, }; struct 
file_desc *try_collect_special_file(u32 id, int optional) { struct file_desc *fdesc; /* * Files dumped for vmas/exe links can have remaps * configured. Need to bump-up users for them, otherwise * the open_path() would unlink the remap file after * the very first open. */ fdesc = find_file_desc_raw(FD_TYPES__REG, id); if (fdesc == NULL) { if (!optional) pr_err("No entry for reg-file-ID %#x\n", id); return NULL; } return fdesc; } static int collect_one_regfile(void *o, ProtobufCMessage *base, struct cr_img *i) { struct reg_file_info *rfi = o; static char dot[] = "."; rfi->rfe = pb_msg(base, RegFileEntry); /* change "/foo" into "foo" and "/" into "." */ if (rfi->rfe->name[1] == '\0') rfi->path = dot; else rfi->path = rfi->rfe->name + 1; rfi->remap = NULL; rfi->size_mode_checked = false; pr_info("Collected [%s] ID %#x\n", rfi->path, rfi->rfe->id); return file_desc_add(&rfi->d, rfi->rfe->id, ®_desc_ops); } struct collect_image_info reg_file_cinfo = { .fd_type = CR_FD_REG_FILES, .pb_type = PB_REG_FILE, .priv_size = sizeof(struct reg_file_info), .collect = collect_one_regfile, .flags = COLLECT_SHARED, }; int collect_remaps_and_regfiles(void) { if (!files_collected() && collect_image(®_file_cinfo)) return -1; if (collect_image(&remap_cinfo)) return -1; return 0; } crac-criu-1.5.0/criu/files.c000066400000000000000000001215361471504326700156160ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "types.h" #include "files.h" #include "file-ids.h" #include "files-reg.h" #include "file-lock.h" #include "image.h" #include "common/list.h" #include "rst-malloc.h" #include "util-caps.h" #include "common/lock.h" #include "sockets.h" #include "pstree.h" #include "tty.h" #include "pipes.h" #include "fifo.h" #include "eventfd.h" #include "eventpoll.h" #include "fsnotify.h" #include "sk-packet.h" #include "mount.h" #include "signalfd.h" #include "memfd.h" #include "namespaces.h" #include "tun.h" 
#include "timerfd.h" #include "imgset.h" #include "fs-magic.h" #include "fdinfo.h" #include "cr_options.h" #include "autofs.h" #include "parasite.h" #include "parasite-syscall.h" #include "string.h" #include "kerndat.h" #include "fdstore.h" #include "bpfmap.h" #include "protobuf.h" #include "util.h" #include "images/fs.pb-c.h" #include "images/ext-file.pb-c.h" #include "plugin.h" #define FDESC_HASH_SIZE 64 static struct hlist_head file_desc_hash[FDESC_HASH_SIZE]; /* file_desc's, which fle is not owned by a process, that is able to open them */ static LIST_HEAD(fake_master_head); static u32 max_file_desc_id = 0; static void init_fdesc_hash(void) { int i; for (i = 0; i < FDESC_HASH_SIZE; i++) INIT_HLIST_HEAD(&file_desc_hash[i]); } void file_desc_init(struct file_desc *d, u32 id, struct file_desc_ops *ops) { INIT_LIST_HEAD(&d->fd_info_head); INIT_LIST_HEAD(&d->fake_master_list); INIT_HLIST_NODE(&d->hash); d->id = id; d->ops = ops; d->fds_inherited = FDIH_UNKNOWN; } int file_desc_add(struct file_desc *d, u32 id, struct file_desc_ops *ops) { file_desc_init(d, id, ops); hlist_add_head(&d->hash, &file_desc_hash[id % FDESC_HASH_SIZE]); if (id > max_file_desc_id) max_file_desc_id = id; return 0; /* this is to make tail-calls in collect_one_foo look nice */ } struct file_desc *find_file_desc_raw(int type, u32 id) { struct file_desc *d; struct hlist_head *chain; chain = &file_desc_hash[id % FDESC_HASH_SIZE]; hlist_for_each_entry(d, chain, hash) if ((d->id == id) && (d->ops->type == type || type == FD_TYPES__UND)) /* * Warning -- old CRIU might generate matching IDs * for different file types! So any code that uses * FD_TYPES__UND for fdesc search MUST make sure it's * dealing with the merged files images where all * descs are forced to have different IDs. 
*/ return d; return NULL; } static inline struct file_desc *find_file_desc(FdinfoEntry *fe) { return find_file_desc_raw(fe->type, fe->id); } u32 find_unused_file_desc_id(void) { return max_file_desc_id + 1; } struct fdinfo_list_entry *find_used_fd(struct pstree_item *task, int fd) { struct list_head *head; struct fdinfo_list_entry *fle; head = &rsti(task)->fds; list_for_each_entry_reverse(fle, head, ps_list) { if (fle->fe->fd == fd) return fle; /* List is ordered, so let's stop */ if (fle->fe->fd < fd) break; } return NULL; } static void collect_task_fd(struct fdinfo_list_entry *new_fle, struct rst_info *ri) { struct fdinfo_list_entry *fle; /* * fles in fds list are ordered by fd. Fds are restored from img files * in ascending order, so it is faster to insert them from the end of * the list. */ list_for_each_entry_reverse(fle, &ri->fds, ps_list) { if (fle->fe->fd < new_fle->fe->fd) break; } list_add(&new_fle->ps_list, &fle->ps_list); } unsigned int find_unused_fd(struct pstree_item *task, int hint_fd) { struct list_head *head; struct fdinfo_list_entry *fle; int fd = 0, prev_fd; if ((hint_fd >= 0) && (!find_used_fd(task, hint_fd))) { fd = hint_fd; goto out; } prev_fd = service_fd_min_fd(task) - 1; head = &rsti(task)->fds; list_for_each_entry_reverse(fle, head, ps_list) { fd = fle->fe->fd; if (prev_fd > fd) { fd++; goto out; } prev_fd = fd - 1; } BUG(); out: return fd; } int find_unused_fd_pid(pid_t pid) { struct pstree_item *task; task = pstree_item_by_virt(pid); if (!task) { pr_err("Invalid pid:%d\n", pid); return -1; } return find_unused_fd(task, -1); } int set_fds_event(pid_t virt) { struct pstree_item *item; bool is_set; item = pstree_item_by_virt(virt); BUG_ON(!item); is_set = !!test_and_set_bit_le(FDS_EVENT_BIT, &item->task_st_le_bits); if (!is_set) futex_wake(&item->task_st); return 0; } void clear_fds_event(void) { clear_bit_le(FDS_EVENT_BIT, ¤t->task_st_le_bits); } void wait_fds_event(void) { futex_t *f = ¤t->task_st; int value; value = htole32(FDS_EVENT); 
futex_wait_if_cond(f, value, &); clear_fds_event(); } struct fdinfo_list_entry *try_file_master(struct file_desc *d) { if (list_empty(&d->fd_info_head)) return NULL; return list_first_entry(&d->fd_info_head, struct fdinfo_list_entry, desc_list); } struct fdinfo_list_entry *file_master(struct file_desc *d) { struct fdinfo_list_entry *fle; fle = try_file_master(d); if (!fle) { pr_err("Empty list on file desc id %#x(%d)\n", d->id, d->ops ? d->ops->type : -1); BUG(); } return fle; } void show_saved_files(void) { int i; struct file_desc *fd; pr_info("File descs:\n"); for (i = 0; i < FDESC_HASH_SIZE; i++) hlist_for_each_entry(fd, &file_desc_hash[i], hash) { struct fdinfo_list_entry *le; pr_info(" `- type %d ID %#x\n", fd->ops->type, fd->id); list_for_each_entry(le, &fd->fd_info_head, desc_list) pr_info(" `- FD %d pid %d\n", le->fe->fd, le->pid); } } /* * Workaround for the OverlayFS bug present before Kernel 4.2 * * This is here only to support the Linux Kernel between versions * 3.18 and 4.2. After that, this workaround is not needed anymore, * but it will work properly on both a kernel with and without the bug. * * When a process has a file open in an OverlayFS directory, * the information in /proc//fd/ and /proc//fdinfo/ * is wrong. We can't even rely on stat()-ing /proc//fd/ since * this will show us the wrong filesystem type. * * So we grab that information from the mountinfo table instead. This is done * every time fill_fdlink is called. See lookup_overlayfs for more details. * */ static int fixup_overlayfs(struct fd_parms *p, struct fd_link *link) { struct mount_info *m; if (!link) return 0; m = lookup_overlayfs(link->name, p->stat.st_dev, p->stat.st_ino, p->mnt_id); if (IS_ERR(m)) return -1; if (!m) return 0; p->mnt_id = m->mnt_id; /* * If the bug is present, the file path from /proc//fd * does not include the mountpoint, so we prepend it ourselves. 
*/ if (strcmp("./", m->ns_mountpoint) != 0) { char buf[PATH_MAX]; int n; __strlcpy(buf, link->name, PATH_MAX); n = snprintf(link->name, PATH_MAX, "%s/%s", m->ns_mountpoint, buf + 2); if (n >= PATH_MAX) { pr_err("Not enough space to replace %s\n", buf); return -1; } } return 0; } /* * The gen_id thing is used to optimize the comparison of shared files. * If two files have different gen_ids, then they are different for sure. * If it matches, we don't know it and have to call sys_kcmp(). * * The kcmp-ids.c engine does this trick, see comments in it for more info. */ uint32_t make_gen_id(uint32_t st_dev, uint32_t st_ino, uint64_t pos) { uint32_t pos_hi = pos >> 32; uint32_t pos_low = pos & 0xffffffff; return st_dev ^ st_ino ^ pos_hi ^ pos_low; } int do_dump_gen_file(struct fd_parms *p, int lfd, const struct fdtype_ops *ops, FdinfoEntry *e) { int ret = -1; e->type = ops->type; e->id = make_gen_id((uint32_t)p->stat.st_dev, (uint32_t)p->stat.st_ino, (uint64_t)p->pos); e->fd = p->fd; e->flags = p->fd_flags; ret = fd_id_generate(p->pid, e, p); if (ret == 1) /* new ID generated */ ret = ops->dump(lfd, e->id, p); else /* Remove locks generated by the fd before going to the next */ discard_dup_locks_tail(p->pid, e->fd); return ret; } int fill_fdlink(int lfd, const struct fd_parms *p, struct fd_link *link) { int len; link->name[0] = '.'; len = read_fd_link(lfd, &link->name[1], sizeof(link->name) - 1); if (len < 0) { pr_err("Can't read link for pid %d fd %d\n", p->pid, p->fd); return -1; } link->len = len + 1; if (opts.overlayfs) if (fixup_overlayfs((struct fd_parms *)p, link) < 0) return -1; return 0; } static int fill_fd_params(struct pid *owner_pid, int fd, int lfd, struct fd_opts *opts, struct fd_parms *p) { int ret; struct statfs fsbuf; struct fdinfo_common fdinfo = { .mnt_id = -1, .owner = owner_pid->ns[0].virt }; if (fstat(lfd, &p->stat) < 0) { pr_perror("Can't stat fd %d", lfd); return -1; } if (fstatfs(lfd, &fsbuf) < 0) { pr_perror("Can't statfs fd %d", lfd); return -1; 
} if (parse_fdinfo_pid(owner_pid->real, fd, FD_TYPES__UND, &fdinfo)) return -1; p->fs_type = fsbuf.f_type; p->fd = fd; p->pos = fdinfo.pos; /* * The kernel artificially adds the O_CLOEXEC flag on the file pointer * flags by looking at the flags on the file descriptor (see kernel * code fs/proc/fd.c). FD_CLOEXEC is a file descriptor property, which * is saved in fd_flags. */ p->flags = fdinfo.flags & ~O_CLOEXEC; p->mnt_id = fdinfo.mnt_id; p->pid = owner_pid->real; p->fd_flags = opts->flags; fown_entry__init(&p->fown); pr_info("%d fdinfo %d: pos: %#16" PRIx64 " flags: %16o/%#x\n", owner_pid->real, fd, p->pos, p->flags, (int)p->fd_flags); if (p->flags & O_PATH) ret = 0; else ret = fcntl(lfd, F_GETSIG, 0); if (ret < 0) { pr_perror("Can't get owner signum on %d", lfd); return -1; } p->fown.signum = ret; if (opts->fown.pid == 0) return 0; p->fown.pid = opts->fown.pid; p->fown.pid_type = opts->fown.pid_type; p->fown.uid = opts->fown.uid; p->fown.euid = opts->fown.euid; return 0; } static const struct fdtype_ops *get_misc_dev_ops(int minor) { switch (minor) { case TUN_MINOR: return &tunfile_dump_ops; case AUTOFS_MINOR: return ®file_dump_ops; }; return NULL; } static const struct fdtype_ops *get_mem_dev_ops(struct fd_parms *p, int minor) { const struct fdtype_ops *ops = NULL; /* * If /dev/kmsg is opened in write-only mode the file position * should not be set up upon restore, kernel doesn't allow that. 
*/ if (minor == 11 && (p->flags & O_ACCMODE) == O_WRONLY && p->pos == 0) p->pos = -1ULL; ops = ®file_dump_ops; return ops; } static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) { struct fd_link *link_old = p->link; int maj = major(p->stat.st_rdev); const struct fdtype_ops *ops; struct fd_link link; int err; switch (maj) { case MEM_MAJOR: ops = get_mem_dev_ops(p, minor(p->stat.st_rdev)); break; case MISC_MAJOR: ops = get_misc_dev_ops(minor(p->stat.st_rdev)); if (ops) break; /* fallthrough */ default: { char more[32]; if (is_tty(p->stat.st_rdev, p->stat.st_dev)) { if (fill_fdlink(lfd, p, &link)) return -1; p->link = &link; ops = &tty_dump_ops; break; } sprintf(more, "%d:%d", maj, minor(p->stat.st_rdev)); err = dump_unsupp_fd(p, lfd, "chr", more, e); p->link = link_old; return err; } } err = do_dump_gen_file(p, lfd, ops, e); p->link = link_old; return err; } static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, struct parasite_ctl *ctl, FdinfoEntry *e, struct parasite_drain_fd *dfds) { struct fd_parms p = FD_PARMS_INIT; const struct fdtype_ops *ops; struct fd_link link; if (fill_fd_params(pid, fd, lfd, opts, &p) < 0) { pr_err("Can't get stat on %d\n", fd); return -1; } if (note_file_lock(pid, fd, lfd, &p)) return -1; /* Lease can be set only on regular file */ if (S_ISREG(p.stat.st_mode)) { int ret = correct_file_leases_type(pid, fd, lfd); if (ret < 0) return ret; } p.fd_ctl = ctl; /* Some dump_opts require this to talk to parasite */ p.dfds = dfds; /* epoll needs to verify if target fd exist */ if (S_ISSOCK(p.stat.st_mode)) return dump_socket(&p, lfd, e); if (S_ISCHR(p.stat.st_mode)) return dump_chrdev(&p, lfd, e); if (p.fs_type == ANON_INODE_FS_MAGIC) { char link[32]; if (read_fd_link(lfd, link, sizeof(link)) < 0) return -1; if (is_eventfd_link(link)) ops = &eventfd_dump_ops; else if (is_eventpoll_link(link)) ops = &eventpoll_dump_ops; else if (is_inotify_link(link)) ops = &inotify_dump_ops; else if (is_fanotify_link(link)) 
ops = &fanotify_dump_ops; else if (is_signalfd_link(link)) ops = &signalfd_dump_ops; else if (is_timerfd_link(link)) ops = &timerfd_dump_ops; #ifdef CONFIG_HAS_LIBBPF else if (is_bpfmap_link(link)) ops = &bpfmap_dump_ops; #endif else return dump_unsupp_fd(&p, lfd, "anon", link, e); return do_dump_gen_file(&p, lfd, ops, e); } if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode) || S_ISLNK(p.stat.st_mode)) { if (fill_fdlink(lfd, &p, &link)) return -1; p.link = &link; /* TODO: Dump for hugetlb fd when memfd hugetlb is not supported */ if (is_memfd(p.stat.st_dev) || (kdat.has_memfd_hugetlb && is_hugetlb_dev(p.stat.st_dev, NULL))) ops = &memfd_dump_ops; else if (link.name[1] == '/') ops = ®file_dump_ops; else if (check_ns_proc(&link)) ops = &nsfile_dump_ops; else return dump_unsupp_fd(&p, lfd, "reg", link.name + 1, e); return do_dump_gen_file(&p, lfd, ops, e); } if (S_ISFIFO(p.stat.st_mode)) { if (p.fs_type == PIPEFS_MAGIC) ops = &pipe_dump_ops; else ops = &fifo_dump_ops; return do_dump_gen_file(&p, lfd, ops, e); } /* * For debug purpose -- at least show the link * file pointing to when reporting unsupported file. * On error simply empty string here. 
*/ if (fill_fdlink(lfd, &p, &link)) memzero(&link, sizeof(link)); return dump_unsupp_fd(&p, lfd, "unknown", link.name + 1, e); } int dump_my_file(int lfd, u32 *id, int *type) { struct pid me = {}; struct fd_opts fdo = {}; FdinfoEntry e = FDINFO_ENTRY__INIT; me.real = getpid(); me.ns[0].virt = -1; /* FIXME */ if (dump_one_file(&me, lfd, lfd, &fdo, NULL, &e, NULL)) return -1; *id = e.id; *type = e.type; return 0; } int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, struct parasite_drain_fd *dfds) { int *lfds = NULL; struct cr_img *img = NULL; struct fd_opts *opts = NULL; int i, ret = -1; int off, nr_fds = min((int)PARASITE_MAX_FDS, dfds->nr_fds); pr_info("\n"); pr_info("Dumping opened files (pid: %d)\n", item->pid->real); pr_info("----------------------------------------\n"); lfds = xmalloc(nr_fds * sizeof(int)); if (!lfds) goto err; opts = xmalloc(nr_fds * sizeof(struct fd_opts)); if (!opts) goto err; img = open_image(CR_FD_FDINFO, O_DUMP, item->ids->files_id); if (!img) goto err; ret = 0; /* Don't fail if nr_fds == 0 */ for (off = 0; ret == 0 && off < dfds->nr_fds; off += nr_fds) { if (nr_fds + off > dfds->nr_fds) nr_fds = dfds->nr_fds - off; ret = parasite_drain_fds_seized(ctl, dfds, nr_fds, off, lfds, opts); if (ret) goto err; for (i = 0; i < nr_fds; i++) { FdinfoEntry e = FDINFO_ENTRY__INIT; ret = dump_one_file(item->pid, dfds->fds[i + off], lfds[i], opts + i, ctl, &e, dfds); if (ret) break; ret = pb_write_one(img, &e, PB_FDINFO); if (ret) break; } for (i = 0; i < nr_fds; i++) close(lfds[i]); } pr_info("----------------------------------------\n"); err: if (img) close_image(img); xfree(opts); xfree(lfds); return ret; } static int predump_one_fd(int pid, int fd) { const struct fdtype_ops *ops; char link[PATH_MAX], t[32]; int ret = 0; snprintf(t, sizeof(t), "/proc/%d/fd/%d", pid, fd); ret = readlink(t, link, sizeof(link)); if (ret < 0) { pr_perror("Can't read link of fd %d", fd); return -1; } else if ((size_t)ret == sizeof(link)) { 
pr_err("Buffer for read link of fd %d is too small\n", fd); return -1; } link[ret] = 0; ret = 0; if (is_inotify_link(link)) ops = &inotify_dump_ops; else if (is_fanotify_link(link)) ops = &fanotify_dump_ops; else goto out; pr_debug("Pre-dumping %d's %d fd\n", pid, fd); ret = ops->pre_dump(pid, fd); out: return ret; } int predump_task_files(int pid) { struct dirent *de; DIR *fd_dir; int ret = -1; pr_info("Pre-dump fds for %d)\n", pid); fd_dir = opendir_proc(pid, "fd"); if (!fd_dir) return -1; while ((de = readdir(fd_dir))) { if (dir_dots(de)) continue; if (predump_one_fd(pid, atoi(de->d_name))) goto out; } ret = 0; out: closedir(fd_dir); return ret; } int restore_fown(int fd, FownEntry *fown) { struct f_owner_ex owner; uid_t uids[3]; if (fown->signum) { if (fcntl(fd, F_SETSIG, fown->signum)) { pr_perror("Can't set signal"); return -1; } } /* May be untouched */ if (!fown->pid) return 0; if (getresuid(&uids[0], &uids[1], &uids[2])) { pr_perror("Can't get current UIDs"); return -1; } if (setresuid(fown->uid, fown->euid, uids[2])) { pr_perror("Can't set UIDs"); return -1; } owner.type = fown->pid_type; owner.pid = fown->pid; if (fcntl(fd, F_SETOWN_EX, &owner)) { pr_perror("Can't setup %d file owner pid", fd); return -1; } if (setresuid(uids[0], uids[1], uids[2])) { pr_perror("Can't revert UIDs back"); return -1; } if (prctl(PR_SET_DUMPABLE, 1, 0)) pr_perror("Unable to set PR_SET_DUMPABLE"); return 0; } int rst_file_params(int fd, FownEntry *fown, int flags) { if (set_fd_flags(fd, flags) < 0) return -1; if (restore_fown(fd, fown) < 0) return -1; return 0; } static struct fdinfo_list_entry *alloc_fle(int pid, FdinfoEntry *fe) { struct fdinfo_list_entry *fle; fle = shmalloc(sizeof(*fle)); if (!fle) return NULL; fle->pid = pid; fle->fe = fe; fle->received = 0; fle->fake = 0; fle->stage = FLE_INITIALIZED; fle->task = pstree_item_by_virt(pid); if (!fle->task) { pr_err("Can't find task with pid %d\n", pid); shfree_last(fle); return NULL; } return fle; } static void 
__collect_desc_fle(struct fdinfo_list_entry *new_le, struct file_desc *fdesc) { struct fdinfo_list_entry *le; list_for_each_entry_reverse(le, &fdesc->fd_info_head, desc_list) if (pid_rst_prio_eq(le->pid, new_le->pid)) break; list_add(&new_le->desc_list, &le->desc_list); } static void collect_desc_fle(struct fdinfo_list_entry *new_le, struct file_desc *fdesc, bool force_master) { new_le->desc = fdesc; if (!force_master) __collect_desc_fle(new_le, fdesc); else { /* Link as first entry */ list_add(&new_le->desc_list, &fdesc->fd_info_head); } } struct fdinfo_list_entry *collect_fd_to(int pid, FdinfoEntry *e, struct rst_info *rst_info, struct file_desc *fdesc, bool fake, bool force_master) { struct fdinfo_list_entry *new_le; new_le = alloc_fle(pid, e); if (new_le) { new_le->fake = (!!fake); collect_desc_fle(new_le, fdesc, force_master); collect_task_fd(new_le, rst_info); } return new_le; } int collect_fd(int pid, FdinfoEntry *e, struct rst_info *rst_info, bool fake) { struct file_desc *fdesc; char inh_id[32]; pr_info("Collect fdinfo pid=%d fd=%d id=%#x\n", pid, e->fd, e->id); fdesc = find_file_desc(e); if (fdesc == NULL) { pr_err("No file for fd %d id %#x\n", e->fd, e->id); return -1; } if (!collect_fd_to(pid, e, rst_info, fdesc, fake, false)) return -1; if (inherit_fd_lookup_id(inh_id) < 0) { fdesc->fds_inherited = FDIH_UNINHERITED; } else if (fdesc->fds_inherited == FDIH_UNKNOWN) { fdesc->fds_inherited = FDIH_FROM_0 + e->fd; } return 0; } FdinfoEntry *dup_fdinfo(FdinfoEntry *old, int fd, unsigned flags) { FdinfoEntry *e; e = shmalloc(sizeof(*e)); if (!e) return NULL; fdinfo_entry__init(e); e->id = old->id; e->type = old->type; e->fd = fd; e->flags = flags; return e; } int dup_fle(struct pstree_item *task, struct fdinfo_list_entry *ple, int fd, unsigned flags) { FdinfoEntry *e; e = dup_fdinfo(ple->fe, fd, flags); if (!e) return -1; return collect_fd(vpid(task), e, rsti(task), false); } int prepare_fd_pid(struct pstree_item *item) { int ret = 0; struct cr_img *img; 
pid_t pid = vpid(item); struct rst_info *rst_info = rsti(item); INIT_LIST_HEAD(&rst_info->fds); if (item->ids == NULL) /* zombie */ return 0; if (rsti(item)->fdt && rsti(item)->fdt->pid != vpid(item)) return 0; img = open_image(CR_FD_FDINFO, O_RSTR, item->ids->files_id); if (!img) return -1; while (1) { FdinfoEntry *e; ret = pb_read_one_eof(img, &e, PB_FDINFO); if (ret <= 0) break; if (e->fd >= kdat.sysctl_nr_open) { ret = -1; pr_err("Too big FD number to restore %d\n", e->fd); break; } ret = collect_fd(pid, e, rst_info, false); if (ret < 0) { fdinfo_entry__free_unpacked(e, NULL); break; } } close_image(img); return ret; } #define SETFL_MASK (O_APPEND | O_ASYNC | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) int set_fd_flags(int fd, int flags) { int ret; ret = fcntl(fd, F_GETFL, 0); if (ret < 0) goto err; flags = (SETFL_MASK & flags) | (ret & ~SETFL_MASK); ret = fcntl(fd, F_SETFL, flags); if (ret < 0) goto err; /* Let's check, that now actual flags contains those we need */ ret = fcntl(fd, F_GETFL, 0); if (ret < 0) goto err; if (ret != flags) { pr_err("fcntl call on fd %d (flags %#o) succeeded, " "but some flags were dropped: %#o\n", fd, flags, ret); return -1; } return 0; err: pr_perror("fcntl call on fd %d (flags %x) failed", fd, flags); return -1; } struct fd_open_state { char *name; int (*cb)(int, struct fdinfo_list_entry *); }; static int receive_fd(struct fdinfo_list_entry *fle); static void transport_name_gen(struct sockaddr_un *addr, int *len, int pid) { addr->sun_family = AF_UNIX; snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d-%" PRIx64, pid, criu_run_id); *len = SUN_LEN(addr); *addr->sun_path = '\0'; } static bool task_fle(struct pstree_item *task, struct fdinfo_list_entry *fle) { struct fdinfo_list_entry *tmp; list_for_each_entry(tmp, &rsti(task)->fds, ps_list) if (fle == tmp) return true; return false; } static int plant_fd(struct fdinfo_list_entry *fle, int fd) { BUG_ON(fle->received); fle->received = 1; return reopen_fd_as(fle->fe->fd, fd); 
}

/*
 * Receive fds sent to us over the per-task transport socket until the
 * one for fle arrives. Descriptors may arrive in any order, so every
 * fd read from the socket is planted into the fle that was sent along
 * with it. Returns 0 when fle's fd has been received, 1 if the socket
 * has no data yet (caller should retry after wait_fds_event()), and
 * -1 on error.
 */
static int recv_fd_from_peer(struct fdinfo_list_entry *fle)
{
	struct fdinfo_list_entry *tmp;
	int fd, ret, tsock;

	if (fle->received)
		return 0;

	tsock = get_service_fd(TRANSPORT_FD_OFF);
	do {
		/* Non-blocking: each datagram carries one fd plus the receiver-side fle pointer. */
		ret = __recv_fds(tsock, &fd, 1, (void *)&tmp, sizeof(struct fdinfo_list_entry *), MSG_DONTWAIT);
		if (ret == -EAGAIN || ret == -EWOULDBLOCK)
			return 1;
		else if (ret)
			return -1;

		pr_info("Further fle=%p, pid=%d\n", tmp, fle->pid);
		/* The announced fle must belong to the current task's fd list. */
		if (!task_fle(current, tmp)) {
			pr_err("Unexpected fle %p, pid=%d\n", tmp, vpid(current));
			return -1;
		}
		if (plant_fd(tmp, fd))
			return -1;
	} while (tmp != fle);

	return 0;
}

/*
 * Send fd to the task that owns fle: the datagram is addressed to that
 * task's transport socket (named from fle->pid) and carries the fle
 * pointer so the receiver knows where to plant the descriptor. Wakes
 * the receiver via set_fds_event(). Returns 0 on success, -1 on error.
 */
static int send_fd_to_peer(int fd, struct fdinfo_list_entry *fle)
{
	struct sockaddr_un saddr;
	int len, sock, ret;

	sock = get_service_fd(TRANSPORT_FD_OFF);
	transport_name_gen(&saddr, &len, fle->pid);
	pr_info("\t\tSend fd %d to %s\n", fd, saddr.sun_path + 1);
	ret = send_fds(sock, &saddr, len, &fd, 1, (void *)&fle, sizeof(struct fdinfo_list_entry *));
	if (ret < 0)
		return -1;
	return set_fds_event(fle->pid);
}

/*
 * Helpers to scatter file_desc across users for those files, that
 * create two descriptors from a single system call at once (e.g.
 * ... or better i.e.
-- pipes, socketpairs and ttys) */ int recv_desc_from_peer(struct file_desc *d, int *fd) { struct fdinfo_list_entry *fle; fle = file_master(d); *fd = fle->fe->fd; return recv_fd_from_peer(fle); } int send_desc_to_peer(int fd, struct file_desc *d) { return send_fd_to_peer(fd, file_master(d)); } static int send_fd_to_self(int fd, struct fdinfo_list_entry *fle) { int dfd = fle->fe->fd; if (fd == dfd) return 0; BUG_ON(dfd == get_service_fd(TRANSPORT_FD_OFF)); pr_info("\t\t\tGoing to dup %d into %d\n", fd, dfd); if (dup2(fd, dfd) != dfd) { pr_perror("Can't dup local fd %d -> %d", fd, dfd); return -1; } if (fcntl(dfd, F_SETFD, fle->fe->flags) == -1) { pr_perror("Unable to set file descriptor flags"); return -1; } fle->received = 1; return 0; } static int serve_out_fd(int pid, int fd, struct file_desc *d) { int ret; struct fdinfo_list_entry *fle; pr_info("\t\tCreate fd for %d\n", fd); list_for_each_entry(fle, &d->fd_info_head, desc_list) { if (pid == fle->pid) ret = send_fd_to_self(fd, fle); else ret = send_fd_to_peer(fd, fle); if (ret) { pr_err("Can't sent fd %d to %d\n", fd, fle->pid); goto out; } } ret = 0; out: return ret; } int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) { struct file_desc *d = fle->desc; pid_t pid = fle->pid; if (reopen_fd_as(fle->fe->fd, new_fd)) return -1; if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) { pr_perror("Unable to set file descriptor flags"); return -1; } BUG_ON(fle->stage != FLE_INITIALIZED); fle->stage = FLE_OPEN; if (serve_out_fd(pid, fle->fe->fd, d)) return -1; return 0; } static int open_fd(struct fdinfo_list_entry *fle) { struct file_desc *d = fle->desc; struct fdinfo_list_entry *fle_m; int new_fd = -1, ret; fle_m = file_master(d); if (fle != fle_m) { BUG_ON(fle->stage != FLE_INITIALIZED); ret = receive_fd(fle); if (ret != 0) return ret; goto out; } /* * Open method returns the following values: * 0 -- restore is successfully finished; * 1 -- restore is in process or can't be started * yet, because of it 
depends on another fles, * so the method should be called once again; * -1 -- restore failed. * In case of 0 and 1 return values, new_fd may * be not negative. In this case it contains newly * opened file descriptor, which may be served out. * For every fle, new_fd is populated only once. * See setup_and_serve_out() BUG_ON for the details. */ ret = d->ops->open(d, &new_fd); if (ret != -1 && new_fd >= 0) { if (setup_and_serve_out(fle, new_fd) < 0) return -1; } out: if (ret == 0) fle->stage = FLE_RESTORED; return ret; } static int receive_fd(struct fdinfo_list_entry *fle) { int ret; pr_info("\tReceive fd for %d\n", fle->fe->fd); ret = recv_fd_from_peer(fle); if (ret != 0) { if (ret != 1) pr_err("Can't get fd=%d, pid=%d\n", fle->fe->fd, fle->pid); return ret; } if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) { pr_perror("Unable to set file descriptor flags"); return -1; } return 0; } static void close_fdinfos(struct list_head *list) { struct fdinfo_list_entry *fle; list_for_each_entry(fle, list, ps_list) close(fle->fe->fd); } static int open_fdinfos(struct pstree_item *me) { struct list_head *list = &rsti(me)->fds; struct fdinfo_list_entry *fle, *tmp; LIST_HEAD(completed); LIST_HEAD(fake); bool progress, again; int st, ret = 0; do { progress = again = false; clear_fds_event(); list_for_each_entry_safe(fle, tmp, list, ps_list) { st = fle->stage; BUG_ON(st == FLE_RESTORED); ret = open_fd(fle); if (ret == -1) { pr_err("Unable to open fd=%d id=%#x\n", fle->fe->fd, fle->fe->id); goto splice; } if (st != fle->stage || ret == 0) progress = true; if (ret == 0) { /* * We delete restored items from fds list, * so open() methods may base on this feature * and reduce number of fles in their checks. 
*/ list_del(&fle->ps_list); if (!fle->fake) list_add(&fle->ps_list, &completed); else list_add(&fle->ps_list, &fake); } if (ret == 1) again = true; } if (!progress && again) wait_fds_event(); } while (again || progress); BUG_ON(!list_empty(list)); /* * Fake fles may be used for restore other * file types, so their closing is delayed. */ close_fdinfos(&fake); splice: list_splice(&fake, list); list_splice(&completed, list); return ret; } int close_old_fds(void) { DIR *dir; struct dirent *de; int fd, ret; /** * Close previous /proc/self/ service fd, as we don't want to reuse it * from a different task. Also there can be some junk fd in it's place * after we've moved our service fds (e.g. from other task of parents * shared fdtable), we need to close it before opendir_proc() below. */ __close_service_fd(PROC_SELF_FD_OFF); dir = opendir_proc(PROC_SELF, "fd"); if (dir == NULL) return -1; while ((de = readdir(dir))) { if (dir_dots(de)) continue; ret = sscanf(de->d_name, "%d", &fd); if (ret != 1) { pr_err("Can't parse %s\n", de->d_name); closedir(dir); close_pid_proc(); return -1; } if ((!is_any_service_fd(fd)) && (dirfd(dir) != fd)) close_safe(&fd); } closedir(dir); close_pid_proc(); return 0; } int prepare_fds(struct pstree_item *me) { u32 ret = 0; pr_info("Opening fdinfo-s\n"); /* * This must be done after forking to allow child * to get the cgroup fd so it can move into the * correct /tasks file if it is in a different cgroup * set than its parent */ sfds_protected = false; close_service_fd(CGROUP_YARD); sfds_protected = true; if (rsti(me)->fdt) { struct fdt *fdt = rsti(me)->fdt; /* * Wait all tasks, who share a current fd table. * We should be sure, that nobody use any file * descriptor while fdtable is being restored. 
*/ futex_inc_and_wake(&fdt->fdt_lock); futex_wait_while_lt(&fdt->fdt_lock, fdt->nr); if (fdt->pid != vpid(me)) { pr_info("File descriptor table is shared with %d\n", fdt->pid); futex_wait_until(&fdt->fdt_lock, fdt->nr + 1); goto out; } } BUG_ON(current->pid->state == TASK_HELPER); ret = open_fdinfos(me); if (rsti(me)->fdt) futex_inc_and_wake(&rsti(me)->fdt->fdt_lock); out: return ret; } static int checkfdis(int fd, const char *path) { struct stat st1, st2; if (stat(path, &st1)) { pr_perror("cannot stat %s", path); return -1; } if (fstat(fd, &st2)) { pr_perror("cannot fstat %d", fd); return -1; } if (st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino) { return 0; } return 1; } static int fchroot(int fd) { if (!checkfdis(fd, "/")) { return 0; } /* * There's no such thing in syscalls. We can emulate * it using fchdir() */ if (fchdir(fd) < 0) { pr_perror("Can't chdir to proc"); return -1; } pr_debug("Going to chroot into /proc/self/fd/%d\n", fd); return chroot("."); } static int need_chroot(int saved_root) { struct stat saved_root_stat, cur_root_stat; int psd; if (fstat(saved_root, &saved_root_stat) == -1) { pr_perror("Failed to stat saved root dir"); return -1; } psd = open_pid_proc(PROC_SELF); if (psd < 0) { pr_perror("Failed to open PROC_SELF"); return -1; } if (fstatat(psd, "root", &cur_root_stat, 0) == -1) { pr_perror("Failed to stat current root dir"); return -1; } return saved_root_stat.st_ino != cur_root_stat.st_ino || saved_root_stat.st_dev != cur_root_stat.st_dev; } int restore_fs(struct pstree_item *me) { int dd_root = -1, dd_cwd = -1, ret, err = -1; struct rst_info *ri = rsti(me); bool do_chroot = true; /* * First -- open both descriptors. We will not * be able to open the cwd one after we chroot. 
*/ dd_root = open_reg_fd(ri->root); if (dd_root < 0) { pr_err("Can't open root\n"); goto out; } dd_cwd = open_reg_fd(ri->cwd); if (dd_cwd < 0) { pr_err("Can't open cwd\n"); goto out; } /* * In unprivileged mode chroot() may fail if we don't have * sufficient privileges, therefore only do it if the process * is actually chrooted. */ if (opts.unprivileged) do_chroot = need_chroot(dd_root); /* * Now do chroot/chdir. Chroot goes first as it calls chdir into * dd_root so we'd need to fix chdir after it anyway. */ if (do_chroot) { ret = fchroot(dd_root); if (ret < 0) { pr_perror("Can't change root"); goto out; } } ret = fchdir(dd_cwd); if (ret < 0) { pr_perror("Can't change cwd"); goto out; } if (ri->has_umask) { pr_info("Restoring umask to %o\n", ri->umask); umask(ri->umask); } err = 0; out: if (dd_cwd >= 0) close(dd_cwd); if (dd_root >= 0) close(dd_root); return err; } int prepare_fs_pid(struct pstree_item *item) { pid_t pid = vpid(item); struct rst_info *ri = rsti(item); struct cr_img *img; FsEntry *fe; int ret = -1; img = open_image(CR_FD_FS, O_RSTR, pid); if (!img) goto out; ret = pb_read_one_eof(img, &fe, PB_FS); close_image(img); if (ret <= 0) goto out; ri->cwd = collect_special_file(fe->cwd_id); if (!ri->cwd) { pr_err("Can't find task cwd file\n"); goto out_f; } ri->root = collect_special_file(fe->root_id); if (!ri->root) { pr_err("Can't find task root file\n"); goto out_f; } ri->has_umask = fe->has_umask; ri->umask = fe->umask; ret = 0; out_f: fs_entry__free_unpacked(fe, NULL); out: return ret; } int shared_fdt_prepare(struct pstree_item *item) { struct pstree_item *parent = item->parent; struct fdt *fdt; if (!rsti(parent)->fdt) { fdt = shmalloc(sizeof(*rsti(item)->fdt)); if (fdt == NULL) return -1; rsti(parent)->fdt = fdt; futex_init(&fdt->fdt_lock); fdt->nr = 1; fdt->pid = vpid(parent); } else fdt = rsti(parent)->fdt; rsti(item)->fdt = fdt; rsti(item)->service_fd_id = fdt->nr; fdt->nr++; return 0; } /* * Inherit fd support. 
 *
 * There are cases where a process's file descriptor cannot be restored
 * from the checkpointed image. For example, a pipe file descriptor with
 * one end in the checkpointed process and the other end in a separate
 * process (that was not part of the checkpointed process tree) cannot be
 * restored because after checkpoint the pipe would be broken and removed.
 *
 * There are also cases where the user wants to use a new file during
 * restore instead of the original file in the checkpointed image. For
 * example, the user wants to change the log file of a process from
 * /path/to/oldlog to /path/to/newlog.
 *
 * In these cases, criu's caller should set up a new file descriptor to be
 * inherited by the restored process and specify it with the --inherit-fd
 * command line option. The argument of --inherit-fd has the format
 * fd[%d]:%s, where %d tells criu which of its own file descriptor to use
 * for restoring file identified by %s.
 *
 * As a debugging aid, if the argument has the format debug[%d]:%s, it tells
 * criu to write out the string after colon to the file descriptor %d. This
 * can be used to leave a "restore marker" in the output stream of the process.
 *
 * It's important to note that inherit fd support breaks applications
 * that depend on the state of the file descriptor being inherited. So,
 * consider inherit fd only for specific use cases that you know for sure
 * won't break the application.
 *
 * For examples please visit http://criu.org/Category:HOWTO.
 */

/* One --inherit-fd mapping: a file identifier and the local fd used for it. */
struct inherit_fd {
	struct list_head inh_list;
	char *inh_id;  /* file identifier */
	int inh_fd;    /* criu's descriptor to inherit */
	int inh_fd_id; /* fdstore id of inh_fd, set by inherit_fd_move_to_fdstore() */
};

/* Highest fd number seen among --inherit-fd arguments (-1 when none given). */
int inh_fd_max = -1;

/*
 * Parse one --inherit-fd argument of the form "fd[N]:id" or "debug[N]:string".
 * "fd" entries are queued on opts.inherit_fds via inherit_fd_add(); "debug"
 * entries are written to fd N immediately. Returns 0 on success, -1 on a
 * malformed argument or failed write.
 */
int inherit_fd_parse(char *optarg)
{
	char *cp = NULL;
	int n = -1;
	int fd = -1;
	int dbg = 0;

	/*
	 * Parse the argument.
	 */
	if (!strncmp(optarg, "fd", 2))
		cp = &optarg[2];
	else if (!strncmp(optarg, "debug", 5)) {
		cp = &optarg[5];
		dbg = 1;
	}
	if (cp) {
		n = sscanf(cp, "[%d]:", &fd);
		cp = strchr(optarg, ':');
	}
	/* Need a converted non-negative fd, a colon, and a non-empty id after it. */
	if (n != 1 || fd < 0 || !cp || !cp[1]) {
		pr_err("Invalid inherit fd argument: %s\n", optarg);
		return -1;
	}

	/*
	 * If the argument is a debug string, write it to fd.
	 * Otherwise, add it to the inherit fd list.
	 */
	cp++;
	if (dbg) {
		n = strlen(cp);
		if (write(fd, cp, n) != n) {
			pr_err("Can't write debug message %s to inherit fd %d\n", cp, fd);
			return -1;
		}
		return 0;
	}

	return inherit_fd_add(fd, cp);
}

/*
 * Remember fd as the inherited descriptor for the file identified by key.
 * The fd must be valid (fstat()-able); key is duplicated. Also tracks the
 * largest inherited fd in inh_fd_max. Returns 0 on success, -1 on error.
 */
int inherit_fd_add(int fd, char *key)
{
	struct inherit_fd *inh;
	struct stat sbuf;

	if (fstat(fd, &sbuf) == -1) {
		pr_perror("Can't fstat inherit fd %d", fd);
		return -1;
	}

	inh = xmalloc(sizeof *inh);
	if (inh == NULL)
		return -1;

	if (fd > inh_fd_max)
		inh_fd_max = fd;

	inh->inh_id = xstrdup(key);
	if (inh->inh_id == NULL) {
		xfree(inh);
		return -1;
	}

	inh->inh_fd = fd;
	list_add_tail(&inh->inh_list, &opts.inherit_fds);
	return 0;
}

/*
 * Log the inherit fd list. Called for diagnostics purposes
 * after the log file is initialized.
 */
void inherit_fd_log(void)
{
	struct inherit_fd *inh;

	list_for_each_entry(inh, &opts.inherit_fds, inh_list) {
		pr_info("File %s will be restored from inherit fd %d\n", inh->inh_id, inh->inh_fd);
	}
}

/*
 * Move every inherited fd into the fdstore so it stays reachable during
 * restore; the original descriptor is closed afterwards.
 */
int inherit_fd_move_to_fdstore(void)
{
	struct inherit_fd *inh;

	list_for_each_entry(inh, &opts.inherit_fds, inh_list) {
		inh->inh_fd_id = fdstore_add(inh->inh_fd);
		if (inh->inh_fd_id < 0)
			return -1;
		close_safe(&inh->inh_fd);
	}
	return 0;
}

/*
 * Look up the inherit fd list by a file identifier.
*/ int inherit_fd_lookup_id(char *id) { int ret; struct inherit_fd *inh; ret = -1; list_for_each_entry(inh, &opts.inherit_fds, inh_list) { if (!strcmp(inh->inh_id, id)) { ret = fdstore_get(inh->inh_fd_id); pr_debug("Found id %s (fd %d) in inherit fd list\n", id, ret); break; } } return ret; } bool inherited_fd(struct file_desc *d, int *fd_p) { char buf[PATH_MAX], *id_str; int i_fd; if (FDIH_FROM_0 <= d->fds_inherited) { if (fd_p) { snprintf(buf, sizeof(buf), "fd[%d]", d->fds_inherited); i_fd = inherit_fd_lookup_id(buf); if (i_fd < 0) return false; *fd_p = dup(i_fd); } pr_info("File id %" PRIu32 " will not be tried restored as all fd's are inherited\n", d->id); return true; } if (!d->ops->name) return false; id_str = d->ops->name(d, buf, sizeof(buf)); i_fd = inherit_fd_lookup_id(id_str); if (i_fd < 0) return false; if (fd_p == NULL) return true; *fd_p = i_fd; pr_info("File %s will be restored from fd %d dumped " "from inherit fd %d\n", id_str, *fd_p, i_fd); return true; } int inherit_fd_fini(void) { struct inherit_fd *inh; list_for_each_entry(inh, &opts.inherit_fds, inh_list) { int fd; if (1 == sscanf(inh->inh_id, "fd[%d]", &fd)) { int inh_fd = fdstore_get(inh->inh_fd_id); pr_debug("Inherit fd %d(%s) -> %d\n", inh_fd, inh->inh_id, fd); reopen_fd_as_nocheck(fd, inh_fd); } } return 0; } int open_transport_socket(void) { pid_t pid = vpid(current); struct sockaddr_un saddr; int sock, slen, ret = -1; sock = socket(PF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0); if (sock < 0) { pr_perror("Can't create socket"); goto out; } transport_name_gen(&saddr, &slen, pid); if (bind(sock, (struct sockaddr *)&saddr, slen) < 0) { pr_perror("Can't bind transport socket %s", saddr.sun_path + 1); close(sock); goto out; } if (install_service_fd(TRANSPORT_FD_OFF, sock) < 0) goto out; ret = 0; out: return ret; } static int collect_one_file_entry(FileEntry *fe, u_int32_t id, ProtobufCMessage *base, struct collect_image_info *cinfo) { if (fe->id != id) { pr_err("ID mismatch %u != %u\n", fe->id, id); 
return -1; } return collect_entry(base, cinfo); } static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) { int ret = 0; FileEntry *fe; fe = pb_msg(base, FileEntry); switch (fe->type) { default: pr_err("Unknown file type %d\n", fe->type); return -1; case FD_TYPES__REG: ret = collect_one_file_entry(fe, fe->reg->id, &fe->reg->base, ®_file_cinfo); break; case FD_TYPES__INETSK: ret = collect_one_file_entry(fe, fe->isk->id, &fe->isk->base, &inet_sk_cinfo); break; case FD_TYPES__NS: ret = collect_one_file_entry(fe, fe->nsf->id, &fe->nsf->base, &nsfile_cinfo); break; case FD_TYPES__PACKETSK: ret = collect_one_file_entry(fe, fe->psk->id, &fe->psk->base, &packet_sk_cinfo); break; case FD_TYPES__NETLINKSK: ret = collect_one_file_entry(fe, fe->nlsk->id, &fe->nlsk->base, &netlink_sk_cinfo); break; case FD_TYPES__EVENTFD: ret = collect_one_file_entry(fe, fe->efd->id, &fe->efd->base, &eventfd_cinfo); break; case FD_TYPES__EVENTPOLL: ret = collect_one_file_entry(fe, fe->epfd->id, &fe->epfd->base, &epoll_cinfo); break; case FD_TYPES__SIGNALFD: ret = collect_one_file_entry(fe, fe->sgfd->id, &fe->sgfd->base, &signalfd_cinfo); break; case FD_TYPES__TUNF: ret = collect_one_file_entry(fe, fe->tunf->id, &fe->tunf->base, &tunfile_cinfo); break; case FD_TYPES__TIMERFD: ret = collect_one_file_entry(fe, fe->tfd->id, &fe->tfd->base, &timerfd_cinfo); break; case FD_TYPES__INOTIFY: ret = collect_one_file_entry(fe, fe->ify->id, &fe->ify->base, &inotify_cinfo); break; case FD_TYPES__FANOTIFY: ret = collect_one_file_entry(fe, fe->ffy->id, &fe->ffy->base, &fanotify_cinfo); break; case FD_TYPES__EXT: ret = collect_one_file_entry(fe, fe->ext->id, &fe->ext->base, &ext_file_cinfo); break; case FD_TYPES__UNIXSK: ret = collect_one_file_entry(fe, fe->usk->id, &fe->usk->base, &unix_sk_cinfo); break; case FD_TYPES__FIFO: ret = collect_one_file_entry(fe, fe->fifo->id, &fe->fifo->base, &fifo_cinfo); break; case FD_TYPES__PIPE: ret = collect_one_file_entry(fe, fe->pipe->id, 
&fe->pipe->base, &pipe_cinfo); break; case FD_TYPES__TTY: ret = collect_one_file_entry(fe, fe->tty->id, &fe->tty->base, &tty_cinfo); break; case FD_TYPES__MEMFD: ret = collect_one_file_entry(fe, fe->memfd->id, &fe->memfd->base, &memfd_cinfo); break; #ifdef CONFIG_HAS_LIBBPF case FD_TYPES__BPFMAP: ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo); break; #endif } return ret; } struct collect_image_info files_cinfo = { .fd_type = CR_FD_FILES, .pb_type = PB_FILE, .priv_size = 0, .collect = collect_one_file, .flags = COLLECT_NOFREE, }; int prepare_files(void) { init_fdesc_hash(); init_sk_info_hash(); return collect_image(&files_cinfo); } crac-criu-1.5.0/criu/filesystems.c000066400000000000000000000424561471504326700170660ustar00rootroot00000000000000#include #include #include #include #include #include "common/config.h" #include "int.h" #include "common/compiler.h" #include "xmalloc.h" #include "cr_options.h" #include "filesystems.h" #include "namespaces.h" #include "mount.h" #include "pstree.h" #include "kerndat.h" #include "protobuf.h" #include "autofs.h" #include "util.h" #include "fs-magic.h" #include "tty.h" #include "images/mnt.pb-c.h" #include "images/binfmt-misc.pb-c.h" static int attach_option(struct mount_info *pm, char *opt) { if (pm->options[0] == '\0') pm->options = xstrcat(pm->options, "%s", opt); else pm->options = xstrcat(pm->options, ",%s", opt); return pm->options ? 
0 : -1; } #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED struct binfmt_misc_info { BinfmtMiscEntry *bme; struct list_head list; }; LIST_HEAD(binfmt_misc_list); static int binfmt_misc_parse_or_collect(struct mount_info *pm) { opts.has_binfmt_misc = true; return 0; } static int binfmt_misc_virtual(struct mount_info *pm) { return kerndat_fs_virtualized(KERNDAT_FS_STAT_BINFMT_MISC, pm->s_dev); } static int parse_binfmt_misc_entry(struct bfd *f, BinfmtMiscEntry *bme) { while (1) { char *str; str = breadline(f); if (IS_ERR(str)) return -1; if (!str) break; if (!strncmp(str, "enabled", 7)) { bme->enabled = true; continue; } if (!strncmp(str, "disabled", 8)) continue; if (!strncmp(str, "offset ", 7)) { if (sscanf(str + 7, "%i", &bme->offset) != 1) return -1; bme->has_offset = true; continue; } #define DUP_EQUAL_AS(key, member) \ if (!strncmp(str, key, strlen(key))) { \ bme->member = xstrdup(str + strlen(key)); \ if (!bme->member) \ return -1; \ continue; \ } DUP_EQUAL_AS("interpreter ", interpreter) DUP_EQUAL_AS("flags: ", flags) DUP_EQUAL_AS("extension .", extension) DUP_EQUAL_AS("magic ", magic) DUP_EQUAL_AS("mask ", mask) #undef DUP_EQUAL_AS pr_perror("binfmt_misc: unsupported feature %s", str); return -1; } return 0; } static int dump_binfmt_misc_entry(int dfd, char *name, struct cr_img *img) { BinfmtMiscEntry bme = BINFMT_MISC_ENTRY__INIT; struct bfd f; int ret = -1; f.fd = openat(dfd, name, O_RDONLY); if (f.fd < 0) { pr_perror("binfmt_misc: can't open %s", name); return -1; } if (bfdopenr(&f)) return -1; if (parse_binfmt_misc_entry(&f, &bme)) goto err; bme.name = name; if (pb_write_one(img, &bme, PB_BINFMT_MISC)) goto err; ret = 0; err: free(bme.interpreter); free(bme.flags); free(bme.extension); free(bme.magic); free(bme.mask); bclose(&f); return ret; } static int binfmt_misc_dump(struct mount_info *pm) { static bool dumped = false; struct cr_img *img = NULL; struct dirent *de; DIR *fdir = NULL; int fd, ret; ret = binfmt_misc_virtual(pm); if (ret <= 0) return ret; if 
(dumped) { pr_err("Second binfmt_misc superblock\n"); return -1; } dumped = true; fd = open_mountpoint(pm); if (fd < 0) return fd; fdir = fdopendir(fd); if (fdir == NULL) { close(fd); return -1; } ret = -1; while ((de = readdir(fdir))) { if (dir_dots(de)) continue; if (!strcmp(de->d_name, "register")) continue; if (!strcmp(de->d_name, "status")) continue; if (!img) { /* Create image only if an entry exists, i.e. here */ img = open_image(CR_FD_BINFMT_MISC, O_DUMP); if (!img) goto out; } if (dump_binfmt_misc_entry(fd, de->d_name, img)) goto out; } ret = 0; out: if (img) close_image(img); closedir(fdir); return ret; } static int write_binfmt_misc_entry(char *mp, char *buf, BinfmtMiscEntry *bme) { int fd, len, ret = -1; char path[PATH_MAX + 1]; snprintf(path, PATH_MAX, "%s/register", mp); fd = open(path, O_WRONLY); if (fd < 0) { pr_perror("binfmt_misc: can't open %s", path); return -1; } len = strlen(buf); if (write(fd, buf, len) != len) { pr_perror("binfmt_misc: can't write to %s", path); goto close; } if (!bme->enabled) { close(fd); snprintf(path, PATH_MAX, "%s/%s", mp, bme->name); fd = open(path, O_WRONLY); if (fd < 0) { pr_perror("binfmt_misc: can't open %s", path); goto out; } if (write(fd, "0", 1) != 1) { pr_perror("binfmt_misc: can't write to %s", path); goto close; } } ret = 0; close: close(fd); out: return ret; } #define BINFMT_MISC_STR (1920 + 1) static int make_bfmtm_magic_str(char *buf, BinfmtMiscEntry *bme) { int i, len; /* * Format is ":name:type(M):offset:magic:mask:interpreter:flags". * Magic and mask are special fields. Kernel outputs them as * a sequence of hexadecimal numbers (abc -> 616263), and we * dump them without changes. But for registering a new entry * it expects every byte is prepended with \x, i.e. \x61\x62\x63. */ len = strlen(bme->name) + 3 /* offset < 128 */ + 2 * strlen(bme->magic) + (bme->mask ? 2 * strlen(bme->mask) : 0) + strlen(bme->interpreter) + (bme->flags ? 
strlen(bme->flags) : 0) + strlen(":::::::"); if ((len > BINFMT_MISC_STR - 1) || bme->offset > 128) return -1; buf += sprintf(buf, ":%s:M:%d:", bme->name, bme->offset); len = strlen(bme->magic); for (i = 0; i < len; i += 2) buf += sprintf(buf, "\\x%c%c", bme->magic[i], bme->magic[i + 1]); buf += sprintf(buf, ":"); if (bme->mask) { len = strlen(bme->mask); for (i = 0; i < len; i += 2) buf += sprintf(buf, "\\x%c%c", bme->mask[i], bme->mask[i + 1]); } sprintf(buf, ":%s:%s", bme->interpreter, bme->flags ?: "\0"); return 1; } static int binfmt_misc_restore_bme(struct mount_info *mi, BinfmtMiscEntry *bme, char *buf) { int ret; if (!bme->name || !bme->interpreter) goto bad_dump; /* Either magic or extension should be there */ if (bme->magic) { ret = make_bfmtm_magic_str(buf, bme); } else if (bme->extension) { /* :name:E::extension::interpreter:flags */ ret = snprintf(buf, BINFMT_MISC_STR, ":%s:E::%s::%s:%s", bme->name, bme->extension, bme->interpreter, bme->flags ?: "\0"); if (ret >= BINFMT_MISC_STR) /* output truncated */ ret = -1; } else ret = -1; if (ret < 0) goto bad_dump; pr_debug("binfmt_misc_pattern=%s\n", buf); ret = write_binfmt_misc_entry(service_mountpoint(mi), buf, bme); return ret; bad_dump: pr_perror("binfmt_misc: bad dump"); return -1; } static int binfmt_misc_restore(struct mount_info *mi) { struct cr_img *img; char *buf; int ret = -1; buf = xmalloc(BINFMT_MISC_STR); if (!buf) return -1; if (!list_empty(&binfmt_misc_list)) { struct binfmt_misc_info *bmi; list_for_each_entry(bmi, &binfmt_misc_list, list) { ret = binfmt_misc_restore_bme(mi, bmi->bme, buf); if (ret) break; } goto free_buf; } img = open_image(CR_FD_BINFMT_MISC_OLD, O_RSTR, mi->s_dev); if (!img) { pr_err("Can't open binfmt_misc_old image\n"); goto free_buf; } else if (empty_image(img)) { close_image(img); ret = 0; goto free_buf; } ret = 0; while (ret == 0) { BinfmtMiscEntry *bme; ret = pb_read_one_eof(img, &bme, PB_BINFMT_MISC); if (ret <= 0) break; ret = binfmt_misc_restore_bme(mi, bme, buf); 
binfmt_misc_entry__free_unpacked(bme, NULL); } close_image(img); free_buf: free(buf); return ret; } static int collect_one_binfmt_misc_entry(void *o, ProtobufCMessage *msg, struct cr_img *img) { struct binfmt_misc_info *bmi = o; bmi->bme = pb_msg(msg, BinfmtMiscEntry); list_add_tail(&bmi->list, &binfmt_misc_list); return 0; } struct collect_image_info binfmt_misc_cinfo = { .fd_type = CR_FD_BINFMT_MISC, .pb_type = PB_BINFMT_MISC, .priv_size = sizeof(struct binfmt_misc_info), .collect = collect_one_binfmt_misc_entry, }; int collect_binfmt_misc(void) { return collect_image(&binfmt_misc_cinfo); } #else #define binfmt_misc_dump NULL #define binfmt_misc_restore NULL #define binfmt_misc_parse_or_collect NULL #endif static int tmpfs_dump(struct mount_info *pm) { int ret = -1, fd = -1, userns_pid = -1; struct cr_img *img; int tmp_fds[3], ntmp_fds = 0, i; fd = open_mountpoint(pm); if (fd < 0) return MNT_UNREACHABLE; /* * fd should not be one of standard descriptors, because * cr_system_userns will override them. 
*/ for (i = 0; i < 3; i++) { if (fd > 2) break; tmp_fds[ntmp_fds++] = fd; fd = dup(fd); if (fd < 0) { pr_perror("Unable to duplicate a file descriptor"); goto out; } } if (move_fd_from(&fd, STDIN_FILENO) < 0) goto out; if (fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) & ~FD_CLOEXEC) == -1) { pr_perror("Can not drop FD_CLOEXEC"); goto out; } img = open_image(CR_FD_TMPFS_DEV, O_DUMP, pm->s_dev); if (!img) goto out; if (root_ns_mask & CLONE_NEWUSER) userns_pid = root_item->pid->real; ret = cr_system_userns(fd, img_raw_fd(img), -1, "tar", (char *[]){ "tar", "--create", "--gzip", "--no-unquote", "--no-wildcards", "--one-file-system", "--check-links", "--preserve-permissions", "--sparse", "--numeric-owner", "--directory", "/proc/self/fd/0", ".", NULL }, 0, userns_pid); if (ret) pr_err("Can't dump tmpfs content\n"); close_image(img); out: for (i = 0; i < ntmp_fds; i++) close(tmp_fds[i]); close_safe(&fd); return ret; } static int tmpfs_restore(struct mount_info *pm) { int ret; struct cr_img *img; img = open_image(CR_FD_TMPFS_DEV, O_RSTR, pm->s_dev); if (empty_image(img)) { close_image(img); img = open_image(CR_FD_TMPFS_IMG, O_RSTR, pm->mnt_id); } if (!img) return -1; if (empty_image(img)) { close_image(img); return -1; } ret = cr_system(img_raw_fd(img), -1, -1, "tar", (char *[]){ "tar", "--extract", "--gzip", "--no-unquote", "--no-wildcards", "--directory", service_mountpoint(pm), NULL }, 0); close_image(img); if (ret) { pr_err("Can't restore tmpfs content\n"); return -1; } return 0; } /* * Virtualized devtmpfs on any side (dump or restore) * means, that we should try to handle it as a plain * tmpfs. * * Interesting case -- shared on dump and virtual on * restore -- will fail, since no tarball with the fs * contents will be found. 
*/ static int devtmpfs_virtual(struct mount_info *pm) { return kerndat_fs_virtualized(KERNDAT_FS_STAT_DEVTMPFS, pm->s_dev); } static int devtmpfs_dump(struct mount_info *pm) { int ret; ret = devtmpfs_virtual(pm); if (ret == 1) ret = tmpfs_dump(pm); return ret; } static int devtmpfs_restore(struct mount_info *pm) { int ret; ret = devtmpfs_virtual(pm); if (ret == 1) ret = tmpfs_restore(pm); return ret; } /* Is it mounted w or w/o the newinstance option */ static int devpts_parse(struct mount_info *pm) { int ret; ret = kerndat_fs_virtualized(KERNDAT_FS_STAT_DEVPTS, pm->s_dev); if (ret <= 0) return ret; /* * Kernel hides this option, but if the fs instance * is new (virtualized) we know that it was created * with -o newinstance. */ return attach_option(pm, "newinstance"); } static int fusectl_dump(struct mount_info *pm) { int fd, ret = -1; struct dirent *de; DIR *fdir = NULL; fd = open_mountpoint(pm); if (fd < 0) return fd; fdir = fdopendir(fd); if (fdir == NULL) { close(fd); return -1; } while ((de = readdir(fdir))) { int id; struct mount_info *it; if (dir_dots(de)) continue; if (sscanf(de->d_name, "%d", &id) != 1) { pr_err("wrong number of items scanned in fusectl dump\n"); goto out; } for (it = mntinfo; it; it = it->next) { if (it->fstype->code == FSTYPE__FUSE && id == kdev_minor(it->s_dev) && !mnt_is_external_bind(it)) { pr_err("%s is a fuse mount but not external\n", it->ns_mountpoint); goto out; } } } ret = 0; out: closedir(fdir); return ret; } static int debugfs_parse(struct mount_info *pm) { /* tracefs is automounted underneath debugfs sometimes, and the * kernel's overmounting protection prevents us from mounting debugfs * first without tracefs, so let's always mount debugfs MS_REC. 
*/ pm->flags |= MS_REC; return 0; } static int tracefs_parse(struct mount_info *pm) { return 1; } static bool cgroup_sb_equal(struct mount_info *a, struct mount_info *b) { if (a->private && b->private && strcmp(a->private, b->private)) return false; if (strcmp(a->options, b->options)) return false; return true; } static int cgroup_parse(struct mount_info *pm) { if (!(root_ns_mask & CLONE_NEWCGROUP)) return 0; /* cgroup namespaced mounts don't look rooted to CRIU, so let's fake it * here. */ pm->private = pm->root; pm->root = xstrdup("/"); if (!pm->root) return -1; return 0; } static bool btrfs_sb_equal(struct mount_info *a, struct mount_info *b) { /* There is a btrfs bug where it doesn't emit subvol= correctly when * files are bind mounted, so let's ignore it for now. * https://marc.info/?l=linux-btrfs&m=145857372803614&w=2 */ char *posa = strstr(a->options, "subvol="), *posb = strstr(b->options, "subvol="); bool equal; if (!posa || !posb) { pr_err("invalid btrfs options, no subvol argument\n"); return false; } *posa = *posb = 0; equal = !strcmp(a->options, b->options); *posa = *posb = 's'; if (!equal) return false; posa = strchr(posa, ','); posb = strchr(posb, ','); if ((posa && !posb) || (!posa && posb)) return false; if (posa && strcmp(posa, posb)) return false; return true; } static int dump_empty_fs(struct mount_info *pm) { int fd, ret = -1; fd = open_mountpoint(pm); if (fd < 0) return fd; ret = is_empty_dir(fd); if (ret == 0) { pr_err("%s isn't empty\n", pm->fstype->name); return -1; } return ret == 1 ? 0 : -1; } /* * Some fses (fuse) cannot be dumped, so we should always fail on dump/restore * of these fses. 
*/ static int always_fail(struct mount_info *pm) { pr_err("failed to dump fs %s (%s): always fail\n", pm->ns_mountpoint, pm->fstype->name); return -1; } static struct fstype fstypes[] = { { .name = "unsupported", .code = FSTYPE__UNSUPPORTED, }, { .name = "auto_cr", .code = FSTYPE__AUTO, }, { .name = "proc", .code = FSTYPE__PROC, }, { .name = "sysfs", .code = FSTYPE__SYSFS, }, { .name = "devtmpfs", .code = FSTYPE__DEVTMPFS, .dump = devtmpfs_dump, .restore = devtmpfs_restore, }, { .name = "binfmt_misc", .parse = binfmt_misc_parse_or_collect, .collect = binfmt_misc_parse_or_collect, .code = FSTYPE__BINFMT_MISC, .dump = binfmt_misc_dump, .restore = binfmt_misc_restore, }, { .name = "tmpfs", .code = FSTYPE__TMPFS, .dump = tmpfs_dump, .restore = tmpfs_restore, }, { .name = "devpts", .parse = devpts_parse, .code = FSTYPE__DEVPTS, .restore = devpts_restore, .check_bindmount = devpts_check_bindmount, }, { .name = "simfs", .code = FSTYPE__SIMFS, }, { .name = "btrfs", .code = FSTYPE__UNSUPPORTED, .sb_equal = btrfs_sb_equal, }, { .name = "pstore", .dump = dump_empty_fs, .code = FSTYPE__PSTORE, }, { .name = "mqueue", .dump = dump_empty_fs, .code = FSTYPE__MQUEUE, }, { .name = "securityfs", .code = FSTYPE__SECURITYFS, }, { .name = "fusectl", .dump = fusectl_dump, .code = FSTYPE__FUSECTL, }, { .name = "debugfs", .code = FSTYPE__DEBUGFS, .parse = debugfs_parse, }, { .name = "tracefs", .code = FSTYPE__TRACEFS, .parse = tracefs_parse, }, { .name = "cgroup", .code = FSTYPE__CGROUP, .parse = cgroup_parse, .sb_equal = cgroup_sb_equal, }, { .name = "cgroup2", .code = FSTYPE__CGROUP2, .parse = cgroup_parse, .sb_equal = cgroup_sb_equal, }, { .name = "aufs", .code = FSTYPE__AUFS, .parse = aufs_parse, }, { .name = "fuse", .code = FSTYPE__FUSE, .dump = always_fail, .restore = always_fail, }, { .name = "overlay", .code = FSTYPE__OVERLAYFS, .parse = overlayfs_parse, }, { .name = "autofs", .code = FSTYPE__AUTOFS, .parse = autofs_parse, .dump = autofs_dump, .mount = autofs_mount, }, }; struct 
fstype *fstype_auto(void) { return &fstypes[1]; } static char fsauto_all[] = "all"; static char *fsauto_names; static bool css_contains(const char *css, const char *str) { int len = strlen(str); const char *cur; if (!len) return false; for (cur = css; (cur = strstr(cur, str)); cur += len) { if (cur > css && cur[-1] != ',') continue; if (cur[len] && cur[len] != ',') continue; return true; } return false; } static bool fsname_is_auto(const char *name) { if (!fsauto_names) return false; if (fsauto_names == fsauto_all) return true; return css_contains(fsauto_names, name); } bool add_fsname_auto(const char *names) { char *old = fsauto_names; if (old == fsauto_all) return true; if (css_contains(names, fsauto_all)) fsauto_names = fsauto_all; else if (!old) { fsauto_names = xstrdup(names); if (!fsauto_names) abort(); } else { if (asprintf(&fsauto_names, "%s,%s", old, names) < 0) fsauto_names = NULL; } xfree(old); return fsauto_names != NULL; } struct fstype *find_fstype_by_name(char *fst) { int i; /* * This fn is required for two things. 
* 1st -- to check supported filesystems (as just mounting * anything is wrong, almost every fs has its own features) * 2nd -- save some space in the image (since we scan all * names anyway) */ for (i = 1; i < ARRAY_SIZE(fstypes); i++) { struct fstype *fstype = fstypes + i; if (!strcmp(fstype->name, fst)) return fstype; } if (fsname_is_auto(fst)) return &fstypes[1]; return &fstypes[0]; } struct fstype *decode_fstype(u32 fst) { int i; if (fst == FSTYPE__UNSUPPORTED) goto uns; for (i = 1; i < ARRAY_SIZE(fstypes); i++) { struct fstype *fstype = fstypes + i; if (!fstype->name) break; if (fstype->code == fst) return fstype; } uns: return &fstypes[0]; } crac-criu-1.5.0/criu/fsnotify.c000066400000000000000000000527051471504326700163560ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/compiler.h" #include "imgset.h" #include "fsnotify.h" #include "fdinfo.h" #include "mount.h" #include "filesystems.h" #include "image.h" #include "util.h" #include "crtools.h" #include "files.h" #include "files-reg.h" #include "file-ids.h" #include "criu-log.h" #include "kerndat.h" #include "common/list.h" #include "common/lock.h" #include "irmap.h" #include "cr_options.h" #include "namespaces.h" #include "pstree.h" #include "fault-injection.h" #include #include "protobuf.h" #include "images/fsnotify.pb-c.h" #include "images/mnt.pb-c.h" #undef LOG_PREFIX #define LOG_PREFIX "fsnotify: " struct fsnotify_mark_info { struct list_head list; union { InotifyWdEntry *iwe; FanotifyMarkEntry *fme; }; struct pprep_head prep; /* XXX union with remap */ struct file_remap *remap; }; struct fsnotify_file_info { union { InotifyFileEntry *ife; FanotifyFileEntry *ffe; }; struct list_head marks; struct file_desc d; }; /* File handle */ typedef struct { u32 bytes; u32 type; u64 __handle[16]; } fh_t; /* Checks if file descriptor @lfd is inotify */ 
int is_inotify_link(char *link) { return is_anon_link_type(link, "inotify"); } /* Checks if file descriptor @lfd is fanotify */ int is_fanotify_link(char *link) { return is_anon_link_type(link, "[fanotify]"); } static void decode_handle(fh_t *handle, FhEntry *img) { memzero(handle, sizeof(*handle)); handle->type = img->type; handle->bytes = img->bytes; memcpy(handle->__handle, img->handle, min(pb_repeated_size(img, handle), sizeof(handle->__handle))); } static int open_by_handle(void *arg, int fd, int pid) { return syscall(__NR_open_by_handle_at, fd, arg, O_PATH); } enum { ERR_NO_MOUNT = -1, ERR_NO_PATH_IN_MOUNT = -2, ERR_GENERIC = -3 }; static char *alloc_openable(unsigned int s_dev, unsigned long i_ino, FhEntry *f_handle) { struct mount_info *m; fh_t handle; int fd = -1; char *path; char suitable_mount_found = 0; decode_handle(&handle, f_handle); /* * We gonna try to open the handle and then * depending on command line options and type * of the filesystem (tmpfs/devtmpfs do not * preserve their inodes between mounts) we * might need to find out an openable path * get used on restore as a watch destination. */ for (m = mntinfo; m; m = m->next) { char buf[PATH_MAX], *__path; int mntfd, openable_fd; struct stat st; if (m->s_dev != s_dev) continue; if (!mnt_is_dir(m)) continue; mntfd = __open_mountpoint(m); pr_debug("\t\tTrying via mntid %d root %s ns_mountpoint @%s (%d)\n", m->mnt_id, m->root, m->ns_mountpoint, mntfd); if (mntfd < 0) continue; fd = userns_call(open_by_handle, UNS_FDOUT, &handle, sizeof(handle), mntfd); close(mntfd); if (fd < 0) continue; suitable_mount_found = 1; if (read_fd_link(fd, buf, sizeof(buf)) < 0) { close(fd); goto err; } close(fd); /* * Convert into a relative path. */ __path = (buf[1] != '\0') ? 
buf + 1 : "."; pr_debug("\t\t\tlink as %s\n", __path); mntfd = mntns_get_root_fd(m->nsid); if (mntfd < 0) goto err; openable_fd = openat(mntfd, __path, O_PATH); if (openable_fd >= 0) { if (fstat(openable_fd, &st)) { pr_perror("Can't stat on %s", __path); close(openable_fd); goto err; } close(openable_fd); pr_debug("\t\t\topenable (inode %s) as %s\n", st.st_ino == i_ino ? "match" : "don't match", __path); if (st.st_ino == i_ino) { path = xstrdup(buf); if (path == NULL) return ERR_PTR(ERR_GENERIC); if (root_ns_mask & CLONE_NEWNS) { f_handle->has_mnt_id = true; f_handle->mnt_id = m->mnt_id; } return path; } } else pr_debug("\t\t\tnot openable as %s (%s)\n", __path, strerror(errno)); } err: if (suitable_mount_found) return ERR_PTR(ERR_NO_PATH_IN_MOUNT); return ERR_PTR(ERR_NO_MOUNT); } static int open_handle(unsigned int s_dev, unsigned long i_ino, FhEntry *f_handle) { struct mount_info *m; int mntfd, fd = -1; fh_t handle; decode_handle(&handle, f_handle); pr_debug("Opening fhandle %x:%llx...\n", s_dev, (unsigned long long)handle.__handle[0]); for (m = mntinfo; m; m = m->next) { if (m->s_dev != s_dev || !mnt_is_dir(m)) continue; mntfd = __open_mountpoint(m); if (mntfd < 0) { pr_warn("Can't open mount for s_dev %x, continue\n", s_dev); continue; } fd = userns_call(open_by_handle, UNS_FDOUT, &handle, sizeof(handle), mntfd); if (fd >= 0) { close(mntfd); goto out; } close(mntfd); } out: return fd; } int check_open_handle(unsigned int s_dev, unsigned long i_ino, FhEntry *f_handle) { char *path, *irmap_path; struct mount_info *mi; if (fault_injected(FI_CHECK_OPEN_HANDLE)) goto fault; /* * Always try to fetch watchee path first. There are several reasons: * * - tmpfs/devtmps do not save inode numbers between mounts, * so it is critical to have the complete path under our * hands for restore purpose; * * - in case of migration the inodes might be changed as well * so the only portable solution is to carry the whole path * to the watchee inside image. 
*/ path = alloc_openable(s_dev, i_ino, f_handle); if (!IS_ERR_OR_NULL(path)) { pr_debug("\tHandle 0x%x:0x%lx is openable\n", s_dev, i_ino); goto out; } else if (IS_ERR(path) && PTR_ERR(path) == ERR_NO_MOUNT) { goto fault; } else if (IS_ERR(path) && PTR_ERR(path) == ERR_GENERIC) { goto err; } mi = lookup_mnt_sdev(s_dev); if (mi == NULL) { pr_err("Unable to lookup a mount by dev 0x%x\n", s_dev); goto err; } if ((mi->fstype->code == FSTYPE__TMPFS) || (mi->fstype->code == FSTYPE__DEVTMPFS)) { pr_err("Can't find suitable path for handle (dev %#x ino %#lx): %d\n", s_dev, i_ino, (int)PTR_ERR(path)); goto err; } if (!opts.force_irmap) /* * If we're not forced to do irmap, then * say we have no path for watch. Otherwise * do irmap scan even if the handle is * working. * * FIXME -- no need to open-by-handle if * we are in force-irmap and not on tempfs */ goto out_nopath; fault: pr_warn("\tHandle 0x%x:0x%lx cannot be opened\n", s_dev, i_ino); irmap_path = irmap_lookup(s_dev, i_ino); if (!irmap_path) { pr_err("\tCan't dump that handle\n"); return -1; } path = xstrdup(irmap_path); if (!path) goto err; out: pr_debug("\tDumping %s as path for handle\n", path); f_handle->path = path; out_nopath: return 0; err: return -1; } static int check_one_wd(InotifyWdEntry *we) { pr_info("wd: wd %#08x s_dev %#08x i_ino %#16" PRIx64 " mask %#08x\n", we->wd, we->s_dev, we->i_ino, we->mask); pr_info("\t[fhandle] bytes %#08x type %#08x __handle %#016" PRIx64 ":%#016" PRIx64 "\n", we->f_handle->bytes, we->f_handle->type, we->f_handle->handle[0], we->f_handle->handle[1]); if (we->mask & KERNEL_FS_EVENT_ON_CHILD) pr_warn_once("\t\tDetected FS_EVENT_ON_CHILD bit " "in mask (will be ignored on restore)\n"); if (check_open_handle(we->s_dev, we->i_ino, we->f_handle)) return -1; return 0; } static int dump_one_inotify(int lfd, u32 id, const struct fd_parms *p) { FileEntry fe = FILE_ENTRY__INIT; InotifyFileEntry ie = INOTIFY_FILE_ENTRY__INIT; int exit_code = -1, i, ret; ret = fd_has_data(lfd); if (ret < 
0) return -1; else if (ret > 0) pr_warn("The %#08x inotify events will be dropped\n", id); ie.id = id; ie.flags = p->flags; ie.fown = (FownEntry *)&p->fown; if (parse_fdinfo(lfd, FD_TYPES__INOTIFY, &ie)) goto free; for (i = 0; i < ie.n_wd; i++) if (check_one_wd(ie.wd[i])) goto free; fe.type = FD_TYPES__INOTIFY; fe.id = ie.id; fe.ify = &ie; pr_info("id %#08x flags %#08x\n", ie.id, ie.flags); if (pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE)) goto free; exit_code = 0; free: for (i = 0; i < ie.n_wd; i++) xfree(ie.wd[i]); xfree(ie.wd); return exit_code; } static int pre_dump_one_inotify(int pid, int lfd) { InotifyFileEntry ie = INOTIFY_FILE_ENTRY__INIT; int i; if (parse_fdinfo_pid(pid, lfd, FD_TYPES__INOTIFY, &ie)) return -1; for (i = 0; i < ie.n_wd; i++) { InotifyWdEntry *we = ie.wd[i]; if (irmap_queue_cache(we->s_dev, we->i_ino, we->f_handle)) return -1; xfree(we); } return 0; } const struct fdtype_ops inotify_dump_ops = { .type = FD_TYPES__INOTIFY, .dump = dump_one_inotify, .pre_dump = pre_dump_one_inotify, }; static int check_one_mark(FanotifyMarkEntry *fme) { if (fme->type == MARK_TYPE__INODE) { BUG_ON(!fme->ie); pr_info("mark: s_dev %#08x i_ino %#016" PRIx64 " mask %#08x\n", fme->s_dev, fme->ie->i_ino, fme->mask); pr_info("\t[fhandle] bytes %#08x type %#08x __handle %#016" PRIx64 ":%#016" PRIx64 "\n", fme->ie->f_handle->bytes, fme->ie->f_handle->type, fme->ie->f_handle->handle[0], fme->ie->f_handle->handle[1]); if (check_open_handle(fme->s_dev, fme->ie->i_ino, fme->ie->f_handle)) return -1; } if (fme->type == MARK_TYPE__MOUNT) { struct mount_info *m; BUG_ON(!fme->me); m = lookup_mnt_id(fme->me->mnt_id); if (!m) { pr_err("Can't find mnt_id 0x%x\n", fme->me->mnt_id); return -1; } if (!(root_ns_mask & CLONE_NEWNS)) fme->me->path = m->ns_mountpoint + 1; fme->s_dev = m->s_dev; pr_info("mark: s_dev %#08x mnt_id %#08x mask %#08x\n", fme->s_dev, fme->me->mnt_id, fme->mask); } return 0; } static int dump_one_fanotify(int lfd, u32 id, const struct 
fd_parms *p) { FileEntry fle = FILE_ENTRY__INIT; FanotifyFileEntry fe = FANOTIFY_FILE_ENTRY__INIT; int ret = -1, i; ret = fd_has_data(lfd); if (ret < 0) return -1; else if (ret > 0) pr_warn("The %#08x fanotify events will be dropped\n", id); ret = -1; fe.id = id; fe.flags = p->flags; fe.fown = (FownEntry *)&p->fown; if (parse_fdinfo(lfd, FD_TYPES__FANOTIFY, &fe) < 0) goto free; for (i = 0; i < fe.n_mark; i++) if (check_one_mark(fe.mark[i])) goto free; pr_info("id %#08x flags %#08x\n", fe.id, fe.flags); fle.type = FD_TYPES__FANOTIFY; fle.id = fe.id; fle.ffy = &fe; ret = pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fle, PB_FILE); free: for (i = 0; i < fe.n_mark; i++) xfree(fe.mark[i]); xfree(fe.mark); return ret; } static int pre_dump_one_fanotify(int pid, int lfd) { FanotifyFileEntry fe = FANOTIFY_FILE_ENTRY__INIT; int i; if (parse_fdinfo_pid(pid, lfd, FD_TYPES__FANOTIFY, &fe)) return -1; for (i = 0; i < fe.n_mark; i++) { FanotifyMarkEntry *me = fe.mark[i]; if (me->type == MARK_TYPE__INODE && irmap_queue_cache(me->s_dev, me->ie->i_ino, me->ie->f_handle)) return -1; xfree(me); } xfree(fe.mark); return 0; } const struct fdtype_ops fanotify_dump_ops = { .type = FD_TYPES__FANOTIFY, .dump = dump_one_fanotify, .pre_dump = pre_dump_one_fanotify, }; static char *get_mark_path(const char *who, struct file_remap *remap, FhEntry *f_handle, unsigned long i_ino, unsigned int s_dev, char *buf, int *target) { char *path = NULL; if (remap) { int mntns_root; mntns_root = mntns_get_root_by_mnt_id(remap->rmnt_id); pr_debug("\t\tRestore %s watch for %#08x:%#016lx (via %s)\n", who, s_dev, i_ino, remap->rpath); *target = openat(mntns_root, remap->rpath, O_PATH); } else if (f_handle->path) { int mntns_root; char *path = "."; uint32_t mnt_id = f_handle->has_mnt_id ? f_handle->mnt_id : -1; /* irmap cache is collected in the root namespaces. */ mntns_root = mntns_get_root_by_mnt_id(mnt_id); /* change "/foo" into "foo" and "/" into "." 
*/ if (f_handle->path[1] != '\0') path = f_handle->path + 1; pr_debug("\t\tRestore with path hint %d:%s\n", mnt_id, path); *target = openat(mntns_root, path, O_PATH); } else *target = open_handle(s_dev, i_ino, f_handle); if (*target < 0) { pr_perror("Unable to open %s", f_handle->path); goto err; } /* * fanotify/inotify open syscalls want path to attach * watch to. But the only thing we have is an FD obtained * via fhandle. Fortunately, when trying to attach the * /proc/pid/fd/ link, we will watch the inode the link * points to, i.e. -- just what we want. */ sprintf(buf, "/proc/self/fd/%d", *target); path = buf; if (!pr_quelled(LOG_DEBUG)) { char link[PATH_MAX]; if (read_fd_link(*target, link, sizeof(link)) < 0) link[0] = '\0'; pr_debug("\t\tRestore %s watch for %#08x:%#016lx (via %s -> %s)\n", who, s_dev, i_ino, path, link); } err: return path; } static int restore_one_inotify(int inotify_fd, struct fsnotify_mark_info *info) { InotifyWdEntry *iwe = info->iwe; int ret = -1, target = -1; char buf[PSFDS], *path; uint32_t mask; path = get_mark_path("inotify", info->remap, iwe->f_handle, iwe->i_ino, iwe->s_dev, buf, &target); if (!path) goto err; mask = iwe->mask & IN_ALL_EVENTS; if (iwe->mask & ~IN_ALL_EVENTS) { pr_info("\t\tfilter event mask %#x -> %#x\n", iwe->mask, mask); } if (kdat.has_inotify_setnextwd) { if (ioctl(inotify_fd, INOTIFY_IOC_SETNEXTWD, iwe->wd)) { pr_perror("Can't set next inotify wd"); return -1; } } while (1) { int wd; wd = inotify_add_watch(inotify_fd, path, mask); if (wd < 0) { pr_perror("Can't add watch for 0x%x with 0x%x", inotify_fd, iwe->wd); break; } else if (wd == iwe->wd) { ret = 0; break; } else if (wd > iwe->wd) { pr_err("Unsorted watch 0x%x found for 0x%x with 0x%x\n", wd, inotify_fd, iwe->wd); break; } if (kdat.has_inotify_setnextwd) return -1; inotify_rm_watch(inotify_fd, wd); } err: close_safe(&target); return ret; } static int restore_one_fanotify(int fd, struct fsnotify_mark_info *mark) { FanotifyMarkEntry *fme = mark->fme; 
unsigned int flags = FAN_MARK_ADD; int ret = -1, target = -1; char buf[PSFDS], *path = NULL; if (fme->type == MARK_TYPE__MOUNT) { struct mount_info *m; int mntns_root; char *p = fme->me->path; struct ns_id *nsid = NULL; if (root_ns_mask & CLONE_NEWNS) { m = lookup_mnt_id(fme->me->mnt_id); if (!m) { pr_err("Can't find mount mnt_id 0x%x\n", fme->me->mnt_id); return -1; } nsid = m->nsid; p = m->ns_mountpoint; } mntns_root = mntns_get_root_fd(nsid); target = openat(mntns_root, p, O_PATH); if (target == -1) { pr_perror("Unable to open %s", p); goto err; } flags |= FAN_MARK_MOUNT; snprintf(buf, sizeof(buf), "/proc/self/fd/%d", target); path = buf; } else if (fme->type == MARK_TYPE__INODE) { path = get_mark_path("fanotify", mark->remap, fme->ie->f_handle, fme->ie->i_ino, fme->s_dev, buf, &target); if (!path) goto err; } else { pr_err("Bad fsnotify mark type 0x%x\n", fme->type); goto err; } flags |= fme->mflags; if (mark->fme->mask) { ret = fanotify_mark(fd, flags, fme->mask, AT_FDCWD, path); if (ret) { pr_err("Adding fanotify mask 0x%x on 0x%x/%s failed (%d)\n", fme->mask, fme->id, path, ret); goto err; } } if (fme->ignored_mask) { ret = fanotify_mark(fd, flags | FAN_MARK_IGNORED_MASK, fme->ignored_mask, AT_FDCWD, path); if (ret) { pr_err("Adding fanotify ignored-mask 0x%x on 0x%x/%s failed (%d)\n", fme->ignored_mask, fme->id, path, ret); goto err; } } err: close_safe(&target); return ret; } static int open_inotify_fd(struct file_desc *d, int *new_fd) { struct fsnotify_file_info *info; struct fsnotify_mark_info *wd_info; int tmp; info = container_of(d, struct fsnotify_file_info, d); tmp = inotify_init1(info->ife->flags); if (tmp < 0) { pr_perror("Can't create inotify for %#08x", info->ife->id); return -1; } list_for_each_entry(wd_info, &info->marks, list) { pr_info("\tRestore 0x%x wd for %#08x\n", wd_info->iwe->wd, wd_info->iwe->id); if (restore_one_inotify(tmp, wd_info)) { close_safe(&tmp); return -1; } pr_info("\t 0x%x wd for %#08x is restored\n", wd_info->iwe->wd, 
wd_info->iwe->id); } if (restore_fown(tmp, info->ife->fown)) close_safe(&tmp); *new_fd = tmp; return 0; } static int open_fanotify_fd(struct file_desc *d, int *new_fd) { struct fsnotify_file_info *info; struct fsnotify_mark_info *mark; unsigned int flags = 0; int ret; info = container_of(d, struct fsnotify_file_info, d); flags = info->ffe->faflags; if (info->ffe->flags & O_CLOEXEC) flags |= FAN_CLOEXEC; if (info->ffe->flags & O_NONBLOCK) flags |= FAN_NONBLOCK; ret = fanotify_init(flags, info->ffe->evflags); if (ret < 0) { pr_perror("Can't init fanotify mark (%d)", ret); return -1; } list_for_each_entry(mark, &info->marks, list) { pr_info("\tRestore fanotify for %#08x\n", mark->fme->id); if (restore_one_fanotify(ret, mark)) { close_safe(&ret); return -1; } } if (restore_fown(ret, info->ffe->fown)) close_safe(&ret); *new_fd = ret; return 0; } static struct file_desc_ops inotify_desc_ops = { .type = FD_TYPES__INOTIFY, .open = open_inotify_fd, }; static struct file_desc_ops fanotify_desc_ops = { .type = FD_TYPES__FANOTIFY, .open = open_fanotify_fd, }; static int inotify_resolve_remap(struct pprep_head *ph) { struct fsnotify_mark_info *m; m = container_of(ph, struct fsnotify_mark_info, prep); m->remap = lookup_ghost_remap(m->iwe->s_dev, m->iwe->i_ino); return 0; } static int fanotify_resolve_remap(struct pprep_head *ph) { struct fsnotify_mark_info *m; m = container_of(ph, struct fsnotify_mark_info, prep); m->remap = lookup_ghost_remap(m->fme->s_dev, m->fme->ie->i_ino); return 0; } static int __collect_inotify_mark(struct fsnotify_file_info *p, struct fsnotify_mark_info *mark) { struct fsnotify_mark_info *m; /* * We should put marks in wd ascending order. See comment * in restore_one_inotify() for explanation. 
*/ list_for_each_entry(m, &p->marks, list) if (m->iwe->wd > mark->iwe->wd) break; list_add_tail(&mark->list, &m->list); mark->prep.actor = inotify_resolve_remap; add_post_prepare_cb(&mark->prep); return 0; } static int __collect_fanotify_mark(struct fsnotify_file_info *p, struct fsnotify_mark_info *mark) { list_add(&mark->list, &p->marks); if (mark->fme->type == MARK_TYPE__INODE) { mark->prep.actor = fanotify_resolve_remap; add_post_prepare_cb(&mark->prep); } return 0; } static int collect_one_inotify(void *o, ProtobufCMessage *msg, struct cr_img *img) { struct fsnotify_file_info *info = o; int i; info->ife = pb_msg(msg, InotifyFileEntry); INIT_LIST_HEAD(&info->marks); pr_info("Collected id %#08x flags %#08x\n", info->ife->id, info->ife->flags); for (i = 0; i < info->ife->n_wd; i++) { struct fsnotify_mark_info *mark; mark = xmalloc(sizeof(*mark)); if (!mark) return -1; mark->iwe = info->ife->wd[i]; INIT_LIST_HEAD(&mark->list); mark->remap = NULL; if (__collect_inotify_mark(info, mark)) return -1; } return file_desc_add(&info->d, info->ife->id, &inotify_desc_ops); } struct collect_image_info inotify_cinfo = { .fd_type = CR_FD_INOTIFY_FILE, .pb_type = PB_INOTIFY_FILE, .priv_size = sizeof(struct fsnotify_file_info), .collect = collect_one_inotify, }; static int collect_one_fanotify(void *o, ProtobufCMessage *msg, struct cr_img *img) { struct fsnotify_file_info *info = o; int i; info->ffe = pb_msg(msg, FanotifyFileEntry); INIT_LIST_HEAD(&info->marks); pr_info("Collected id %#08x flags %#08x\n", info->ffe->id, info->ffe->flags); for (i = 0; i < info->ffe->n_mark; i++) { struct fsnotify_mark_info *mark; mark = xmalloc(sizeof(*mark)); if (!mark) return -1; mark->fme = info->ffe->mark[i]; INIT_LIST_HEAD(&mark->list); mark->remap = NULL; if (__collect_fanotify_mark(info, mark)) return -1; } return file_desc_add(&info->d, info->ffe->id, &fanotify_desc_ops); } struct collect_image_info fanotify_cinfo = { .fd_type = CR_FD_FANOTIFY_FILE, .pb_type = PB_FANOTIFY_FILE, .priv_size 
= sizeof(struct fsnotify_file_info), .collect = collect_one_fanotify, }; static int collect_one_inotify_mark(void *o, ProtobufCMessage *msg, struct cr_img *i) { struct fsnotify_mark_info *mark = o; struct file_desc *d; if (!deprecated_ok("separate images for fsnotify marks")) return -1; mark->iwe = pb_msg(msg, InotifyWdEntry); INIT_LIST_HEAD(&mark->list); mark->remap = NULL; /* * The kernel prior 4.3 might export internal event * mask bits which are not part of user-space API. It * is fixed in kernel but we have to keep backward * compatibility with old images. So mask out * inappropriate bits (in particular fdinfo might * have FS_EVENT_ON_CHILD bit set). */ mark->iwe->mask &= ~KERNEL_FS_EVENT_ON_CHILD; d = find_file_desc_raw(FD_TYPES__INOTIFY, mark->iwe->id); if (!d) { pr_err("Can't find inotify with id %#08x\n", mark->iwe->id); return -1; } return __collect_inotify_mark(container_of(d, struct fsnotify_file_info, d), mark); } struct collect_image_info inotify_mark_cinfo = { .fd_type = CR_FD_INOTIFY_WD, .pb_type = PB_INOTIFY_WD, .priv_size = sizeof(struct fsnotify_mark_info), .collect = collect_one_inotify_mark, }; static int collect_one_fanotify_mark(void *o, ProtobufCMessage *msg, struct cr_img *i) { struct fsnotify_mark_info *mark = o; struct file_desc *d; if (!deprecated_ok("separate images for fsnotify marks")) return -1; mark->fme = pb_msg(msg, FanotifyMarkEntry); INIT_LIST_HEAD(&mark->list); mark->remap = NULL; d = find_file_desc_raw(FD_TYPES__FANOTIFY, mark->fme->id); if (!d) { pr_err("Can't find fanotify with id %#08x\n", mark->fme->id); return -1; } return __collect_fanotify_mark(container_of(d, struct fsnotify_file_info, d), mark); } struct collect_image_info fanotify_mark_cinfo = { .fd_type = CR_FD_FANOTIFY_MARK, .pb_type = PB_FANOTIFY_MARK, .priv_size = sizeof(struct fsnotify_mark_info), .collect = collect_one_fanotify_mark, }; crac-criu-1.5.0/criu/hugetlb.c000066400000000000000000000031371471504326700161420ustar00rootroot00000000000000#include 
"hugetlb.h" #include "kerndat.h" #include "sizes.h" // clang-format off struct htlb_info hugetlb_info[HUGETLB_MAX] = { [HUGETLB_16KB] = { SZ_16K, MAP_HUGETLB_16KB }, [HUGETLB_64KB] = { SZ_64K, MAP_HUGETLB_64KB }, [HUGETLB_512KB] = { SZ_512K, MAP_HUGETLB_512KB }, [HUGETLB_1MB] = { SZ_1M, MAP_HUGETLB_1MB }, [HUGETLB_2MB] = { SZ_2M, MAP_HUGETLB_2MB }, [HUGETLB_8MB] = { SZ_8M, MAP_HUGETLB_8MB }, [HUGETLB_16MB] = { SZ_16M, MAP_HUGETLB_16MB }, [HUGETLB_32MB] = { SZ_32M, MAP_HUGETLB_32MB }, [HUGETLB_256MB] = { SZ_256M, MAP_HUGETLB_256MB }, [HUGETLB_512MB] = { SZ_512M, MAP_HUGETLB_512MB }, [HUGETLB_1GB] = { SZ_1G, MAP_HUGETLB_1GB }, [HUGETLB_2GB] = { SZ_2G, MAP_HUGETLB_2GB }, [HUGETLB_16GB] = { SZ_16G, MAP_HUGETLB_16GB }, }; // clang-format on int is_hugetlb_dev(dev_t dev, int *hugetlb_size_flag) { int i; for (i = 0; i < HUGETLB_MAX; i++) { if (kdat.hugetlb_dev[i] == dev) { if (hugetlb_size_flag) *hugetlb_size_flag = hugetlb_info[i].flag; return 1; } } return 0; } int can_dump_with_memfd_hugetlb(dev_t dev, int *hugetlb_size_flag, const char *file_path, struct vma_area *vma) { /* * Dump the hugetlb backed mapping using memfd_hugetlb when it is not * anonymous private mapping. */ if (kdat.has_memfd_hugetlb && is_hugetlb_dev(dev, hugetlb_size_flag) && !((vma->e->flags & MAP_PRIVATE) && !strncmp(file_path, ANON_HUGEPAGE_PREFIX, ANON_HUGEPAGE_PREFIX_LEN))) return 1; return 0; } unsigned long get_size_from_hugetlb_flag(int flag) { int i; for (i = 0; i < HUGETLB_MAX; i++) if (flag == hugetlb_info[i].flag) return hugetlb_info[i].size; return -1; } crac-criu-1.5.0/criu/image-desc.c000066400000000000000000000077621471504326700165160ustar00rootroot00000000000000#include #include "image-desc.h" #include "magic.h" #include "image.h" /* * The cr fd set is the set of files where the information * about dumped processes is stored. Each file carries some * small portion of info about the whole picture, see below * for more details. 
*/ #define FD_ENTRY(_name, _fmt) \ [CR_FD_##_name] = { \ .fmt = _fmt ".img", \ .magic = _name##_MAGIC, \ } #define FD_ENTRY_F(_name, _fmt, _f) \ [CR_FD_##_name] = { \ .fmt = _fmt ".img", \ .magic = _name##_MAGIC, \ .oflags = _f, \ } struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY(INVENTORY, "inventory"), FD_ENTRY(FDINFO, "fdinfo-%u"), FD_ENTRY(PAGEMAP, "pagemap-%lu"), FD_ENTRY(SHMEM_PAGEMAP, "pagemap-shmem-%lu"), FD_ENTRY(REG_FILES, "reg-files"), FD_ENTRY(EXT_FILES, "ext-files"), FD_ENTRY(NS_FILES, "ns-files"), FD_ENTRY(EVENTFD_FILE, "eventfd"), FD_ENTRY(EVENTPOLL_FILE,"eventpoll"), FD_ENTRY(EVENTPOLL_TFD, "eventpoll-tfd"), FD_ENTRY(SIGNALFD, "signalfd"), FD_ENTRY(INOTIFY_FILE, "inotify"), FD_ENTRY(INOTIFY_WD, "inotify-wd"), FD_ENTRY(FANOTIFY_FILE, "fanotify"), FD_ENTRY(FANOTIFY_MARK, "fanotify-mark"), FD_ENTRY(CORE, "core-%u"), FD_ENTRY(IDS, "ids-%u"), FD_ENTRY(MM, "mm-%u"), FD_ENTRY(VMAS, "vmas-%u"), FD_ENTRY(PIPES, "pipes"), FD_ENTRY_F(PIPES_DATA, "pipes-data", O_NOBUF), /* splices data */ FD_ENTRY(FIFO, "fifo"), FD_ENTRY_F(FIFO_DATA, "fifo-data", O_NOBUF), /* the same */ FD_ENTRY(PSTREE, "pstree"), FD_ENTRY(SIGACT, "sigacts-%u"), FD_ENTRY(UNIXSK, "unixsk"), FD_ENTRY(INETSK, "inetsk"), FD_ENTRY(PACKETSK, "packetsk"), FD_ENTRY(NETLINK_SK, "netlinksk"), FD_ENTRY_F(SK_QUEUES, "sk-queues", O_NOBUF), /* lseeks the image */ FD_ENTRY(ITIMERS, "itimers-%u"), FD_ENTRY(POSIX_TIMERS, "posix-timers-%u"), FD_ENTRY(CREDS, "creds-%u"), FD_ENTRY(UTSNS, "utsns-%u"), FD_ENTRY(IPC_VAR, "ipcns-var-%u"), FD_ENTRY_F(IPCNS_SHM, "ipcns-shm-%u", O_NOBUF), /* writes segments of data */ FD_ENTRY(IPCNS_MSG, "ipcns-msg-%u"), FD_ENTRY(IPCNS_SEM, "ipcns-sem-%u"), FD_ENTRY(FS, "fs-%u"), FD_ENTRY(REMAP_FPATH, "remap-fpath"), FD_ENTRY_F(GHOST_FILE, "ghost-file-%x", O_NOBUF), FD_ENTRY_F(MEMFD_INODE, "memfd", O_NOBUF), FD_ENTRY(TCP_STREAM, "tcp-stream-%x"), FD_ENTRY(MNTS, "mountpoints-%u"), FD_ENTRY(NETDEV, "netdev-%u"), FD_ENTRY(NETNS, "netns-%u"), FD_ENTRY_F(IFADDR, "ifaddr-%u", 
O_NOBUF), FD_ENTRY_F(ROUTE, "route-%u", O_NOBUF), FD_ENTRY_F(ROUTE6, "route6-%u", O_NOBUF), FD_ENTRY_F(RULE, "rule-%u", O_NOBUF), FD_ENTRY_F(IPTABLES, "iptables-%u", O_NOBUF), FD_ENTRY_F(IP6TABLES, "ip6tables-%u", O_NOBUF), FD_ENTRY_F(NFTABLES, "nftables-%u", O_NOBUF), FD_ENTRY_F(TMPFS_IMG, "tmpfs-%u.tar.gz", O_NOBUF), FD_ENTRY_F(TMPFS_DEV, "tmpfs-dev-%u.tar.gz", O_NOBUF), FD_ENTRY_F(AUTOFS, "autofs-%u", O_NOBUF), FD_ENTRY(BINFMT_MISC_OLD, "binfmt-misc-%u"), FD_ENTRY(BINFMT_MISC, "binfmt-misc"), FD_ENTRY(TTY_FILES, "tty"), FD_ENTRY(TTY_INFO, "tty-info"), FD_ENTRY_F(TTY_DATA, "tty-data", O_NOBUF), FD_ENTRY(FILE_LOCKS, "filelocks"), FD_ENTRY(RLIMIT, "rlimit-%u"), FD_ENTRY_F(PAGES, "pages-%u", O_NOBUF), FD_ENTRY_F(PAGES_COMP, "pages-%u.comp", O_NOBUF), FD_ENTRY_F(PAGES_OLD, "pages-%d", O_NOBUF), FD_ENTRY_F(SHM_PAGES_OLD, "pages-shmem-%ld", O_NOBUF), FD_ENTRY(SIGNAL, "signal-s-%u"), FD_ENTRY(PSIGNAL, "signal-p-%u"), FD_ENTRY(TUNFILE, "tunfile"), FD_ENTRY(CGROUP, "cgroup"), FD_ENTRY(TIMERFD, "timerfd"), FD_ENTRY(CPUINFO, "cpuinfo"), FD_ENTRY(SECCOMP, "seccomp"), FD_ENTRY(USERNS, "userns-%u"), FD_ENTRY(NETNF_CT, "netns-ct-%u"), FD_ENTRY(NETNF_EXP, "netns-exp-%u"), FD_ENTRY(FILES, "files"), FD_ENTRY(TIMENS, "timens-%u"), FD_ENTRY(PIDNS, "pidns-%u"), FD_ENTRY_F(BPFMAP_FILE, "bpfmap-file", O_NOBUF), FD_ENTRY_F(BPFMAP_DATA, "bpfmap-data", O_NOBUF), FD_ENTRY(APPARMOR, "apparmor"), [CR_FD_STATS] = { .fmt = "stats-%s", .magic = STATS_MAGIC, .oflags = O_SERVICE | O_FORCE_LOCAL, }, [CR_FD_IRMAP_CACHE] = { .fmt = "irmap-cache", .magic = IRMAP_CACHE_MAGIC, .oflags = O_SERVICE | O_FORCE_LOCAL, }, }; crac-criu-1.5.0/criu/image.c000066400000000000000000000377171471504326700156050ustar00rootroot00000000000000#include #include #include #include #include #include #include "crtools.h" #include "cr_options.h" #include "imgset.h" #include "image.h" #include "pstree.h" #include "stats.h" #include "cgroup.h" #include "lsm.h" #include "protobuf.h" #include "xmalloc.h" #include 
"images/inventory.pb-c.h" #include "images/pagemap.pb-c.h" #include "proc_parse.h" #include "img-streamer.h" #include "namespaces.h" #include "pages-compress.h" bool ns_per_id = false; bool img_common_magic = true; TaskKobjIdsEntry *root_ids; u32 root_cg_set; Lsmtype image_lsm; int check_img_inventory(bool restore) { int ret = -1; struct cr_img *img; InventoryEntry *he; img = open_image(CR_FD_INVENTORY, O_RSTR); if (!img) return -1; if (pb_read_one(img, &he, PB_INVENTORY) < 0) goto out_close; if (!he->has_fdinfo_per_id || !he->fdinfo_per_id) { pr_err("Too old image, no longer supported\n"); goto out_close; } ns_per_id = he->has_ns_per_id ? he->ns_per_id : false; if (he->root_ids) { root_ids = xmalloc(sizeof(*root_ids)); if (!root_ids) goto out_err; memcpy(root_ids, he->root_ids, sizeof(*root_ids)); } if (he->has_root_cg_set) { if (he->root_cg_set == 0) { pr_err("Corrupted root cgset\n"); goto out_err; } root_cg_set = he->root_cg_set; } if (he->has_lsmtype) image_lsm = he->lsmtype; else image_lsm = LSMTYPE__NO_LSM; switch (he->img_version) { case CRTOOLS_IMAGES_V1: /* good old images. OK */ img_common_magic = false; break; case CRTOOLS_IMAGES_V1_1: /* newer images with extra magic in the head */ break; default: pr_err("Not supported images version %u\n", he->img_version); goto out_err; } if (restore && he->tcp_close && !opts.tcp_close) { pr_err("Need to set the --tcp-close options.\n"); goto out_err; } if (restore) { if (!he->has_network_lock_method) { /* * Image files were generated with an older version of CRIU * so we should fall back to iptables because this is the * network-lock mechanism used in older versions. 
*/ pr_info("Network lock method not found in inventory image\n"); pr_info("Falling back to iptables network lock method\n"); opts.network_lock_method = NETWORK_LOCK_IPTABLES; } else { opts.network_lock_method = he->network_lock_method; } } ret = 0; out_err: inventory_entry__free_unpacked(he, NULL); out_close: close_image(img); return ret; } int write_img_inventory(InventoryEntry *he) { struct cr_img *img; int ret; pr_info("Writing image inventory (version %u)\n", CRTOOLS_IMAGES_V1); img = open_image(CR_FD_INVENTORY, O_DUMP); if (!img) return -1; ret = pb_write_one(img, he, PB_INVENTORY); xfree(he->root_ids); close_image(img); if (ret < 0) return -1; return 0; } int inventory_save_uptime(InventoryEntry *he) { if (!opts.track_mem) return 0; /* * dump_uptime is used to detect whether a process was handled * before or it is a new process with the same pid. */ if (parse_uptime(&he->dump_uptime)) return -1; he->has_dump_uptime = true; return 0; } /* * This function is intended to get an inventory image from previous (parent) * dump iteration. We use dump_uptime from the image in detect_pid_reuse(). * * You see that these function never fails by itself, it only prints warnings * to better understand reasons why we don't found a proper image, failing here * is too early. We get to detect_pid_reuse() only if we have a parent pagemap * and that's the proper place to fail: we know that there is a parent pagemap * but we don't have (can't access, etc) parent inventory => can't detect * pid-reuse => fail. */ InventoryEntry *get_parent_inventory(void) { struct cr_img *img; InventoryEntry *ie; int dir; if (open_parent(get_service_fd(IMG_FD_OFF), &dir)) { /* * We print the warning below to be notified that we had some * unexpected problem on open. For instance we have a parent * directory but have no access. Having no parent inventory * when also having no parent directory is an expected case of * first dump iteration. 
*/ pr_warn("Failed to open parent directory\n"); return NULL; } if (dir < 0) return NULL; img = open_image_at(dir, CR_FD_INVENTORY, O_RSTR); if (!img) { pr_warn("Failed to open parent pre-dump inventory image\n"); close(dir); return NULL; } if (pb_read_one(img, &ie, PB_INVENTORY) < 0) { pr_warn("Failed to read parent pre-dump inventory entry\n"); close_image(img); close(dir); return NULL; } if (!ie->has_dump_uptime) { pr_warn("Parent pre-dump inventory has no uptime\n"); inventory_entry__free_unpacked(ie, NULL); ie = NULL; } close_image(img); close(dir); return ie; } int prepare_inventory(InventoryEntry *he) { struct pid pid; struct { struct pstree_item i; struct dmp_info d; } crt = { .i.pid = &pid }; pr_info("Preparing image inventory (version %u)\n", CRTOOLS_IMAGES_V1); he->img_version = CRTOOLS_IMAGES_V1_1; he->fdinfo_per_id = true; he->has_fdinfo_per_id = true; he->ns_per_id = true; he->has_ns_per_id = true; he->has_lsmtype = true; he->lsmtype = host_lsm_type(); crt.i.pid->state = TASK_ALIVE; crt.i.pid->real = getpid(); if (get_task_ids(&crt.i)) return -1; if (!opts.unprivileged) he->has_root_cg_set = true; if (dump_thread_cgroup(NULL, &he->root_cg_set, NULL, -1)) return -1; he->root_ids = crt.i.ids; /* tcp_close has to be set on restore if it has been set on dump. 
*/ if (opts.tcp_close) { he->tcp_close = true; he->has_tcp_close = true; } /* Save network lock method to reuse in restore */ he->has_network_lock_method = true; he->network_lock_method = opts.network_lock_method; return 0; } static struct cr_imgset *alloc_cr_imgset(int nr) { struct cr_imgset *cr_imgset; unsigned int i; cr_imgset = xmalloc(sizeof(*cr_imgset)); if (cr_imgset == NULL) return NULL; cr_imgset->_imgs = xmalloc(nr * sizeof(struct cr_img *)); if (cr_imgset->_imgs == NULL) { xfree(cr_imgset); return NULL; } for (i = 0; i < nr; i++) cr_imgset->_imgs[i] = NULL; cr_imgset->fd_nr = nr; return cr_imgset; } static void __close_cr_imgset(struct cr_imgset *cr_imgset) { unsigned int i; if (!cr_imgset) return; for (i = 0; i < cr_imgset->fd_nr; i++) { if (!cr_imgset->_imgs[i]) continue; close_image(cr_imgset->_imgs[i]); cr_imgset->_imgs[i] = NULL; } } void close_cr_imgset(struct cr_imgset **cr_imgset) { if (!cr_imgset || !*cr_imgset) return; __close_cr_imgset(*cr_imgset); xfree((*cr_imgset)->_imgs); xfree(*cr_imgset); *cr_imgset = NULL; } struct cr_imgset *cr_imgset_open_range(int pid, int from, int to, unsigned long flags) { struct cr_imgset *imgset; unsigned int i; imgset = alloc_cr_imgset(to - from); if (!imgset) goto err; from++; imgset->fd_off = from; for (i = from; i < to; i++) { struct cr_img *img; img = open_image(i, flags, pid); if (!img) { if (!(flags & O_CREAT)) /* caller should check himself */ continue; goto err; } imgset->_imgs[i - from] = img; } return imgset; err: close_cr_imgset(&imgset); return NULL; } struct cr_imgset *cr_task_imgset_open(int pid, int mode) { return cr_imgset_open(pid, TASK, mode); } struct cr_imgset *cr_glob_imgset_open(int mode) { return cr_imgset_open(-1 /* ignored */, GLOB, mode); } static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long flags, char *path); struct cr_img *open_image_at(int dfd, int type, unsigned long flags, ...) 
{ struct cr_img *img; unsigned long oflags; char path[PATH_MAX]; va_list args; bool lazy = false; if (dfd == -1) { dfd = get_service_fd(IMG_FD_OFF); lazy = (flags & O_CREAT); } img = xmalloc(sizeof(*img)); if (!img) return NULL; oflags = flags | imgset_template[type].oflags; va_start(args, flags); vsnprintf(path, PATH_MAX, imgset_template[type].fmt, args); va_end(args); if (lazy) { img->fd = LAZY_IMG_FD; img->type = type; img->oflags = oflags; img->path = xstrdup(path); return img; } else { img->fd = EMPTY_IMG_FD; img->type = type; } if (do_open_image(img, dfd, type, oflags, path)) { close_image(img); return NULL; } return img; } static inline u32 head_magic(int oflags) { return oflags & O_SERVICE ? IMG_SERVICE_MAGIC : IMG_COMMON_MAGIC; } static int img_check_magic(struct cr_img *img, int oflags, int type, char *path) { u32 magic; if (read_img(img, &magic) < 0) return -1; if (img_common_magic && (type != CR_FD_INVENTORY)) { if (magic != head_magic(oflags)) { pr_err("Head magic doesn't match for %s\n", path); return -1; } if (read_img(img, &magic) < 0) return -1; } if (magic != imgset_template[type].magic) { pr_err("Magic doesn't match for %s\n", path); return -1; } return 0; } static int img_write_magic(struct cr_img *img, int oflags, int type) { if (img_common_magic && (type != CR_FD_INVENTORY)) { u32 cmagic; cmagic = head_magic(oflags); if (write_img(img, &cmagic)) return -1; } return write_img(img, &imgset_template[type].magic); } struct openat_args { char path[PATH_MAX]; int flags; int err; int mode; }; static int userns_openat(void *arg, int dfd, int pid) { struct openat_args *pa = (struct openat_args *)arg; int ret; ret = openat(dfd, pa->path, pa->flags, pa->mode); if (ret < 0) pa->err = errno; return ret; } static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long oflags, char *path) { int ret, flags; flags = oflags & ~(O_NOBUF | O_SERVICE | O_FORCE_LOCAL); if (opts.stream && !(oflags & O_FORCE_LOCAL)) { ret = img_streamer_open(path, 
flags); errno = EIO; /* errno value is meaningless, only the ret value is meaningful */ } else if (root_ns_mask & CLONE_NEWUSER && type == CR_FD_PAGES && oflags & O_RDWR) { /* * For pages images dedup we need to open images read-write on * restore, that may require proper capabilities, so we ask * usernsd to do it for us */ struct openat_args pa = { .flags = flags, .err = 0, .mode = CR_FD_PERM, }; snprintf(pa.path, PATH_MAX, "%s", path); ret = userns_call(userns_openat, UNS_FDOUT, &pa, sizeof(struct openat_args), dfd); if (ret < 0) errno = pa.err; } else if (CR_FD_PAGES_COMP == type && flags == O_RDONLY) { pr_debug("Wait for decompression thread, and replace file descriptor... (%s)\n", path); ret = decompression_get_fd(); } else ret = openat(dfd, path, flags, CR_FD_PERM); if (ret < 0) { if (!(flags & O_CREAT) && (errno == ENOENT || ret == -ENOENT)) { pr_info("No %s image\n", path); img->_x.fd = EMPTY_IMG_FD; goto skip_magic; } if (type == CR_FD_STATS) { pr_warn("Unable to open %s", path); } else { pr_perror("Unable to open %s", path); } goto err; } img->_x.fd = ret; if (oflags & O_NOBUF) bfd_setraw(&img->_x); else { if (flags == O_RDONLY) ret = bfdopenr(&img->_x); else ret = bfdopenw(&img->_x); if (ret) goto err; } if (imgset_template[type].magic == RAW_IMAGE_MAGIC) goto skip_magic; if (flags == O_RDONLY) ret = img_check_magic(img, oflags, type, path); else ret = img_write_magic(img, oflags, type); if (ret) goto err; skip_magic: return 0; err: return -1; } int open_image_lazy(struct cr_img *img) { int dfd; char *path = img->path; img->path = NULL; dfd = get_service_fd(IMG_FD_OFF); if (do_open_image(img, dfd, img->type, img->oflags, path)) { xfree(path); return -1; } xfree(path); return 0; } void close_image(struct cr_img *img) { if (lazy_image(img)) { /* * Remove the image file if it's there so that * subsequent restore doesn't read wrong or fake * data from it. 
*/ unlinkat(get_service_fd(IMG_FD_OFF), img->path, 0); xfree(img->path); } else if (!empty_image(img)) bclose(&img->_x); xfree(img); } struct cr_img *img_from_fd(int fd) { struct cr_img *img; img = xmalloc(sizeof(*img)); if (img) { img->_x.fd = fd; bfd_setraw(&img->_x); } return img; } /* * `mode` should be O_RSTR or O_DUMP depending on the intent. * This is used when opts.stream is enabled for picking the right streamer * socket name. `mode` is ignored when opts.stream is not enabled. */ int open_image_dir(char *dir, int mode) { int fd, ret; fd = open(dir, O_RDONLY); if (fd < 0) { pr_perror("Can't open dir %s", dir); return -1; } ret = install_service_fd(IMG_FD_OFF, fd); if (ret < 0) { pr_err("install_service_fd failed.\n"); return -1; } fd = ret; if (opts.stream) { if (img_streamer_init(dir, mode) < 0) goto err; } else if (opts.img_parent) { if (faccessat(fd, opts.img_parent, R_OK, 0)) { pr_perror("Invalid parent image directory provided"); goto err; } ret = symlinkat(opts.img_parent, fd, CR_PARENT_LINK); if (ret < 0 && errno != EEXIST) { pr_perror("Can't link parent snapshot"); goto err; } if (opts.img_parent[0] == '/') pr_warn("Absolute paths for parent links " "may not work on restore!\n"); } return 0; err: close_image_dir(); return -1; } void close_image_dir(void) { if (opts.stream) img_streamer_finish(); close_service_fd(IMG_FD_OFF); } int open_parent(int dfd, int *pfd) { struct stat st; *pfd = -1; /* Check if the parent symlink exists */ if (fstatat(dfd, CR_PARENT_LINK, &st, AT_SYMLINK_NOFOLLOW) && errno == ENOENT) { pr_debug("No parent images directory provided\n"); return 0; } *pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY); if (*pfd < 0) { pr_perror("Can't open parent path"); return -1; } return 0; } static unsigned long page_ids = 1; void up_page_ids_base(void) { /* * When page server and criu dump work on * the same dir, the shmem pagemaps and regular * pagemaps may have IDs conflicts. 
Fix this by * making page server produce page images with * higher IDs. */ BUG_ON(page_ids != 1); page_ids += 0x10000; } struct cr_img *open_pages_image_at(int dfd, unsigned long flags, struct cr_img *pmi, u32 *id) { if (flags == O_RDONLY || flags == O_RDWR) { PagemapHead *h; if (pb_read_one(pmi, &h, PB_PAGEMAP_HEAD) < 0) return NULL; *id = h->pages_id; pagemap_head__free_unpacked(h, NULL); } else { PagemapHead h = PAGEMAP_HEAD__INIT; *id = h.pages_id = page_ids++; if (pb_write_one(pmi, &h, PB_PAGEMAP_HEAD) < 0) return NULL; } if (opts.compress && flags == O_RDONLY) { return open_image_at(dfd, CR_FD_PAGES_COMP, flags, *id); } return open_image_at(dfd, CR_FD_PAGES, flags, *id); } struct cr_img *open_pages_image(unsigned long flags, struct cr_img *pmi, u32 *id) { return open_pages_image_at(get_service_fd(IMG_FD_OFF), flags, pmi, id); } /* * Write buffer @ptr of @size bytes into @fd file * Returns * 0 on success * -1 on error (error message is printed) */ int write_img_buf(struct cr_img *img, const void *ptr, int size) { int ret; ret = bwrite(&img->_x, ptr, size); if (ret == size) return 0; if (ret < 0) pr_perror("Can't write img file"); else pr_err("Img trimmed %d/%d\n", ret, size); return -1; } /* * Read buffer @ptr of @size bytes from @fd file * Returns * 1 on success * 0 on EOF (silently) * -1 on error (error message is printed) */ int read_img_buf_eof(struct cr_img *img, void *ptr, int size) { int ret; ret = bread(&img->_x, ptr, size); if (ret == size) return 1; if (ret == 0) return 0; if (ret < 0) pr_perror("Can't read img file"); else pr_err("Img trimmed %d/%d\n", ret, size); return -1; } /* * Read buffer @ptr of @size bytes from @fd file * Returns * 1 on success * -1 on error or EOF (error message is printed) */ int read_img_buf(struct cr_img *img, void *ptr, int size) { int ret; ret = read_img_buf_eof(img, ptr, size); if (ret == 0) { pr_err("Unexpected EOF\n"); ret = -1; } return ret; } /* * read_img_str -- same as read_img_buf, but allocates memory for * the 
buffer and puts the '\0' at the end */ int read_img_str(struct cr_img *img, char **pstr, int size) { int ret; char *str; str = xmalloc(size + 1); if (!str) return -1; ret = read_img_buf(img, str, size); if (ret < 0) { xfree(str); return -1; } str[size] = '\0'; *pstr = str; return 0; } off_t img_raw_size(struct cr_img *img) { struct stat stat; if (fstat(img->_x.fd, &stat)) { pr_perror("Failed to get image stats"); return -1; } return stat.st_size; } crac-criu-1.5.0/criu/img-streamer.c000066400000000000000000000141411471504326700171010ustar00rootroot00000000000000#include #include #include #include #include "cr_options.h" #include "img-streamer.h" #include "image.h" #include "images/img-streamer.pb-c.h" #include "protobuf.h" #include "servicefd.h" #include "rst-malloc.h" #include "common/scm.h" #include "common/lock.h" #include "action-scripts.h" /* * We use different path names for the dump and restore sockets because: * 1) The user may want to perform both at the same time (akin to live * migration). Specifying the same images-dir is convenient. * 2) It fails quickly when the user mix-up the streamer and CRIU operations. * (e.g., streamer is in capture more, while CRIU is in restore mode). */ #define IMG_STREAMER_CAPTURE_SOCKET_NAME "streamer-capture.sock" #define IMG_STREAMER_SERVE_SOCKET_NAME "streamer-serve.sock" /* All requests go through the same socket connection. We must synchronize */ static mutex_t *img_streamer_fd_lock; /* Either O_DUMP or O_RSTR */ static int img_streamer_mode; static const char *socket_name_for_mode(int mode) { switch (mode) { case O_DUMP: return IMG_STREAMER_CAPTURE_SOCKET_NAME; case O_RSTR: return IMG_STREAMER_SERVE_SOCKET_NAME; default: BUG(); return NULL; } } /* * img_streamer_init() connects to the image streamer socket. * mode should be either O_DUMP or O_RSTR. 
*/ int img_streamer_init(const char *image_dir, int mode) { struct sockaddr_un addr; int pre_stream_ret; int sockfd; img_streamer_mode = mode; pre_stream_ret = run_scripts(ACT_PRE_STREAM); if (pre_stream_ret != 0) { pr_err("Pre-stream script failed with %d!\n", pre_stream_ret); return -1; } sockfd = socket(AF_UNIX, SOCK_STREAM, 0); if (sockfd < 0) { pr_perror("Unable to instantiate UNIX socket"); return -1; } memset(&addr, 0, sizeof(addr)); addr.sun_family = AF_UNIX; snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s", image_dir, socket_name_for_mode(mode)); if (connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { pr_perror("Unable to connect to image streamer socket: %s", addr.sun_path); goto err; } img_streamer_fd_lock = shmalloc(sizeof(*img_streamer_fd_lock)); if (!img_streamer_fd_lock) { pr_err("Failed to allocate memory\n"); goto err; } mutex_init(img_streamer_fd_lock); if (install_service_fd(IMG_STREAMER_FD_OFF, sockfd) < 0) return -1; return 0; err: close(sockfd); return -1; } /* * img_streamer_finish() indicates that no more files will be opened. * In other words, img_streamer_open() will no longer be called. */ void img_streamer_finish(void) { if (get_service_fd(IMG_STREAMER_FD_OFF) >= 0) { pr_info("Dismissing the image streamer\n"); close_service_fd(IMG_STREAMER_FD_OFF); } } /* * The regular protobuf APIs pb_write_one() and pb_read_one() operate over a * `struct cr_img` object. Sadly, we don't have such object. We just have a * file descriptor. The following pb_write_one_fd() and pb_read_one_fd() * provide a protobuf API over a file descriptor. The implementation is a bit * of a hack, but should be fine. At some point we can revisit to have a * proper protobuf API over fds. 
*/ static int pb_write_one_fd(int fd, void *obj, int type) { int ret; struct cr_img img; memset(&img, 0, sizeof(img)); img._x.fd = fd; ret = pb_write_one(&img, obj, type); if (ret < 0) pr_perror("Failed to communicate with the image streamer"); return ret; } static int pb_read_one_fd(int fd, void **pobj, int type) { int ret; struct cr_img img; memset(&img, 0, sizeof(img)); img._x.fd = fd; ret = pb_read_one(&img, pobj, type); if (ret < 0) pr_perror("Failed to communicate with the image streamer"); return ret; } static int send_file_request(char *filename) { ImgStreamerRequestEntry req = IMG_STREAMER_REQUEST_ENTRY__INIT; req.filename = filename; return pb_write_one_fd(get_service_fd(IMG_STREAMER_FD_OFF), &req, PB_IMG_STREAMER_REQUEST); } static int recv_file_reply(bool *exists) { ImgStreamerReplyEntry *reply; int ret = pb_read_one_fd(get_service_fd(IMG_STREAMER_FD_OFF), (void **)&reply, PB_IMG_STREAMER_REPLY); if (ret < 0) return ret; *exists = reply->exists; free(reply); return 0; } /* * Using a pipe for image file transfers allows the data to be spliced by the * image streamer, greatly improving performance. * Transfer rates of up to 15GB/s can be seen with this technique. */ #define READ_PIPE 0 /* index of the read pipe returned by pipe() */ #define WRITE_PIPE 1 static int establish_streamer_file_pipe(void) { /* * If the other end of the pipe closes, the kernel will want to kill * us with a SIGPIPE. These signal must be ignored, which we do in * crtools.c:main() with signal(SIGPIPE, SIG_IGN). */ int ret = -1; int criu_pipe_direction = img_streamer_mode == O_DUMP ? 
WRITE_PIPE : READ_PIPE; int streamer_pipe_direction = 1 - criu_pipe_direction; int fds[2]; if (pipe(fds) < 0) { pr_perror("Unable to create pipe"); return -1; } if (send_fd(get_service_fd(IMG_STREAMER_FD_OFF), NULL, 0, fds[streamer_pipe_direction]) < 0) close(fds[criu_pipe_direction]); else ret = fds[criu_pipe_direction]; close(fds[streamer_pipe_direction]); return ret; } static int _img_streamer_open(char *filename) { if (send_file_request(filename) < 0) return -1; if (img_streamer_mode == O_RSTR) { /* The streamer replies whether the file exists */ bool exists; if (recv_file_reply(&exists) < 0) return -1; if (!exists) return -ENOENT; } /* * When the image streamer encounters a fatal error, it won't report * errors via protobufs. Instead, CRIU will get a broken pipe error * when trying to access a streaming pipe. This behavior is similar to * what would happen if we were connecting criu and * criu-image-streamer * via a shell pipe. */ return establish_streamer_file_pipe(); } /* * Opens an image file via a UNIX pipe with the image streamer. * * Return: * A file descriptor on success * -ENOENT when the file was not found. * -1 on any other error. 
*/ int img_streamer_open(char *filename, int flags) { int ret; BUG_ON(flags != img_streamer_mode); mutex_lock(img_streamer_fd_lock); ret = _img_streamer_open(filename); mutex_unlock(img_streamer_fd_lock); return ret; } crac-criu-1.5.0/criu/include/000077500000000000000000000000001471504326700157635ustar00rootroot00000000000000crac-criu-1.5.0/criu/include/action-scripts.h000066400000000000000000000014041471504326700210750ustar00rootroot00000000000000#ifndef __CR_ACTION_SCRIPTS_H__ #define __CR_ACTION_SCRIPTS_H__ #include "asm/int.h" enum script_actions { ACT_PRE_STREAM, ACT_PRE_DUMP, ACT_POST_DUMP, ACT_PRE_RESTORE, ACT_POST_RESTORE, ACT_NET_LOCK, ACT_NET_UNLOCK, ACT_SETUP_NS, ACT_POST_SETUP_NS, ACT_POST_RESUME, ACT_PRE_RESUME, ACT_ORPHAN_PTS_MASTER, ACT_STATUS_READY, ACT_QUERY_EXT_FILES, ACT_MAX }; extern int add_script(char *path); extern int add_rpc_notify(int sk); extern int run_scripts(enum script_actions); extern int rpc_send_fd(enum script_actions, int fd); extern int rpc_query_external_files(void); extern int exec_rpc_query_external_files(char *name, int sk); extern int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd); #endif /* __CR_ACTION_SCRIPTS_H__ */ crac-criu-1.5.0/criu/include/aio.h000066400000000000000000000017361471504326700167130ustar00rootroot00000000000000#ifndef __CR_AIO_H__ #define __CR_AIO_H__ #include "linux/aio_abi.h" #include "images/mm.pb-c.h" unsigned int aio_estimate_nr_reqs(unsigned int size); int dump_aio_ring(MmEntry *mme, struct vma_area *vma); void free_aios(MmEntry *mme); struct parasite_ctl; int parasite_collect_aios(struct parasite_ctl *, struct vm_area_list *); unsigned long aio_rings_args_size(struct vm_area_list *); struct task_restore_args; int prepare_aios(struct pstree_item *t, struct task_restore_args *ta); struct aio_ring { unsigned id; /* kernel internal index number */ unsigned nr; /* number of io_events */ unsigned head; /* Written to by userland or under ring_lock * mutex by 
aio_read_events_ring(). */ unsigned tail; unsigned magic; unsigned compat_features; unsigned incompat_features; unsigned header_length; /* size of aio_ring */ struct io_event io_events[0]; }; struct rst_aio_ring { unsigned long addr; unsigned long len; unsigned int nr_req; }; #endif /* __CR_AIO_H__ */ crac-criu-1.5.0/criu/include/asm-generic/000077500000000000000000000000001471504326700201555ustar00rootroot00000000000000crac-criu-1.5.0/criu/include/asm-generic/int.h000066400000000000000000000004031471504326700211150ustar00rootroot00000000000000#ifndef __CR_INT_H__ #define __CR_INT_H__ #include typedef uint64_t u64; typedef int64_t s64; typedef uint32_t u32; typedef int32_t s32; typedef uint16_t u16; typedef int16_t s16; typedef uint8_t u8; typedef int8_t s8; #endif /* __CR_INT_H__ */ crac-criu-1.5.0/criu/include/asm-generic/vdso.h000066400000000000000000000006241471504326700213030ustar00rootroot00000000000000#ifndef __CR_ASM_GENERIC_VDSO_H__ #define __CR_ASM_GENERIC_VDSO_H__ #define VDSO_PROT (PROT_READ | PROT_EXEC) #define VVAR_PROT (PROT_READ) /* Just in case of LPAE system PFN is u64. 
*/ #define VDSO_BAD_PFN (-1ull) #define VVAR_BAD_PFN (-1ull) #define VDSO_BAD_ADDR (-1ul) #define VVAR_BAD_ADDR (-1ul) #define VDSO_BAD_SIZE (-1ul) #define VVAR_BAD_SIZE (-1ul) #endif /* __CR_ASM_GENERIC_VDSO_H__ */ crac-criu-1.5.0/criu/include/atomic.h000066400000000000000000000001361471504326700174100ustar00rootroot00000000000000#ifndef __CR_INC_ATOMIC_H__ #define __CR_INC_ATOMIC_H__ #include "common/asm/atomic.h" #endif crac-criu-1.5.0/criu/include/autofs.h000066400000000000000000000115061471504326700174400ustar00rootroot00000000000000#ifndef __CR_AUTOFS_H__ #define __CR_AUTOFS_H__ #ifndef AUTOFS_MINOR #define AUTOFS_MINOR 235 #endif #include bool is_autofs_pipe(unsigned long inode); struct mount_info; int autofs_parse(struct mount_info *pm); int autofs_dump(struct mount_info *pm); int autofs_mount(struct mount_info *mi, const char *source, const char *filesystemtype, unsigned long mountflags); #include #include #include #define AUTOFS_DEVICE_NAME "autofs" #define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1 #define AUTOFS_DEV_IOCTL_VERSION_MINOR 0 #define AUTOFS_DEVID_LEN 16 #define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) /* * An ioctl interface for autofs mount point control. */ struct args_protover { __u32 version; }; struct args_protosubver { __u32 sub_version; }; struct args_openmount { __u32 devid; }; struct args_ready { __u32 token; }; struct args_fail { __u32 token; __s32 status; }; struct args_setpipefd { __s32 pipefd; }; struct args_timeout { __u64 timeout; }; struct args_requester { __u32 uid; __u32 gid; }; struct args_expire { __u32 how; }; struct args_askumount { __u32 may_umount; }; struct args_ismountpoint { union { struct args_in { __u32 type; } in; struct args_out { __u32 devid; __u32 magic; } out; }; }; /* * All the ioctls use this structure. * When sending a path size must account for the total length * of the chunk of memory otherwise is is the size of the * structure. 
*/ struct autofs_dev_ioctl { __u32 ver_major; __u32 ver_minor; __u32 size; /* total size of data passed in * including this struct */ __s32 ioctlfd; /* automount command fd */ /* Command parameters */ union { struct args_protover protover; struct args_protosubver protosubver; struct args_openmount openmount; struct args_ready ready; struct args_fail fail; struct args_setpipefd setpipefd; struct args_timeout timeout; struct args_requester requester; struct args_expire expire; struct args_askumount askumount; struct args_ismountpoint ismountpoint; }; char path[0]; }; static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in) { memset(in, 0, sizeof(struct autofs_dev_ioctl)); in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; in->size = sizeof(struct autofs_dev_ioctl); in->ioctlfd = -1; return; } /* * If you change this make sure you make the corresponding change * to autofs-dev-ioctl.c:lookup_ioctl() */ enum { /* Get various version info */ AUTOFS_DEV_IOCTL_VERSION_CMD = 0x71, AUTOFS_DEV_IOCTL_PROTOVER_CMD, AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, /* Open mount ioctl fd */ AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, /* Close mount ioctl fd */ AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, /* Mount/expire status returns */ AUTOFS_DEV_IOCTL_READY_CMD, AUTOFS_DEV_IOCTL_FAIL_CMD, /* Activate/deactivate autofs mount */ AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, AUTOFS_DEV_IOCTL_CATATONIC_CMD, /* Expiry timeout */ AUTOFS_DEV_IOCTL_TIMEOUT_CMD, /* Get mount last requesting uid and gid */ AUTOFS_DEV_IOCTL_REQUESTER_CMD, /* Check for eligible expire candidates */ AUTOFS_DEV_IOCTL_EXPIRE_CMD, /* Request busy status */ AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, /* Check if path is a mountpoint */ AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, }; #define AUTOFS_IOCTL 0x93 #define AUTOFS_DEV_IOCTL_VERSION _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_PROTOVER _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_PROTOVER_CMD, struct autofs_dev_ioctl) 
#define AUTOFS_DEV_IOCTL_PROTOSUBVER _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_OPENMOUNT _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_CLOSEMOUNT _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_READY _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_READY_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_FAIL _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_FAIL_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_SETPIPEFD _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_CATATONIC _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_CATATONIC_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_TIMEOUT _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_TIMEOUT_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_REQUESTER _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_REQUESTER_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_EXPIRE _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_EXPIRE_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_ASKUMOUNT _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_ISMOUNTPOINT _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, struct autofs_dev_ioctl) #endif crac-criu-1.5.0/criu/include/bfd.h000066400000000000000000000014571471504326700166760ustar00rootroot00000000000000#ifndef __CR_BFD_H__ #define __CR_BFD_H__ #include "common/err.h" struct bfd_buf; struct xbuf { char *mem; /* buffer */ char *data; /* position we see bytes at */ unsigned int sz; /* bytes sitting after b->pos */ struct bfd_buf *buf; }; struct bfd { int fd; bool writable; struct xbuf b; }; static inline bool bfd_buffered(struct bfd *b) { return b->b.mem != NULL; } static inline void bfd_setraw(struct bfd *b) { b->b.mem = NULL; } int bfdopenr(struct bfd *f); int bfdopenw(struct bfd *f); void bclose(struct bfd *f); char *breadline(struct bfd *f); 
char *breadchr(struct bfd *f, char c); int bwrite(struct bfd *f, const void *buf, int sz); struct iovec; int bwritev(struct bfd *f, const struct iovec *iov, int cnt); int bread(struct bfd *f, void *buf, int sz); int bfd_flush_images(void); #endif crac-criu-1.5.0/criu/include/bitmap.h000066400000000000000000000003171471504326700174110ustar00rootroot00000000000000#ifndef __CR_BITMAP_H__ #define __CR_BITMAP_H__ extern void bitmap_set(unsigned long *map, int start, int nr); extern void bitmap_clear(unsigned long *map, int start, int nr); #endif /* __CR_BITMAP_H__ */ crac-criu-1.5.0/criu/include/bitops.h000066400000000000000000000001321471504326700174300ustar00rootroot00000000000000#ifndef __CR_INC_BITOPS_H__ #define __CR_INC_BITOPS_H__ #include "common/bitops.h" #endif crac-criu-1.5.0/criu/include/bitsperlong.h000066400000000000000000000001511471504326700204610ustar00rootroot00000000000000#ifndef __CR_INC_BITSPERLONG_H__ #define __CR_INC_BITSPERLONG_H__ #include "common/bitsperlong.h" #endif crac-criu-1.5.0/criu/include/bpfmap.h000066400000000000000000000017061471504326700174050ustar00rootroot00000000000000#ifndef __CR_BPFMAP_H__ #define __CR_BPFMAP_H__ #include "files.h" #include "bpfmap-file.pb-c.h" #include "bpfmap-data.pb-c.h" struct bpfmap_file_info { BpfmapFileEntry *bpfe; struct file_desc d; }; struct bpfmap_data_rst { BpfmapDataEntry *bde; void *data; struct bpfmap_data_rst *next; }; #define BPFMAP_DATA_HASH_BITS 5 #define BPFMAP_DATA_TABLE_SIZE (1 << BPFMAP_DATA_HASH_BITS) #define BPFMAP_DATA_HASH_MASK (BPFMAP_DATA_TABLE_SIZE - 1) extern int is_bpfmap_link(char *link); extern int dump_one_bpfmap_data(BpfmapFileEntry *bpf, int lfd, const struct fd_parms *p); extern int do_collect_bpfmap_data(struct bpfmap_data_rst *, ProtobufCMessage *, struct cr_img *, struct bpfmap_data_rst **); extern int restore_bpfmap_data(int, uint32_t, struct bpfmap_data_rst **); extern const struct fdtype_ops bpfmap_dump_ops; extern struct collect_image_info bpfmap_cinfo; extern struct 
collect_image_info bpfmap_data_cinfo; #endif /* __CR_BPFMAP_H__ */ crac-criu-1.5.0/criu/include/cgroup-props.h000066400000000000000000000007771471504326700206070ustar00rootroot00000000000000#ifndef __CR_CGROUP_PROPS_H__ #define __CR_CGROUP_PROPS_H__ #include typedef struct { const char *name; size_t nr_props; const char **props; } cgp_t; extern cgp_t cgp_global; extern cgp_t cgp_global_v2; extern const cgp_t *cgp_get_props(const char *name); extern bool cgp_should_skip_controller(const char *name); extern bool cgp_add_dump_controller(const char *name); extern int cgp_init(char *stream, size_t len, char *path); extern void cgp_fini(void); #endif /* __CR_CGROUP_PROPS_H__ */ crac-criu-1.5.0/criu/include/cgroup.h000066400000000000000000000046351471504326700174430ustar00rootroot00000000000000#ifndef __CR_CGROUP_H__ #define __CR_CGROUP_H__ #include "int.h" #include "images/core.pb-c.h" struct pstree_item; struct parasite_dump_cgroup_args; extern u32 root_cg_set; int dump_thread_cgroup(const struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args, int id); int dump_cgroups(void); int restore_task_cgroup(struct pstree_item *); int prepare_cgroup_namespace(struct pstree_item *); int prepare_cgroup(void); /* Restore things like cpu_limit in known cgroups. */ int prepare_cgroup_properties(void); int restore_freezer_state(void); void fini_cgroup(void); struct cg_controller; struct cgroup_prop { char *name; char *value; mode_t mode; uid_t uid; gid_t gid; struct list_head list; }; /* This describes a particular cgroup path, e.g. the '/lxc/u1' part of * 'blkio/lxc/u1' and any properties it has. */ struct cgroup_dir { char *path; mode_t mode; uid_t uid; gid_t gid; struct list_head properties; unsigned int n_properties; /* this is how children are linked together */ struct list_head siblings; /* more cgroup_dirs */ struct list_head children; unsigned int n_children; }; /* This describes a particular cgroup controller, e.g. blkio or cpuset. 
* The heads are subdirectories organized in their tree format. */ struct cg_controller { unsigned int n_controllers; char **controllers; /* cgroup_dirs */ struct list_head heads; unsigned int n_heads; /* for cgroup list in cgroup.c */ struct list_head l; /* controller is a threaded cgroup or not */ int is_threaded; }; struct cg_controller *new_controller(const char *name); /* parse all global cgroup information into structures */ int parse_cg_info(void); int new_cg_root_add(char *controller, char *newroot); extern struct ns_desc cgroup_ns_desc; /* * This struct describes a group controlled by one controller. * The @name is the controller name or 'name=...' for named cgroups. * The @path is the path from the hierarchy root. */ struct cg_ctl { struct list_head l; char *name; char *path; u32 cgns_prefix; }; /* * Returns the list of cg_ctl-s sorted by name */ struct list_head; struct parasite_dump_cgroup_args; extern int parse_thread_cgroup(int pid, int tid, struct parasite_dump_cgroup_args *args, struct list_head *l, unsigned int *n); extern void put_ctls(struct list_head *); int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups); int stop_cgroupd(void); #endif /* __CR_CGROUP_H__ */ crac-criu-1.5.0/criu/include/clone-noasan.h000066400000000000000000000003751471504326700205160ustar00rootroot00000000000000#ifndef __CR_CLONE_NOASAN_H__ #define __CR_CLONE_NOASAN_H__ int clone_noasan(int (*fn)(void *), int flags, void *arg); int clone3_with_pid_noasan(int (*fn)(void *), void *arg, int flags, int exit_signal, pid_t pid); #endif /* __CR_CLONE_NOASAN_H__ */ crac-criu-1.5.0/criu/include/cpu.h000066400000000000000000000004031471504326700167200ustar00rootroot00000000000000#ifndef __CR_CPU_H__ #define __CR_CPU_H__ #include extern int cpu_init(void); extern int cpu_dump_cpuinfo(void); extern int cpu_validate_cpuinfo(void); extern int cpuinfo_dump(void); extern int cpuinfo_check(void); #endif /* __CR_CPU_H__ */ 
crac-criu-1.5.0/criu/include/cr-errno.h000066400000000000000000000007171471504326700176700ustar00rootroot00000000000000#ifndef __CR_ERRNO_H__ #define __CR_ERRNO_H__ void set_cr_errno(int err); int get_cr_errno(void); /* * List of symbolic error names: * ESRCH - no process can be found corresponding to that specified by pid * EEXIST - process with such pid already exists * EBADRQC - bad options */ #define set_task_cr_err(new_err) atomic_cmpxchg(&task_entries->cr_err, 0, new_err) #define get_task_cr_err() atomic_read(&task_entries->cr_err) #endif /* __CR_ERRNO_H__ */ crac-criu-1.5.0/criu/include/cr-service-const.h000066400000000000000000000002371471504326700213240ustar00rootroot00000000000000#ifndef __CR_SERVICE_CONST_H__ #define __CR_SERVICE_CONST_H__ #define CR_DEFAULT_SERVICE_ADDRESS "./criu_service.socket" #endif /* __CR_SERVICE_CONST_H__ */ crac-criu-1.5.0/criu/include/cr-service.h000066400000000000000000000005351471504326700202010ustar00rootroot00000000000000#ifndef __CR_SERVICE_H__ #define __CR_SERVICE_H__ #include "images/rpc.pb-c.h" extern int cr_service(bool daemon_mode); int cr_service_work(int sk); extern int send_criu_dump_resp(int socket_fd, bool success, bool restored); extern struct _cr_service_client *cr_service_client; extern unsigned int service_sk_ino; #endif /* __CR_SERVICE_H__ */ crac-criu-1.5.0/criu/include/cr_options.h000066400000000000000000000140071471504326700203150ustar00rootroot00000000000000#ifndef __CR_OPTIONS_H__ #define __CR_OPTIONS_H__ #include #include #include "common/config.h" #include "common/list.h" #include "int.h" #include "image.h" /* Configuration and CLI parsing order defines */ #define PARSING_GLOBAL_CONF 1 #define PARSING_USER_CONF 2 #define PARSING_ENV_CONF 3 #define PARSING_CMDLINE_CONF 4 #define PARSING_ARGV 5 #define PARSING_RPC_CONF 6 #define PARSING_LAST 7 #define SET_CHAR_OPTS(__dest, __src) \ do { \ char *__src_dup = xstrdup(__src); \ if (!__src_dup) \ abort(); \ xfree(opts.__dest); \ opts.__dest = __src_dup; \ } 
while (0) /* * CPU capability options. */ #define CPU_CAP_NONE (0u << 0) /* Don't check capability at all */ #define CPU_CAP_FPU (1u << 0) /* Only FPU capability required */ #define CPU_CAP_CPU (1u << 1) /* Strict CPU capability required */ #define CPU_CAP_INS (1u << 2) /* Instructions CPU capability */ #define CPU_CAP_IMAGE (1u << 3) /* Write capability on dump and read on restore*/ #define CPU_CAP_ALL (CPU_CAP_FPU | CPU_CAP_CPU | CPU_CAP_INS) #define CPU_CAP_DEFAULT (CPU_CAP_FPU | CPU_CAP_INS) struct cg_root_opt { struct list_head node; char *controller; char *newroot; }; /* * Pre-dump variants */ #define PRE_DUMP_SPLICE 1 /* Pre-dump using parasite */ #define PRE_DUMP_READ 2 /* Pre-dump using process_vm_readv syscall */ /* * Cgroup management options. */ #define CG_MODE_IGNORE (0u << 0) /* Zero is important here */ #define CG_MODE_NONE (1u << 0) #define CG_MODE_PROPS (1u << 1) #define CG_MODE_SOFT (1u << 2) #define CG_MODE_FULL (1u << 3) #define CG_MODE_STRICT (1u << 4) #define CG_MODE_DEFAULT (CG_MODE_SOFT) /* * Network locking method */ enum NETWORK_LOCK_METHOD { NETWORK_LOCK_IPTABLES, NETWORK_LOCK_NFTABLES, NETWORK_LOCK_SKIP, }; #define NETWORK_LOCK_DEFAULT NETWORK_LOCK_IPTABLES /* * Ghost file size we allow to carry by default. */ #define DEFAULT_GHOST_LIMIT (1 << 20) #define DEFAULT_TIMEOUT 10 enum FILE_VALIDATION_OPTIONS { /* * This constant indicates that the file validation should be tried with the * file size method by default. */ FILE_VALIDATION_FILE_SIZE, /* * This constant indicates that the file validation should be tried with the * build-ID method by default. */ FILE_VALIDATION_BUILD_ID }; /* This constant dictates which file validation method should be tried by default. 
*/ #define FILE_VALIDATION_DEFAULT FILE_VALIDATION_BUILD_ID /* This constant dictates that criu use fiemap to copy ghost file by default.*/ #define FIEMAP_DEFAULT 1 struct irmap; struct irmap_path_opt { struct list_head node; struct irmap *ir; }; enum criu_mode { CR_UNSET = 0, CR_DUMP, CR_PRE_DUMP, CR_RESTORE, CR_LAZY_PAGES, CR_CHECK, CR_PAGE_SERVER, CR_SERVICE, CR_SWRK, CR_DEDUP, CR_CPUINFO, CR_EXEC_DEPRECATED, CR_SHOW_DEPRECATED, }; struct cr_options { int final_state; int check_extra_features; int check_experimental_features; union { int restore_detach; bool daemon_mode; }; int restore_sibling; bool ext_unix_sk; int shell_job; int handle_file_locks; int tcp_established_ok; int tcp_close; int evasive_devices; int link_remap_ok; int log_file_per_pid; int pre_dump_mode; bool swrk_restore; char *output; char *root; char *pidfile; char *freeze_cgroup; struct list_head ext_mounts; struct list_head inherit_fds; struct list_head external; struct list_head join_ns; char *libdir; int use_page_server; unsigned short port; char *addr; int ps_socket; int track_mem; char *img_parent; int auto_dedup; unsigned int cpu_cap; int force_irmap; char **exec_cmd; unsigned int manage_cgroups; char *new_global_cg_root; char *cgroup_props; char *cgroup_props_file; struct list_head new_cgroup_roots; char *cgroup_yard; bool autodetect_ext_mounts; int enable_external_sharing; int enable_external_masters; bool aufs; /* auto-detected, not via cli */ bool overlayfs; int ghost_fiemap; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED bool has_binfmt_misc; /* auto-detected */ #endif size_t ghost_limit; struct list_head irmap_scan_paths; bool lsm_supplied; char *lsm_profile; char *lsm_mount_context; unsigned int timeout; unsigned int empty_ns; int tcp_skip_in_flight; bool lazy_pages; char *work_dir; int network_lock_method; int skip_file_rwx_check; /* * When we scheduler for removal some functionality we first * deprecate it and it sits in criu for some time. 
By default * the deprecated stuff is not working, but it's still possible * to turn one ON while the code is in. */ int deprecated_ok; int display_stats; int weak_sysctls; int status_fd; bool orphan_pts_master; int stream; pid_t tree_id; int log_level; char *imgs_dir; char *tls_cacert; char *tls_cacrl; char *tls_cert; char *tls_key; int tls; int tls_no_cn_verify; int mmap_page_image; int ptrace_allowed; /* This stores which method to use for file validation. */ int file_validation_method; /* Shows the mode criu is running at the moment: dump/pre-dump/restore/... */ enum criu_mode mode; int mntns_compat_mode; /* Enables dump compression/decompression */ int compress; /* Remember the program name passed to main() so we can use it in * error messages elsewhere. */ char *argv_0; /* * This contains the eUID of the current CRIU user. It * will only be set to a non-zero value if CRIU has * the necessary capabilities to run as non root. * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN */ uid_t uid; /* This contains the value from capget()->effective */ u32 cap_eff[_LINUX_CAPABILITY_U32S_3]; /* * If CRIU should be running as non-root with the help of * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN the user should * explicitly request it as it comes with many limitations. */ int unprivileged; }; extern struct cr_options opts; extern char *rpc_cfg_file; extern int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, int state); extern int check_options(void); extern void init_opts(void); #endif /* __CR_OPTIONS_H__ */ crac-criu-1.5.0/criu/include/criu-log.h000066400000000000000000000030431471504326700176550ustar00rootroot00000000000000/* This file defines types and macros for CRIU plugins. Copyright (C) 2013 Parallels, Inc This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __CRIU_LOG_H__ #define __CRIU_LOG_H__ #include "log.h" #include extern int log_init(const char *output); extern void log_fini(void); extern int log_init_by_pid(pid_t pid); extern void log_closedir(void); extern int log_keep_err(void); extern char *log_first_err(void); extern void log_set_fd(int fd); extern int log_get_fd(void); extern void log_set_loglevel(unsigned int loglevel); extern unsigned int log_get_loglevel(void); struct timeval; extern void log_get_logstart(struct timeval *); extern int write_pidfile(int pid); #define DEFAULT_LOG_FILENAME "criu.log" static inline int pr_quelled(unsigned int loglevel) { return log_get_loglevel() < loglevel && loglevel != LOG_MSG; } #endif /* __CR_LOG_LEVELS_H__ */ crac-criu-1.5.0/criu/include/criu-plugin.h000066400000000000000000000121611471504326700203730ustar00rootroot00000000000000/* * This file defines types and macros for CRIU plugins. * Copyright (C) 2013-2014 Parallels, Inc * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __CRIU_PLUGIN_H__ #define __CRIU_PLUGIN_H__ #include #include #include #include #define CRIU_PLUGIN_GEN_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + (c)) #define CRIU_PLUGIN_VERSION_MAJOR 0 #define CRIU_PLUGIN_VERSION_MINOR 2 #define CRIU_PLUGIN_VERSION_SUBLEVEL 0 #define CRIU_PLUGIN_VERSION_OLD CRIU_PLUGIN_GEN_VERSION(0, 1, 0) #define CRIU_PLUGIN_VERSION \ CRIU_PLUGIN_GEN_VERSION(CRIU_PLUGIN_VERSION_MAJOR, CRIU_PLUGIN_VERSION_MINOR, CRIU_PLUGIN_VERSION_SUBLEVEL) /* * Plugin hook points and their arguments in hooks. */ enum { CR_PLUGIN_HOOK__DUMP_UNIX_SK = 0, CR_PLUGIN_HOOK__RESTORE_UNIX_SK = 1, CR_PLUGIN_HOOK__DUMP_EXT_FILE = 2, CR_PLUGIN_HOOK__RESTORE_EXT_FILE = 3, CR_PLUGIN_HOOK__DUMP_EXT_MOUNT = 4, CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT = 5, CR_PLUGIN_HOOK__DUMP_EXT_LINK = 6, CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA = 7, CR_PLUGIN_HOOK__UPDATE_VMA_MAP = 8, CR_PLUGIN_HOOK__RESUME_DEVICES_LATE = 9, CR_PLUGIN_HOOK__MAX }; #define DECLARE_PLUGIN_HOOK_ARGS(__hook, ...) 
typedef int(__hook##_t)(__VA_ARGS__) DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_UNIX_SK, int fd, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_UNIX_SK, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_FILE, int fd, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_MOUNT, char *mountpoint, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT, int id, char *mountpoint, char *old_root, int *is_file); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_LINK, int index, int type, char *kind); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, int fd, const struct stat *stat); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const uint64_t addr, const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); enum { CR_PLUGIN_STAGE__DUMP, CR_PLUGIN_STAGE__PRE_DUMP, CR_PLUGIN_STAGE__RESTORE, CR_PLUGIN_STAGE_MAX }; /* * Plugin descriptor. 
*/ typedef struct { const char *name; int (*init)(int stage); void (*exit)(int stage, int ret); unsigned int version; unsigned int max_hooks; void *hooks[CR_PLUGIN_HOOK__MAX]; } cr_plugin_desc_t; extern cr_plugin_desc_t CR_PLUGIN_DESC; #define CR_PLUGIN_REGISTER(___name, ___init, ___exit) \ cr_plugin_desc_t CR_PLUGIN_DESC = { \ .name = ___name, \ .init = ___init, \ .exit = ___exit, \ .version = CRIU_PLUGIN_VERSION, \ .max_hooks = CR_PLUGIN_HOOK__MAX, \ }; static inline int cr_plugin_dummy_init(int stage) { return 0; } static inline void cr_plugin_dummy_exit(int stage, int ret) { } #define CR_PLUGIN_REGISTER_DUMMY(___name) \ cr_plugin_desc_t CR_PLUGIN_DESC = { \ .name = ___name, \ .init = cr_plugin_dummy_init, \ .exit = cr_plugin_dummy_exit, \ .version = CRIU_PLUGIN_VERSION, \ .max_hooks = CR_PLUGIN_HOOK__MAX, \ }; #define CR_PLUGIN_REGISTER_HOOK(__hook, __func) \ static void __attribute__((constructor)) cr_plugin_register_hook_##__func(void) \ { \ CR_PLUGIN_DESC.hooks[__hook] = (void *)__func; \ } /* Public API */ extern int criu_get_image_dir(void); /* * Deprecated, will be removed in next version. 
*/ typedef int(cr_plugin_init_t)(void); typedef void(cr_plugin_fini_t)(void); typedef int(cr_plugin_dump_unix_sk_t)(int fd, int id); typedef int(cr_plugin_restore_unix_sk_t)(int id); typedef int(cr_plugin_dump_file_t)(int fd, int id); typedef int(cr_plugin_restore_file_t)(int id); typedef int(cr_plugin_dump_ext_mount_t)(char *mountpoint, int id); typedef int(cr_plugin_restore_ext_mount_t)(int id, char *mountpoint, char *old_root, int *is_file); typedef int(cr_plugin_dump_ext_link_t)(int index, int type, char *kind); typedef int(cr_plugin_handle_device_vma_t)(int fd, const struct stat *stat); typedef int(cr_plugin_update_vma_map_t)(const char *path, const uint64_t addr, const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd); typedef int(cr_plugin_resume_devices_late_t)(int pid); #endif /* __CRIU_PLUGIN_H__ */ crac-criu-1.5.0/criu/include/crtools.h000066400000000000000000000027561471504326700176330ustar00rootroot00000000000000#ifndef __CR_CRTOOLS_H__ #define __CR_CRTOOLS_H__ #include #include "common/list.h" #include "servicefd.h" #include "images/inventory.pb-c.h" #define CR_FD_PERM (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) extern int check_img_inventory(bool restore); extern int write_img_inventory(InventoryEntry *he); extern int inventory_save_uptime(InventoryEntry *he); extern InventoryEntry *get_parent_inventory(void); extern int prepare_inventory(InventoryEntry *he); struct pprep_head { int (*actor)(struct pprep_head *); struct pprep_head *next; }; extern void add_post_prepare_cb(struct pprep_head *); extern bool deprecated_ok(char *what); extern int cr_dump_tasks(pid_t pid); extern int cr_pre_dump_tasks(pid_t pid); extern int cr_restore_tasks(void); extern int convert_to_elf(char *elf_path, int fd_core); extern int cr_check(void); extern int check_caps(void); extern int cr_dedup(void); extern int cr_lazy_pages(bool daemon); extern int check_add_feature(char *arg); extern void pr_check_features(const char *offset, const char *sep, int width); #define 
PPREP_HEAD_INACTIVE ((struct pprep_head *)-1) #define add_post_prepare_cb_once(phead) \ do { \ if ((phead)->next == PPREP_HEAD_INACTIVE) \ add_post_prepare_cb(phead); \ } while (0) #define MAKE_PPREP_HEAD(name) \ struct pprep_head name = { \ .next = PPREP_HEAD_INACTIVE, \ .actor = name##_cb, \ } #endif /* __CR_CRTOOLS_H__ */ crac-criu-1.5.0/criu/include/dump.h000066400000000000000000000002411471504326700170760ustar00rootroot00000000000000#ifndef __CR_INC_DUMP_H__ #define __CR_INC_DUMP_H__ #include "asm/dump.h" extern int arch_set_thread_regs(struct pstree_item *item, bool with_threads); #endif crac-criu-1.5.0/criu/include/eventfd.h000066400000000000000000000003571471504326700175740ustar00rootroot00000000000000#ifndef __CR_EVENTFD_H__ #define __CR_EVENTFD_H__ #include "files.h" extern int is_eventfd_link(char *link); extern const struct fdtype_ops eventfd_dump_ops; extern struct collect_image_info eventfd_cinfo; #endif /* __CR_EVENTFD_H__ */ crac-criu-1.5.0/criu/include/eventpoll.h000066400000000000000000000005301471504326700201420ustar00rootroot00000000000000#ifndef __CR_EVENTPOLL_H__ #define __CR_EVENTPOLL_H__ #include "files.h" extern int is_eventpoll_link(char *link); extern int flush_eventpoll_dinfo_queue(void); extern const struct fdtype_ops eventpoll_dump_ops; extern struct collect_image_info epoll_tfd_cinfo; extern struct collect_image_info epoll_cinfo; #endif /* __CR_EVENTPOLL_H__ */ crac-criu-1.5.0/criu/include/external.h000066400000000000000000000011301471504326700177510ustar00rootroot00000000000000#ifndef __CR_EXTERNAL_H__ #define __CR_EXTERNAL_H__ struct external { struct list_head node; char *id; void *data; }; extern int add_external(char *key); extern bool external_lookup_id(char *id); extern char *external_lookup_by_key(char *id); extern void *external_lookup_data(char *id); extern int external_for_each_type(char *type, int (*cb)(struct external *, void *), void *arg); static inline char *external_val(struct external *e) { char *aux; aux = 
strchr(e->id, '['); if (aux) { aux = strchr(aux + 1, ']'); if (aux && aux[1] == ':') return aux + 2; } return NULL; } #endif crac-criu-1.5.0/criu/include/fault-injection.h000066400000000000000000000015531471504326700212330ustar00rootroot00000000000000#ifndef __CR_FAULT_INJECTION_H__ #define __CR_FAULT_INJECTION_H__ #include enum faults { FI_NONE = 0, FI_DUMP_EARLY, FI_RESTORE_ROOT_ONLY, FI_DUMP_PAGES, FI_RESTORE_OPEN_LINK_REMAP, FI_PARASITE_CONNECT, FI_POST_RESTORE, /* not fatal */ FI_VDSO_TRAMPOLINES = 127, FI_CHECK_OPEN_HANDLE = 128, FI_NO_MEMFD = 129, FI_NO_BREAKPOINTS = 130, FI_PARTIAL_PAGES = 131, FI_HUGE_ANON_SHMEM_ID = 132, FI_CANNOT_MAP_VDSO = 133, FI_CORRUPT_EXTREGS = 134, FI_MAX, }; static inline bool __fault_injected(enum faults f, enum faults fi_strategy) { return fi_strategy == f; } #define FI_HUGE_ANON_SHMEM_ID_BASE (0xfffffffflu) #ifndef CR_NOGLIBC extern enum faults fi_strategy; #define fault_injected(f) __fault_injected(f, fi_strategy) extern int fault_injection_init(void); #else /* CR_NOGLIBC */ extern bool fault_injected(enum faults f); #endif #endif crac-criu-1.5.0/criu/include/fcntl.h000066400000000000000000000016721471504326700172500ustar00rootroot00000000000000#ifndef __CR_ASM_GENERIC_FCNTL_H__ #define __CR_ASM_GENERIC_FCNTL_H__ #include #include #ifndef F_SETOWN_EX #define F_SETOWN_EX 15 #define F_GETOWN_EX 16 struct f_owner_ex { int type; pid_t pid; }; #endif #ifndef F_GETOWNER_UIDS #define F_GETOWNER_UIDS 17 #endif /* * These things are required to compile on CentOS-6 */ #ifndef F_LINUX_SPECIFIC_BASE #define F_LINUX_SPECIFIC_BASE 1024 #endif #ifndef F_SETPIPE_SZ #define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) #endif #ifndef F_GETPIPE_SZ #define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) #endif #ifndef F_ADD_SEALS #define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) #endif #ifndef F_GET_SEALS #define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) #endif #ifndef O_PATH #define O_PATH 010000000 #endif #ifndef __O_TMPFILE #define __O_TMPFILE 020000000 
#endif #ifndef O_TMPFILE #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) #endif #endif /* __CR_ASM_GENERIC_FCNTL_H__ */ crac-criu-1.5.0/criu/include/fdinfo.h000066400000000000000000000007041471504326700174020ustar00rootroot00000000000000#ifndef __CR_FDINFO_H__ #define __CR_FDINFO_H__ #include "common/list.h" #include "images/eventfd.pb-c.h" #include "images/eventpoll.pb-c.h" #include "images/signalfd.pb-c.h" #include "images/fsnotify.pb-c.h" #include "images/timerfd.pb-c.h" struct fdinfo_common { off64_t pos; int flags; int mnt_id; int owner; }; extern int parse_fdinfo(int fd, int type, void *arg); extern int parse_fdinfo_pid(int pid, int fd, int type, void *arg); #endif crac-criu-1.5.0/criu/include/fdstore.h000066400000000000000000000005241471504326700176030ustar00rootroot00000000000000#ifndef __CRIU_FDSTORE_H__ #define __CRIU_FDSTORE_H__ /* * fdstore is a storage for file descriptors which is shared * between processes. */ int fdstore_init(void); /* Add a file descriptor to the storage and return its id */ int fdstore_add(int fd); /* Get a file descriptor from a storage by id */ int fdstore_get(int id); #endif crac-criu-1.5.0/criu/include/fifo.h000066400000000000000000000003721471504326700170610ustar00rootroot00000000000000#ifndef __CR_FIFO_H__ #define __CR_FIFO_H__ struct fd_parms; struct cr_imgset; extern const struct fdtype_ops fifo_dump_ops; extern struct collect_image_info fifo_cinfo; extern struct collect_image_info fifo_data_cinfo; #endif /* __CR_FIFO_H__ */ crac-criu-1.5.0/criu/include/file-ids.h000066400000000000000000000007041471504326700176310ustar00rootroot00000000000000#ifndef __CR_FILE_IDS_H__ #define __CR_FILE_IDS_H__ #include "common/compiler.h" #include "rbtree.h" #include "images/fdinfo.pb-c.h" #define FD_PID_INVALID (-2U) #define FD_DESC_INVALID (-3U) struct fdinfo_entry; struct stat; struct fd_parms; extern int fd_id_generate(pid_t pid, FdinfoEntry *fe, struct fd_parms *p); extern int fd_id_generate_special(struct fd_parms *p, u32 *id); extern 
struct kid_tree fd_tree; #endif /* __CR_FILE_IDS_H__ */ crac-criu-1.5.0/criu/include/file-lock.h000066400000000000000000000036411471504326700200050ustar00rootroot00000000000000#ifndef __FILE_LOCK_H__ #define __FILE_LOCK_H__ #include "common/list.h" #include "protobuf.h" #include "images/file-lock.pb-c.h" #define FL_UNKNOWN -1 #define FL_POSIX 1 #define FL_FLOCK 2 #define FL_OFD 4 #define FL_LEASE 8 /* for posix fcntl() and lockf() */ #ifndef F_RDLCK #define F_RDLCK 0 #define F_WRLCK 1 #define F_UNLCK 2 #endif /* for OFD locks fcntl() */ #ifndef F_OFD_GETLK #define F_OFD_GETLK 36 #define F_OFD_SETLK 37 #define F_OFD_SETLKW 38 #endif /* operations for bsd flock(), also used by the kernel implementation */ #define LOCK_SH 1 /* shared lock */ #define LOCK_EX 2 /* exclusive lock */ #define LOCK_NB \ 4 /* or'd with one of the above to prevent blocking */ #define LOCK_UN 8 /* remove lock */ #define LOCK_MAND 32 /* This is a mandatory flock ... */ #define LOCK_READ 64 /* which allows concurrent read operations */ #define LOCK_WRITE 128 /* which allows concurrent write operations */ #define LOCK_RW 192 /* which allows concurrent read & write ops */ /* for leases */ #define LEASE_BREAKING 4 struct file_lock { long long fl_id; int fl_kind; int fl_ltype; pid_t fl_owner; /* process, which created the lock */ pid_t fl_holder; /* pid of fd on whose the lock is found */ int maj, min; unsigned long i_no; long long start; char end[32]; struct list_head list; /* list of all file locks */ int real_owner; int owners_fd; }; extern struct list_head file_lock_list; extern struct file_lock *alloc_file_lock(void); extern void free_file_locks(void); extern int prepare_file_locks(int pid); extern struct collect_image_info file_locks_cinfo; struct pid; struct fd_parms; extern void discard_dup_locks_tail(pid_t pid, int fd); extern int correct_file_leases_type(struct pid *, int fd, int lfd); extern int note_file_lock(struct pid *, int fd, int lfd, struct fd_parms *); extern int 
dump_file_locks(void); #define OPT_FILE_LOCKS "file-locks" #endif /* __FILE_LOCK_H__ */ crac-criu-1.5.0/criu/include/files-reg.h000066400000000000000000000033171471504326700200150ustar00rootroot00000000000000#ifndef __CR_FILES_REG_H__ #define __CR_FILES_REG_H__ #include "files.h" #include "util.h" #include "images/regfile.pb-c.h" #include "images/ghost-file.pb-c.h" struct cr_imgset; struct fd_parms; struct file_remap { char *rpath; bool is_dir; int rmnt_id; uid_t uid; gid_t gid; }; struct reg_file_info { struct file_desc d; RegFileEntry *rfe; struct file_remap *remap; bool size_mode_checked; bool is_dir; char *path; }; extern int open_reg_by_id(u32 id); extern int open_reg_fd(struct file_desc *); extern int open_path(struct file_desc *, int (*open_cb)(int ns_root_fd, struct reg_file_info *, void *), void *arg); extern const struct fdtype_ops regfile_dump_ops; extern int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg); extern int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p); extern struct file_remap *lookup_ghost_remap(u32 dev, u32 ino); extern struct file_desc *try_collect_special_file(u32 id, int optional); #define collect_special_file(id) try_collect_special_file(id, 0) extern int collect_filemap(struct vma_area *); extern void filemap_ctx_init(bool auto_close); extern void filemap_ctx_fini(void); extern struct collect_image_info reg_file_cinfo; extern int collect_remaps_and_regfiles(void); extern void delete_link_remaps(void); extern void free_link_remaps(void); extern int prepare_remaps(void); extern int try_clean_remaps(bool only_ghosts); static inline int link_strip_deleted(struct fd_link *link) { return strip_deleted(link->name, link->len); } extern int dead_pid_conflict(void); extern int rm_parent_dirs(int mntns_root, char *path, int count); extern int make_parent_dirs_if_need(int mntns_root, char *path); #endif /* __CR_FILES_REG_H__ */ 
crac-criu-1.5.0/criu/include/files.h000066400000000000000000000142511471504326700172410ustar00rootroot00000000000000#ifndef __CR_FILES_H__ #define __CR_FILES_H__ #include #include "int.h" #include "common/compiler.h" #include "fcntl.h" #include "common/lock.h" #include "common/list.h" #include "pid.h" #include "rst_info.h" #include "images/fdinfo.pb-c.h" #include "images/fown.pb-c.h" #include "images/vma.pb-c.h" struct parasite_drain_fd; struct pstree_item; struct file_desc; struct cr_imgset; struct rst_info; struct parasite_ctl; struct fd_link { union { /* Link info for generic file (path) */ struct { char name[PATH_MAX]; size_t len; }; /* Link info for proc-ns file */ struct { struct ns_desc *ns_d; unsigned int ns_kid; }; }; }; struct fd_parms { int fd; off_t pos; unsigned int flags; char fd_flags; struct stat stat; pid_t pid; FownEntry fown; struct fd_link *link; long fs_type; int mnt_id; struct parasite_ctl *fd_ctl; struct parasite_drain_fd *dfds; }; #define FD_PARMS_INIT \ (struct fd_parms) \ { \ .fd = FD_DESC_INVALID, .fown = FOWN_ENTRY__INIT, .link = NULL, .mnt_id = -1, \ } extern int fill_fdlink(int lfd, const struct fd_parms *p, struct fd_link *link); extern uint32_t make_gen_id(uint32_t st_dev, uint32_t st_ino, uint64_t pos); struct file_desc; enum { FLE_INITIALIZED, /* * FLE is open (via open() or socket() or etc syscalls), and * common file setting are set up (type-specific are not yet). * Most possible, the master was already served out. */ FLE_OPEN, /* * File-type specific settings and preparations are finished, * and FLE is completely restored. 
*/ FLE_RESTORED, }; struct fdinfo_list_entry { struct list_head desc_list; /* To chain on @fd_info_head */ struct file_desc *desc; /* Associated file descriptor */ struct list_head ps_list; /* To chain per-task files */ struct pstree_item *task; FdinfoEntry *fe; int pid; u8 received : 1; u8 stage : 3; u8 fake : 1; }; extern int inh_fd_max; /* reports whether fd_a takes prio over fd_b */ static inline int fdinfo_rst_prio(struct fdinfo_list_entry *fd_a, struct fdinfo_list_entry *fd_b) { return pid_rst_prio(fd_a->pid, fd_b->pid) || ((fd_a->pid == fd_b->pid) && (fd_a->fe->fd < fd_b->fe->fd)); } struct file_desc_ops { /* fd_types from images/fdinfo.proto */ unsigned int type; /* * Opens a file by whatever syscall is required for that. * The returned descriptor may be closed (dup2-ed to another) * so it shouldn't be saved for any post-actions. */ int (*open)(struct file_desc *d, int *new_fd); char *(*name)(struct file_desc *, char *b, size_t s); }; int collect_fd(int pid, FdinfoEntry *e, struct rst_info *rst_info, bool ghost); struct fdinfo_list_entry *collect_fd_to(int pid, FdinfoEntry *e, struct rst_info *rst_info, struct file_desc *fdesc, bool fake, bool force_master); u32 find_unused_file_desc_id(void); unsigned int find_unused_fd(struct pstree_item *, int hint_fd); struct fdinfo_list_entry *find_used_fd(struct pstree_item *, int fd); enum fd_inherit_state { FDIH_UNINHERITED = -2, FDIH_UNKNOWN = -1, FDIH_FROM_0 = 0, }; struct file_desc { u32 id; /* File id, unique */ struct hlist_node hash; /* Descriptor hashing and lookup */ struct list_head fd_info_head; /* Chain of fdinfo_list_entry-s with same ID and type but different pids */ struct file_desc_ops *ops; /* Associated operations */ struct list_head fake_master_list; /* To chain in the list of file_desc, which don't * have a fle in a task, that having permissions */ enum fd_inherit_state fds_inherited; }; struct fdtype_ops { unsigned int type; int (*dump)(int lfd, u32 id, const struct fd_parms *p); int 
(*pre_dump)(int pid, int lfd); }; struct cr_img; extern int dump_my_file(int lfd, u32 *, int *type); extern int do_dump_gen_file(struct fd_parms *p, int lfd, const struct fdtype_ops *ops, FdinfoEntry *e); struct parasite_drain_fd; int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, struct parasite_drain_fd *dfds); int predump_task_files(int pid); extern void file_desc_init(struct file_desc *d, u32 id, struct file_desc_ops *ops); extern int file_desc_add(struct file_desc *d, u32 id, struct file_desc_ops *ops); extern struct fdinfo_list_entry *try_file_master(struct file_desc *d); extern struct fdinfo_list_entry *file_master(struct file_desc *d); extern struct file_desc *find_file_desc_raw(int type, u32 id); extern int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd); extern int recv_desc_from_peer(struct file_desc *d, int *fd); extern int send_desc_to_peer(int fd, struct file_desc *d); extern int restore_fown(int fd, FownEntry *fown); extern int rst_file_params(int fd, FownEntry *fown, int flags); extern void show_saved_files(void); extern int prepare_fds(struct pstree_item *me); extern int prepare_fd_pid(struct pstree_item *me); extern int prepare_files(void); extern int restore_fs(struct pstree_item *); extern int prepare_fs_pid(struct pstree_item *); extern int set_fd_flags(int fd, int flags); extern struct collect_image_info files_cinfo; #define files_collected() (files_cinfo.flags & COLLECT_HAPPENED) extern int close_old_fds(void); #ifndef AT_EMPTY_PATH #define AT_EMPTY_PATH 0x1000 #endif #define LREMAP_PARAM "link-remap" extern int shared_fdt_prepare(struct pstree_item *item); extern struct collect_image_info ext_file_cinfo; extern int dump_unsupp_fd(struct fd_parms *p, int lfd, char *more, char *info, FdinfoEntry *); extern int inherit_fd_parse(char *optarg); extern int inherit_fd_add(int fd, char *key); extern void inherit_fd_log(void); extern int inherit_fd_move_to_fdstore(void); extern int inherit_fd_lookup_id(char 
*id); extern bool inherited_fd(struct file_desc *, int *fdp); extern int inherit_fd_fini(void); extern FdinfoEntry *dup_fdinfo(FdinfoEntry *old, int fd, unsigned flags); int dup_fle(struct pstree_item *task, struct fdinfo_list_entry *ple, int fd, unsigned flags); extern int open_transport_socket(void); extern int set_fds_event(pid_t virt); extern void wait_fds_event(void); int find_unused_fd_pid(pid_t pid); #endif /* __CR_FILES_H__ */ crac-criu-1.5.0/criu/include/filesystems.h000066400000000000000000000016661471504326700205140ustar00rootroot00000000000000#ifndef __CR_FILESYSTEMS_H__ #define __CR_FILESYSTEMS_H__ extern struct fstype *find_fstype_by_name(char *fst); extern struct fstype *decode_fstype(u32 fst); extern bool add_fsname_auto(const char *names); struct mount_info; typedef int (*mount_fn_t)(struct mount_info *mi, const char *src, const char *fstype, unsigned long mountflags); struct fstype { char *name; int code; int (*dump)(struct mount_info *pm); int (*restore)(struct mount_info *pm); int (*check_bindmount)(struct mount_info *pm); int (*parse)(struct mount_info *pm); int (*collect)(struct mount_info *pm); bool (*sb_equal)(struct mount_info *a, struct mount_info *b); mount_fn_t mount; }; extern struct fstype *fstype_auto(void); /* callback for AUFS support */ extern int aufs_parse(struct mount_info *mi); /* callback for OverlayFS support */ extern int overlayfs_parse(struct mount_info *mi); /* FIXME -- remove */ extern struct list_head binfmt_misc_list; #endif crac-criu-1.5.0/criu/include/fs-magic.h000066400000000000000000000017741471504326700176330ustar00rootroot00000000000000#ifndef __CR_FS_MAGIC_H__ #define __CR_FS_MAGIC_H__ #include /* * Gather magic numbers in case if distros * do not provide appropriate entry in * linux/magic.h. 
*/ #ifndef NFS_SUPER_MAGIC #define NFS_SUPER_MAGIC 0x6969 #endif #ifndef PIPEFS_MAGIC #define PIPEFS_MAGIC 0x50495045 #endif #ifndef ANON_INODE_FS_MAGIC #define ANON_INODE_FS_MAGIC 0x09041934 #endif #ifndef TMPFS_MAGIC #define TMPFS_MAGIC 0x01021994 #endif #ifndef SOCKFS_MAGIC #define SOCKFS_MAGIC 0x534f434b #endif #ifndef DEVPTS_SUPER_MAGIC #define DEVPTS_SUPER_MAGIC 0x1cd1 #endif #ifndef BTRFS_SUPER_MAGIC #define BTRFS_SUPER_MAGIC 0x9123683E #endif #ifndef AUFS_SUPER_MAGIC #define AUFS_SUPER_MAGIC 0x61756673 #endif #ifndef PROC_SUPER_MAGIC #define PROC_SUPER_MAGIC 0x9fa0 #endif #ifndef BINFMTFS_MAGIC #define BINFMTFS_MAGIC 0x42494e4d #endif #ifndef AUTOFS_SUPER_MAGIC #define AUTOFS_SUPER_MAGIC 0x0187 #endif #ifndef OVERLAYFS_SUPER_MAGIC #define OVERLAYFS_SUPER_MAGIC 0x794c7630 #endif #endif /* __CR_FS_MAGIC_H__ */ crac-criu-1.5.0/criu/include/fsnotify.h000066400000000000000000000012471471504326700200010ustar00rootroot00000000000000#ifndef __CR_FSNOTIFY_H__ #define __CR_FSNOTIFY_H__ #include "files.h" #include "protobuf.h" #include "images/fsnotify.pb-c.h" #define KERNEL_FS_EVENT_ON_CHILD 0x08000000 #ifndef INOTIFY_IOC_SETNEXTWD #define INOTIFY_IOC_SETNEXTWD _IOW('I', 0, __s32) #endif extern int is_inotify_link(char *link); extern int is_fanotify_link(char *link); extern const struct fdtype_ops inotify_dump_ops; extern const struct fdtype_ops fanotify_dump_ops; extern struct collect_image_info inotify_cinfo; extern struct collect_image_info inotify_mark_cinfo; extern struct collect_image_info fanotify_cinfo; extern struct collect_image_info fanotify_mark_cinfo; #endif /* __CR_FSNOTIFY_H__ */ crac-criu-1.5.0/criu/include/hugetlb.h000066400000000000000000000031211471504326700175630ustar00rootroot00000000000000#ifndef __CR_HUGETLB_H_ #define __CR_HUGETLB_H_ #include #include #include "vma.h" #define ANON_HUGEPAGE_PREFIX "/anon_hugepage" #define ANON_HUGEPAGE_PREFIX_LEN (sizeof(ANON_HUGEPAGE_PREFIX) - 1) enum hugepage_size { HUGETLB_16KB, HUGETLB_64KB, HUGETLB_512KB, 
HUGETLB_1MB, HUGETLB_2MB, HUGETLB_8MB, HUGETLB_16MB, HUGETLB_32MB, HUGETLB_256MB, HUGETLB_512MB, HUGETLB_1GB, HUGETLB_2GB, HUGETLB_16GB, HUGETLB_MAX }; #define MAP_HUGETLB_SHIFT 26 #define MAP_HUGETLB_SIZE_MASK (0x3f << MAP_HUGETLB_SHIFT) #define MAP_HUGETLB_16KB (14 << MAP_HUGETLB_SHIFT) #define MAP_HUGETLB_64KB (16 << MAP_HUGETLB_SHIFT) #define MAP_HUGETLB_512KB (19 << MAP_HUGETLB_SHIFT) #define MAP_HUGETLB_1MB (20 << MAP_HUGETLB_SHIFT) #define MAP_HUGETLB_2MB (21 << MAP_HUGETLB_SHIFT) #define MAP_HUGETLB_8MB (23 << MAP_HUGETLB_SHIFT) #define MAP_HUGETLB_16MB (24 << MAP_HUGETLB_SHIFT) #define MAP_HUGETLB_32MB (25 << MAP_HUGETLB_SHIFT) #define MAP_HUGETLB_256MB (28 << MAP_HUGETLB_SHIFT) #define MAP_HUGETLB_512MB (29 << MAP_HUGETLB_SHIFT) #define MAP_HUGETLB_1GB (30 << MAP_HUGETLB_SHIFT) #define MAP_HUGETLB_2GB (31 << MAP_HUGETLB_SHIFT) #define MAP_HUGETLB_16GB (34 << MAP_HUGETLB_SHIFT) struct htlb_info { unsigned long long size; int flag; }; extern struct htlb_info hugetlb_info[HUGETLB_MAX]; int is_hugetlb_dev(dev_t dev, int *hugetlb_size_flag); int can_dump_with_memfd_hugetlb(dev_t dev, int *hugetlb_size_flag, const char *file_path, struct vma_area *vma); unsigned long get_size_from_hugetlb_flag(int flag); #ifndef MFD_HUGETLB #define MFD_HUGETLB 4 #endif #endif crac-criu-1.5.0/criu/include/image-desc.h000066400000000000000000000037751471504326700201460ustar00rootroot00000000000000#ifndef __CR_IMAGE_DESC_H__ #define __CR_IMAGE_DESC_H__ #include "int.h" enum { CR_FD_INVENTORY, CR_FD_STATS, /* * Task entries */ _CR_FD_TASK_FROM, CR_FD_CORE, CR_FD_IDS, CR_FD_MM, CR_FD_CREDS, CR_FD_FS, _CR_FD_TASK_TO, CR_FD_PAGEMAP, /* * NS entries */ CR_FD_UTSNS, CR_FD_MNTS, CR_FD_USERNS, CR_FD_TIMENS, CR_FD_PIDNS, _CR_FD_IPCNS_FROM, CR_FD_IPC_VAR, CR_FD_IPCNS_SHM, CR_FD_IPCNS_MSG, CR_FD_IPCNS_SEM, _CR_FD_IPCNS_TO, _CR_FD_NETNS_FROM, CR_FD_NETDEV, CR_FD_IFADDR, CR_FD_ROUTE, CR_FD_ROUTE6, CR_FD_RULE, CR_FD_IPTABLES, CR_FD_IP6TABLES, CR_FD_NFTABLES, CR_FD_NETNS, CR_FD_NETNF_CT, 
CR_FD_NETNF_EXP, _CR_FD_NETNS_TO, CR_FD_PSTREE, CR_FD_SHMEM_PAGEMAP, CR_FD_GHOST_FILE, CR_FD_TCP_STREAM, CR_FD_FDINFO, _CR_FD_GLOB_FROM, CR_FD_FILES, CR_FD_SK_QUEUES, CR_FD_PIPES_DATA, CR_FD_FIFO_DATA, CR_FD_TTY_INFO, CR_FD_TTY_DATA, CR_FD_REMAP_FPATH, CR_FD_CGROUP, CR_FD_FILE_LOCKS, CR_FD_SECCOMP, CR_FD_APPARMOR, CR_FD_MEMFD_INODE, CR_FD_BPFMAP_FILE, CR_FD_BPFMAP_DATA, _CR_FD_GLOB_TO, CR_FD_TMPFS_IMG, CR_FD_TMPFS_DEV, CR_FD_BINFMT_MISC, CR_FD_BINFMT_MISC_OLD, CR_FD_PAGES, CR_FD_PAGES_COMP, CR_FD_SIGACT, CR_FD_VMAS, CR_FD_PAGES_OLD, CR_FD_SHM_PAGES_OLD, CR_FD_RLIMIT, CR_FD_ITIMERS, CR_FD_POSIX_TIMERS, CR_FD_IRMAP_CACHE, CR_FD_CPUINFO, CR_FD_SIGNAL, CR_FD_PSIGNAL, CR_FD_INOTIFY_WD, CR_FD_FANOTIFY_MARK, CR_FD_EVENTPOLL_TFD, CR_FD_REG_FILES, CR_FD_INETSK, CR_FD_NS_FILES, CR_FD_PACKETSK, CR_FD_NETLINK_SK, CR_FD_EVENTFD_FILE, CR_FD_EVENTPOLL_FILE, CR_FD_SIGNALFD, CR_FD_TUNFILE, CR_FD_TIMERFD, CR_FD_INOTIFY_FILE, CR_FD_FANOTIFY_FILE, CR_FD_EXT_FILES, CR_FD_UNIXSK, CR_FD_FIFO, CR_FD_PIPES, CR_FD_TTY_FILES, CR_FD_MEMFD_FILE, CR_FD_AUTOFS, CR_FD_MAX }; /* file descriptors template */ struct cr_fd_desc_tmpl { const char *fmt; /* format for the name */ u32 magic; /* magic in the header */ int oflags; /* flags for image_open */ }; extern struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX]; #endif /* __CR_IMAGE_DESC_H__ */ crac-criu-1.5.0/criu/include/image.h000066400000000000000000000117071471504326700172240ustar00rootroot00000000000000#ifndef __CR_IMAGE_H__ #define __CR_IMAGE_H__ #include #include "common/compiler.h" #include "servicefd.h" #include "image-desc.h" #include "fcntl.h" #include "magic.h" #include "bfd.h" #include "log.h" #include "common/bug.h" #define PAGE_RSS 1 #define PAGE_ANON 2 /* * Top bit set in the tgt id means we've remapped * to a ghost file. 
*/ #define REMAP_GHOST (1 << 31) /* * VMA_AREA status: * * - none * VmaEntry is just allocated and has not been used * for anything yet * - regular * VmaEntry represent some memory area which should be * dumped and restored; this is a general sign that we * should not skip the area content from processing in * compare with special areas such as vsyscall * - stack * the memory area is used in application stack so we * should be careful about guard page here * - vsyscall * special memory area injected into the task memory * space by the kernel itself, represent virtual syscall * implementation and it is specific to every kernel version, * its contents should not be dumped ever * - vdso,vvar * the vDSO area, it might require additional memory * contents modification especially when tasks are * migrating between different kernel versions * - heap * "heap" area in application, currently for information only * - file private * stands for privately memory mapped files * - file shared * stands for shared memory mapped files * - anon shared * represent shared anonymous memory areas * - anon private * represent private anonymous memory areas * - SysV IPC * IPC shared memory area * - socket * memory map for socket * - AIO ring * memory area serves AIO buffers * - unsupported * stands for any unknown memory areas, usually means * we don't know how to work with it and should stop * processing exiting with error; while the rest of bits * are part of image ABI, this particular one must never * be used in image. 
*/ #define VMA_AREA_NONE (0 << 0) #define VMA_AREA_REGULAR (1 << 0) #define VMA_AREA_STACK (1 << 1) #define VMA_AREA_VSYSCALL (1 << 2) #define VMA_AREA_VDSO (1 << 3) #define VMA_AREA_HEAP (1 << 5) #define VMA_FILE_PRIVATE (1 << 6) #define VMA_FILE_SHARED (1 << 7) #define VMA_ANON_SHARED (1 << 8) #define VMA_ANON_PRIVATE (1 << 9) #define VMA_AREA_SYSVIPC (1 << 10) #define VMA_AREA_SOCKET (1 << 11) #define VMA_AREA_VVAR (1 << 12) #define VMA_AREA_AIORING (1 << 13) #define VMA_AREA_MEMFD (1 << 14) #define VMA_EXT_PLUGIN (1 << 27) #define VMA_CLOSE (1 << 28) #define VMA_NO_PROT_WRITE (1 << 29) #define VMA_PREMMAPED (1 << 30) #define VMA_UNSUPP (1 << 31) #define CR_CAP_SIZE 2 #define TASK_COMM_LEN 16 #define CR_PARENT_LINK "parent" extern bool ns_per_id; extern bool img_common_magic; #define O_NOBUF (O_DIRECT) #define O_SERVICE (O_DIRECTORY) #define O_DUMP (O_WRONLY | O_CREAT | O_TRUNC) #define O_RSTR (O_RDONLY) #define O_FORCE_LOCAL (O_SYNC) struct cr_img { union { struct bfd _x; struct { int fd; /* should be first to coincide with _x.fd */ int type; unsigned long oflags; char *path; }; }; }; #define EMPTY_IMG_FD (-404) #define LAZY_IMG_FD (-505) static inline bool empty_image(struct cr_img *img) { return img && img->_x.fd == EMPTY_IMG_FD; } static inline bool lazy_image(struct cr_img *img) { return img->_x.fd == LAZY_IMG_FD; } extern int open_image_lazy(struct cr_img *img); static inline int img_raw_fd(struct cr_img *img) { if (!img) return -1; if (lazy_image(img) && open_image_lazy(img)) return -1; BUG_ON(bfd_buffered(&img->_x)); return img->_x.fd; } extern off_t img_raw_size(struct cr_img *img); extern int open_image_dir(char *dir, int mode); extern void close_image_dir(void); /* * Return -1 -- parent symlink points to invalid target * Return 0 && pfd < 0 -- parent symlink does not exist * Return 0 && pfd >= 0 -- opened */ extern int open_parent(int dfd, int *pfd); extern struct cr_img *open_image_at(int dfd, int type, unsigned long flags, ...); #define 
open_image(typ, flags, ...) open_image_at(-1, typ, flags, ##__VA_ARGS__) extern int open_image_lazy(struct cr_img *img); extern struct cr_img *open_pages_image(unsigned long flags, struct cr_img *pmi, u32 *pages_id); extern struct cr_img *open_pages_image_at(int dfd, unsigned long flags, struct cr_img *pmi, u32 *pages_id); extern void up_page_ids_base(void); extern struct cr_img *img_from_fd(int fd); /* for cr-show mostly */ extern int write_img_buf(struct cr_img *, const void *ptr, int size); #define write_img(img, ptr) write_img_buf((img), (ptr), sizeof(*(ptr))) extern int read_img_buf_eof(struct cr_img *, void *ptr, int size); #define read_img_eof(img, ptr) read_img_buf_eof((img), (ptr), sizeof(*(ptr))) extern int read_img_buf(struct cr_img *, void *ptr, int size); #define read_img(img, ptr) read_img_buf((img), (ptr), sizeof(*(ptr))) extern int read_img_str(struct cr_img *, char **pstr, int size); extern void close_image(struct cr_img *); #endif /* __CR_IMAGE_H__ */ crac-criu-1.5.0/criu/include/img-streamer.h000066400000000000000000000003611471504326700205300ustar00rootroot00000000000000#ifndef IMAGE_STREAMER_H #define IMAGE_STREAMER_H extern int img_streamer_init(const char *image_dir, int mode); extern void img_streamer_finish(void); extern int img_streamer_open(char *filename, int flags); #endif /* IMAGE_STREAMER_H */ crac-criu-1.5.0/criu/include/imgset.h000066400000000000000000000016231471504326700174260ustar00rootroot00000000000000#ifndef __CR_IMGSET_H__ #define __CR_IMGSET_H__ #include "image-desc.h" #include "log.h" #include "common/bug.h" #include "image.h" struct cr_imgset { int fd_off; int fd_nr; struct cr_img **_imgs; }; static inline struct cr_img *img_from_set(const struct cr_imgset *imgset, int type) { int idx; idx = type - imgset->fd_off; BUG_ON(idx > imgset->fd_nr); return imgset->_imgs[idx]; } extern struct cr_imgset *glob_imgset; extern struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX]; extern struct cr_imgset *cr_task_imgset_open(int pid, int 
mode); extern struct cr_imgset *cr_imgset_open_range(int pid, int from, int to, unsigned long flags); #define cr_imgset_open(pid, type, flags) cr_imgset_open_range(pid, _CR_FD_##type##_FROM, _CR_FD_##type##_TO, flags) extern struct cr_imgset *cr_glob_imgset_open(int mode); extern void close_cr_imgset(struct cr_imgset **cr_imgset); #endif /* __CR_IMGSET_H__ */ crac-criu-1.5.0/criu/include/inet_diag.h000066400000000000000000000047351471504326700200700ustar00rootroot00000000000000#ifndef __CR_INET_DIAG_H__ #define __CR_INET_DIAG_H__ #include /* Just some random number */ #define TCPDIAG_GETSOCK 18 #define DCCPDIAG_GETSOCK 19 #define INET_DIAG_GETSOCK_MAX 24 /* Socket identity */ struct inet_diag_sockid { __be16 idiag_sport; __be16 idiag_dport; __be32 idiag_src[4]; __be32 idiag_dst[4]; __u32 idiag_if; __u32 idiag_cookie[2]; #define INET_DIAG_NOCOOKIE (~0U) }; /* Request structure */ struct inet_diag_req_compat { __u8 idiag_family; /* Family of addresses. */ __u8 idiag_src_len; __u8 idiag_dst_len; __u8 idiag_ext; /* Query extended information */ struct inet_diag_sockid id; __u32 idiag_states; /* States to dump */ __u32 idiag_dbs; /* Tables to dump (NI) */ }; struct inet_diag_req_v2 { __u8 sdiag_family; __u8 sdiag_protocol; __u8 idiag_ext; __u8 pad; __u32 idiag_states; struct inet_diag_sockid id; }; enum { INET_DIAG_REQ_NONE, INET_DIAG_REQ_BYTECODE, }; #define INET_DIAG_REQ_MAX INET_DIAG_REQ_BYTECODE /* Bytecode is sequence of 4 byte commands followed by variable arguments. * All the commands identified by "code" are conditional jumps forward: * to offset cc+"yes" or to offset cc+"no". "yes" is supposed to be * length of the command and its arguments. 
*/ struct inet_diag_bc_op { unsigned char code; unsigned char yes; unsigned short no; }; enum { INET_DIAG_BC_NOP, INET_DIAG_BC_JMP, INET_DIAG_BC_S_GE, INET_DIAG_BC_S_LE, INET_DIAG_BC_D_GE, INET_DIAG_BC_D_LE, INET_DIAG_BC_AUTO, INET_DIAG_BC_S_COND, INET_DIAG_BC_D_COND, }; struct inet_diag_hostcond { __u8 family; __u8 prefix_len; int port; __be32 addr[0]; }; /* Base info structure. It contains socket identity (addrs/ports/cookie) * and, alas, the information shown by netstat. */ struct inet_diag_msg { __u8 idiag_family; __u8 idiag_state; __u8 idiag_timer; __u8 idiag_retrans; struct inet_diag_sockid id; __u32 idiag_expires; __u32 idiag_rqueue; __u32 idiag_wqueue; __u32 idiag_uid; __u32 idiag_inode; }; /* Extensions */ enum { INET_DIAG_NONE, INET_DIAG_MEMINFO, INET_DIAG_INFO, INET_DIAG_VEGASINFO, INET_DIAG_CONG, INET_DIAG_TOS, INET_DIAG_TCLASS, INET_DIAG_SKMEMINFO, INET_DIAG_SHUTDOWN, }; #define INET_DIAG_MAX INET_DIAG_SHUTDOWN /* INET_DIAG_MEM */ struct inet_diag_meminfo { __u32 idiag_rmem; __u32 idiag_wmem; __u32 idiag_fmem; __u32 idiag_tmem; }; /* INET_DIAG_VEGASINFO */ struct tcpvegas_info { __u32 tcpv_enabled; __u32 tcpv_rttcnt; __u32 tcpv_rtt; __u32 tcpv_minrtt; }; #endif /* __CR_INET_DIAG_H__ */ crac-criu-1.5.0/criu/include/infect-pie.h000066400000000000000000000003631471504326700201610ustar00rootroot00000000000000#ifndef __CR_INFECT_PIE_H__ #define __CR_INFECT_PIE_H__ extern int parasite_daemon_cmd(int cmd, void *args); extern int parasite_trap_cmd(int cmd, void *args); extern void parasite_cleanup(void); extern int parasite_get_rpc_sock(void); #endif crac-criu-1.5.0/criu/include/int.h000066400000000000000000000001161471504326700167240ustar00rootroot00000000000000#ifndef __CR_INC_INT_H__ #define __CR_INC_INT_H__ #include "asm/int.h" #endif crac-criu-1.5.0/criu/include/ipc_ns.h000066400000000000000000000002721471504326700174100ustar00rootroot00000000000000#ifndef __CR_IPC_NS_H__ #define __CR_IPC_NS_H__ extern int dump_ipc_ns(int ns_id); extern int 
prepare_ipc_ns(int pid); extern struct ns_desc ipc_ns_desc; #endif /* __CR_IPC_NS_H__ */ crac-criu-1.5.0/criu/include/irmap.h000066400000000000000000000006521471504326700172470ustar00rootroot00000000000000#ifndef __CR_IRMAP__H__ #define __CR_IRMAP__H__ #include "images/fh.pb-c.h" char *irmap_lookup(unsigned int s_dev, unsigned long i_ino); int irmap_queue_cache(unsigned int dev, unsigned long ino, FhEntry *fh); int irmap_predump_prep(void); int irmap_predump_run(void); int check_open_handle(unsigned int s_dev, unsigned long i_ino, FhEntry *f_handle); int irmap_load_cache(void); int irmap_scan_path_add(char *path); #endif crac-criu-1.5.0/criu/include/kcmp-ids.h000066400000000000000000000012471471504326700176470ustar00rootroot00000000000000#ifndef __CR_KCMP_IDS_H__ #define __CR_KCMP_IDS_H__ #include #include #include "kcmp.h" struct kid_tree { struct rb_root root; unsigned int kcmp_type; unsigned long subid; }; #define DECLARE_KCMP_TREE(name, type) \ struct kid_tree name = { \ .root = RB_ROOT, \ .kcmp_type = type, \ .subid = 1, \ } struct kid_elem { pid_t pid; unsigned int genid; unsigned int idx; }; extern uint32_t kid_generate_gen(struct kid_tree *tree, struct kid_elem *elem, int *new_id); extern struct kid_elem *kid_lookup_epoll_tfd(struct kid_tree *tree, struct kid_elem *elem, kcmp_epoll_slot_t *slot); #endif /* __CR_KCMP_IDS_H__ */ crac-criu-1.5.0/criu/include/kcmp.h000066400000000000000000000007021471504326700170650ustar00rootroot00000000000000#ifndef __CR_KCMP_H__ #define __CR_KCMP_H__ #include enum kcmp_type { KCMP_FILE, KCMP_VM, KCMP_FILES, KCMP_FS, KCMP_SIGHAND, KCMP_IO, KCMP_SYSVSEM, KCMP_EPOLL_TFD, KCMP_TYPES, }; /* Slot for KCMP_EPOLL_TFD */ typedef struct { uint32_t efd; /* epoll file descriptor */ uint32_t tfd; /* target file number */ uint32_t toff; /* target offset within same numbered sequence */ } kcmp_epoll_slot_t; #endif /* __CR_KCMP_H__ */ 
crac-criu-1.5.0/criu/include/kerndat.h000066400000000000000000000045411471504326700175700ustar00rootroot00000000000000#ifndef __CR_KERNDAT_H__ #define __CR_KERNDAT_H__ #include #include "int.h" #include "common/config.h" #include "asm/kerndat.h" #include "util-vdso.h" #include "hugetlb.h" #include struct stat; /* * kerndat stands for "kernel data" and is a collection * of run-time information about current kernel */ extern int kerndat_init(void); enum pagemap_func { PM_UNKNOWN, PM_DISABLED, /* /proc/pid/pagemap doesn't open (user mode) */ PM_FLAGS_ONLY, /* pagemap zeroes pfn part (user mode) */ PM_FULL, }; enum loginuid_func { LUID_NONE, LUID_READ, LUID_FULL, }; struct kerndat_s { u32 magic1, magic2; dev_t shmem_dev; int last_cap; u64 zero_page_pfn; bool has_dirty_track; bool has_memfd; bool has_memfd_hugetlb; bool has_fdinfo_lock; unsigned long task_size; bool ipv6; enum loginuid_func luid; bool compat_cr; bool sk_ns; bool sk_unix_file; bool tun_ns; enum pagemap_func pmap; unsigned int has_xtlocks; unsigned long mmap_min_addr; bool has_tcp_half_closed; bool stack_guard_gap_hidden; int lsm; bool apparmor_ns_dumping_enabled; bool has_uffd; unsigned long uffd_features; bool has_thp_disable; bool can_map_vdso; bool vdso_hint_reliable; struct vdso_symtable vdso_sym; #ifdef CONFIG_COMPAT struct vdso_symtable vdso_sym_compat; #endif bool has_nsid; bool has_link_nsid; unsigned int sysctl_nr_open; bool x86_has_ptrace_fpu_xsave_bug; bool has_inotify_setnextwd; bool has_kcmp_epoll_tfd; bool has_fsopen; bool has_clone3_set_tid; bool has_timens; bool has_newifindex; bool has_pidfd_open; bool has_pidfd_getfd; bool has_nspid; bool has_nftables_concat; bool has_sockopt_buf_lock; dev_t hugetlb_dev[HUGETLB_MAX]; bool has_move_mount_set_group; bool has_openat2; bool has_rseq; bool has_ptrace_get_rseq_conf; struct __ptrace_rseq_configuration libc_rseq_conf; bool has_ipv6_freebind; bool has_membarrier_get_registrations; }; extern struct kerndat_s kdat; enum { KERNDAT_FS_STAT_DEVPTS, 
KERNDAT_FS_STAT_DEVTMPFS, KERNDAT_FS_STAT_BINFMT_MISC, KERNDAT_FS_STAT_MAX }; /* * Check whether the fs @which with kdevice @kdev * is the same as host's. If yes, this means that * the fs mount is shared with host, if no -- it's * a new (likely virtuzlized) fs instance. */ extern int kerndat_fs_virtualized(unsigned int which, u32 kdev); extern int kerndat_has_nspid(void); #endif /* __CR_KERNDAT_H__ */ crac-criu-1.5.0/criu/include/libnetlink.h000066400000000000000000000013541471504326700202720ustar00rootroot00000000000000#ifndef __CR_LIBNETLINK_H__ #define __CR_LIBNETLINK_H__ #define CR_NLMSG_SEQ 24680 /* arbitrary chosen */ struct ns_id; extern int do_rtnl_req(int nl, void *req, int size, int (*receive_callback)(struct nlmsghdr *h, struct ns_id *ns, void *), int (*error_callback)(int err, struct ns_id *ns, void *), struct ns_id *ns, void *); extern int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data, int alen); extern int32_t nla_get_s32(const struct nlattr *nla); #define NLMSG_TAIL(nmsg) ((struct rtattr *)(((void *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) #ifndef NETNS_RTA #define NETNS_RTA(r) ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct rtgenmsg)))) #endif #endif /* __CR_LIBNETLINK_H__ */ crac-criu-1.5.0/criu/include/linux/000077500000000000000000000000001471504326700171225ustar00rootroot00000000000000crac-criu-1.5.0/criu/include/linux/aio_abi.h000066400000000000000000000005771471504326700206670ustar00rootroot00000000000000#ifndef __LINUX__AIO_ABI_H #define __LINUX__AIO_ABI_H typedef __kernel_ulong_t aio_context_t; /* read() from /dev/aio returns these structures. 
*/ struct io_event { __u64 data; /* the data field from the iocb */ __u64 obj; /* what iocb this event came from */ __s64 res; /* result code for this event */ __s64 res2; /* secondary result */ }; #endif /* __LINUX__AIO_ABI_H */ crac-criu-1.5.0/criu/include/linux/mount.h000066400000000000000000000031521471504326700204360ustar00rootroot00000000000000#ifndef _CRIU_LINUX_MOUNT_H #define _CRIU_LINUX_MOUNT_H #include "common/config.h" #include "compel/plugins/std/syscall-codes.h" /* Copied from /usr/include/sys/mount.h */ #ifndef FSOPEN_CLOEXEC /* The type of fsconfig call made. */ enum fsconfig_command { FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ #define FSCONFIG_SET_FLAG FSCONFIG_SET_FLAG FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ #define FSCONFIG_SET_STRING FSCONFIG_SET_STRING FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ #define FSCONFIG_SET_BINARY FSCONFIG_SET_BINARY FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ #define FSCONFIG_SET_PATH FSCONFIG_SET_PATH FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ #define FSCONFIG_SET_PATH_EMPTY FSCONFIG_SET_PATH_EMPTY FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ #define FSCONFIG_SET_FD FSCONFIG_SET_FD FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ #define FSCONFIG_CMD_CREATE FSCONFIG_CMD_CREATE FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ #define FSCONFIG_CMD_RECONFIGURE FSCONFIG_CMD_RECONFIGURE }; #endif // FSOPEN_CLOEXEC /* fsopen flags. With the redundant definition, we check if the kernel, * glibc value and our value still match. */ #define FSOPEN_CLOEXEC 0x00000001 #ifndef MS_MGC_VAL /* Magic mount flag number. Has to be or-ed to the flag values. 
*/ #define MS_MGC_VAL 0xc0ed0000 /* Magic flag number to indicate "new" flags */ #define MS_MGC_MSK 0xffff0000 /* Magic flag number mask */ #endif #endif crac-criu-1.5.0/criu/include/linux/openat2.h000066400000000000000000000003751471504326700206500ustar00rootroot00000000000000#ifndef _CRIU_LINUX_OPENAT2_H #define _CRIU_LINUX_OPENAT2_H #include #include "common/config.h" #ifdef CONFIG_HAS_OPENAT2 #include #else struct open_how { __u64 flags; __u64 mode; __u64 resolve; }; #endif #endif crac-criu-1.5.0/criu/include/linux/rseq.h000066400000000000000000000124561471504326700202550ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ #ifndef _UAPI_LINUX_RSEQ_H #define _UAPI_LINUX_RSEQ_H #ifdef __has_include #if __has_include("sys/rseq.h") #include #endif #endif #include "asm/thread_pointer.h" #include #include #include "common/config.h" #ifdef CONFIG_HAS_NO_LIBC_RSEQ_DEFS /* * linux/rseq.h * * Restartable sequences system call API * * Copyright (c) 2015-2018 Mathieu Desnoyers */ enum rseq_cpu_id_state { RSEQ_CPU_ID_UNINITIALIZED = -1, RSEQ_CPU_ID_REGISTRATION_FAILED = -2, }; enum rseq_flags { RSEQ_FLAG_UNREGISTER = (1 << 0), }; enum rseq_cs_flags_bit { RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0, RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1, RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2, }; enum rseq_cs_flags { RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT), RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), }; #endif /* CONFIG_HAS_NO_LIBC_RSEQ_DEFS */ /* * Let's use our own definition of struct rseq_cs because some distros * (for example Mariner GNU/Linux) declares this structure their-own way. * This makes trouble with inconsistency between printf formatters and * struct rseq_cs field types. 
*/ /* * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always * contained within a single cache-line. It is usually declared as * link-time constant data. */ struct criu_rseq_cs { /* Version of this structure. */ __u32 version; /* enum rseq_cs_flags */ __u32 flags; __u64 start_ip; /* Offset from start_ip. */ __u64 post_commit_offset; __u64 abort_ip; } __attribute__((aligned(4 * sizeof(__u64)))); /* * We have to have our own copy of struct rseq definition because * of breaking UAPI change: * https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=bfdf4e6208051ed7165b2e92035b4bf11f43eb63 */ /* * struct rseq is aligned on 4 * 8 bytes to ensure it is always * contained within a single cache-line. * * A single struct rseq per thread is allowed. */ struct criu_rseq { /* * Restartable sequences cpu_id_start field. Updated by the * kernel. Read by user-space with single-copy atomicity * semantics. This field should only be read by the thread which * registered this data structure. Aligned on 32-bit. Always * contains a value in the range of possible CPUs, although the * value may not be the actual current CPU (e.g. if rseq is not * initialized). This CPU number value should always be compared * against the value of the cpu_id field before performing a rseq * commit or returning a value read from a data structure indexed * using the cpu_id_start value. */ __u32 cpu_id_start; /* * Restartable sequences cpu_id field. Updated by the kernel. * Read by user-space with single-copy atomicity semantics. This * field should only be read by the thread which registered this * data structure. Aligned on 32-bit. Values * RSEQ_CPU_ID_UNINITIALIZED and RSEQ_CPU_ID_REGISTRATION_FAILED * have a special semantic: the former means "rseq uninitialized", * and latter means "rseq initialization failed". 
This value is * meant to be read within rseq critical sections and compared * with the cpu_id_start value previously read, before performing * the commit instruction, or read and compared with the * cpu_id_start value before returning a value loaded from a data * structure indexed using the cpu_id_start value. */ __u32 cpu_id; /* * Restartable sequences rseq_cs field. * * Contains NULL when no critical section is active for the current * thread, or holds a pointer to the currently active struct rseq_cs. * * Updated by user-space, which sets the address of the currently * active rseq_cs at the beginning of assembly instruction sequence * block, and set to NULL by the kernel when it restarts an assembly * instruction sequence block, as well as when the kernel detects that * it is preempting or delivering a signal outside of the range * targeted by the rseq_cs. Also needs to be set to NULL by user-space * before reclaiming memory that contains the targeted struct rseq_cs. * * Read and set by the kernel. Set by user-space with single-copy * atomicity semantics. This field should only be updated by the * thread which registered this data structure. Aligned on 64-bit. * * 32-bit architectures should update the low order bits of the * rseq_cs field, leaving the high order bits initialized to 0. */ __u64 rseq_cs; /* * Restartable sequences flags field. * * This field should only be updated by the thread which * registered this data structure. Read by the kernel. * Mainly used for single-stepping through rseq critical sections * with debuggers. * * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT * Inhibit instruction sequence block restart on preemption * for this thread. * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL * Inhibit instruction sequence block restart on signal * delivery for this thread. * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE * Inhibit instruction sequence block restart on migration for * this thread. 
*/ __u32 flags; } __attribute__((aligned(4 * sizeof(__u64)))); #endif /* _UAPI_LINUX_RSEQ_H */ crac-criu-1.5.0/criu/include/linux/userfaultfd.h000066400000000000000000000142011471504326700216150ustar00rootroot00000000000000/* * include/linux/userfaultfd.h * * Copyright (C) 2007 Davide Libenzi * Copyright (C) 2015 Red Hat, Inc. * */ #ifndef _LINUX_USERFAULTFD_H #define _LINUX_USERFAULTFD_H #include /* * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) #define UFFD_API_FEATURES \ (UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE | UFFD_FEATURE_EVENT_UNMAP | \ UFFD_FEATURE_MISSING_HUGETLBFS | UFFD_FEATURE_MISSING_SHMEM) #define UFFD_API_IOCTLS ((__u64)1 << _UFFDIO_REGISTER | (__u64)1 << _UFFDIO_UNREGISTER | (__u64)1 << _UFFDIO_API) #define UFFD_API_RANGE_IOCTLS ((__u64)1 << _UFFDIO_WAKE | (__u64)1 << _UFFDIO_COPY | (__u64)1 << _UFFDIO_ZEROPAGE) #define UFFD_API_RANGE_IOCTLS_BASIC ((__u64)1 << _UFFDIO_WAKE | (__u64)1 << _UFFDIO_COPY) /* * Valid ioctl command number range with this API is from 0x00 to * 0x3F. UFFDIO_API is the fixed number, everything else can be * changed by implementing a different UFFD_API. If sticking to the * same UFFD_API more ioctl can be added and userland will be aware of * which ioctl the running kernel implements through the ioctl command * bitmask written by the UFFDIO_API. 
*/ #define _UFFDIO_REGISTER (0x00) #define _UFFDIO_UNREGISTER (0x01) #define _UFFDIO_WAKE (0x02) #define _UFFDIO_COPY (0x03) #define _UFFDIO_ZEROPAGE (0x04) #define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ #define UFFDIO 0xAA #define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, struct uffdio_api) #define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, struct uffdio_register) #define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, struct uffdio_range) #define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, struct uffdio_range) #define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, struct uffdio_copy) #define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, struct uffdio_zeropage) /* read() structure */ struct uffd_msg { __u8 event; __u8 reserved1; __u16 reserved2; __u32 reserved3; union { struct { __u64 flags; __u64 address; } pagefault; struct { __u32 ufd; } fork; struct { __u64 from; __u64 to; __u64 len; } remap; struct { __u64 start; __u64 end; } remove; struct { /* unused reserved fields */ __u64 reserved1; __u64 reserved2; __u64 reserved3; } reserved; } arg; } __packed; /* * Start at 0x12 and not at 0 to be more strict against bugs. */ #define UFFD_EVENT_PAGEFAULT 0x12 #define UFFD_EVENT_FORK 0x13 #define UFFD_EVENT_REMAP 0x14 #define UFFD_EVENT_REMOVE 0x15 #define UFFD_EVENT_UNMAP 0x16 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1 << 0) /* If this was a write fault */ #define UFFD_PAGEFAULT_FLAG_WP (1 << 1) /* If reason is VM_UFFD_WP */ struct uffdio_api { /* userland asks for an API number and the features to enable */ __u64 api; /* * Kernel answers below with the all available features for * the API, this notifies userland of which events and/or * which flags for each event are enabled in the current * kernel. * * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE * are to be considered implicitly always enabled in all kernels as * long as the uffdio_api.api requested matches UFFD_API. 
* * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on * hugetlbfs virtual memory ranges. Adding or not adding * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has * no real functional effect after UFFDIO_API returns, but * it's only useful for an initial feature set probe at * UFFDIO_API time. There are two ways to use it: * * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the * uffdio_api.features before calling UFFDIO_API, an error * will be returned by UFFDIO_API on a kernel without * hugetlbfs missing support * * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in * uffdio_api.features and instead it will be set by the * kernel in the uffdio_api.features if the kernel supports * it, so userland can later check if the feature flag is * present in uffdio_api.features after UFFDIO_API * succeeded. * * UFFD_FEATURE_MISSING_SHMEM works the same as * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem * (i.e. tmpfs and other shmem based APIs). */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1 << 0) #define UFFD_FEATURE_EVENT_FORK (1 << 1) #define UFFD_FEATURE_EVENT_REMAP (1 << 2) #define UFFD_FEATURE_EVENT_REMOVE (1 << 3) #define UFFD_FEATURE_MISSING_HUGETLBFS (1 << 4) #define UFFD_FEATURE_MISSING_SHMEM (1 << 5) #define UFFD_FEATURE_EVENT_UNMAP (1 << 6) __u64 features; __u64 ioctls; }; struct uffdio_range { __u64 start; __u64 len; }; struct uffdio_register { struct uffdio_range range; #define UFFDIO_REGISTER_MODE_MISSING ((__u64)1 << 0) #define UFFDIO_REGISTER_MODE_WP ((__u64)1 << 1) __u64 mode; /* * kernel answers which ioctl commands are available for the * range, keep at the end as the last 8 bytes aren't read. */ __u64 ioctls; }; struct uffdio_copy { __u64 dst; __u64 src; __u64 len; /* * There will be a wrprotection flag later that allows to map * pages wrprotected on the fly. 
And such a flag will be * available if the wrprotection ioctl are implemented for the * range according to the uffdio_register.ioctls. */ #define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1 << 0) __u64 mode; /* * "copy" is written by the ioctl and must be at the end: the * copy_from_user will not read the last 8 bytes. */ __s64 copy; }; struct uffdio_zeropage { struct uffdio_range range; #define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1 << 0) __u64 mode; /* * "zeropage" is written by the ioctl and must be at the end: * the copy_from_user will not read the last 8 bytes. */ __s64 zeropage; }; #endif /* _LINUX_USERFAULTFD_H */ crac-criu-1.5.0/criu/include/log.h000066400000000000000000000042261471504326700167210ustar00rootroot00000000000000#ifndef __CR_LOG_H__ #define __CR_LOG_H__ #include #ifndef CR_NOGLIBC #include #include #include #endif /* CR_NOGLIBC */ #define LOG_UNSET (-1) #define LOG_MSG (0) /* Print message regardless of log level */ #define LOG_ERROR (1) /* Errors only, when we're in trouble */ #define LOG_WARN (2) /* Warnings, dazen and confused but trying to continue */ #define LOG_INFO (3) /* Informative, everything is fine */ #define LOG_DEBUG (4) /* Debug only */ #define DEFAULT_LOGLEVEL LOG_WARN /* * This is low-level printing helper, try hard not to use it directly * and use the pr_foo() helpers below. */ extern void print_on_level(unsigned int loglevel, const char *format, ...) __attribute__((__format__(__printf__, 2, 3))); #ifndef LOG_PREFIX #define LOG_PREFIX #endif void flush_early_log_buffer(int fd); #define print_once(loglevel, fmt, ...) \ do { \ static bool __printed; \ if (!__printed) { \ print_on_level(loglevel, fmt, ##__VA_ARGS__); \ __printed = 1; \ } \ } while (0) #define pr_msg(fmt, ...) print_on_level(LOG_MSG, fmt, ##__VA_ARGS__) #define pr_info(fmt, ...) print_on_level(LOG_INFO, LOG_PREFIX fmt, ##__VA_ARGS__) #define pr_err(fmt, ...) 
print_on_level(LOG_ERROR, "Error (%s:%d): " LOG_PREFIX fmt, __FILE__, __LINE__, ##__VA_ARGS__) #define pr_err_once(fmt, ...) print_once(LOG_ERROR, fmt, ##__VA_ARGS__) #define pr_warn(fmt, ...) print_on_level(LOG_WARN, "Warn (%s:%d): " LOG_PREFIX fmt, __FILE__, __LINE__, ##__VA_ARGS__) #define pr_warn_once(fmt, ...) print_once(LOG_WARN, "Warn (%s:%d): " LOG_PREFIX fmt, __FILE__, __LINE__, ##__VA_ARGS__) #define pr_debug(fmt, ...) print_on_level(LOG_DEBUG, LOG_PREFIX fmt, ##__VA_ARGS__) #ifndef CR_NOGLIBC #define pr_perror(fmt, ...) pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) #define pr_pwarn(fmt, ...) pr_warn(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) #endif /* CR_NOGLIBC */ #endif /* __CR_LOG_H__ */ crac-criu-1.5.0/criu/include/lsm.h000066400000000000000000000023521471504326700167310ustar00rootroot00000000000000#ifndef __CR_LSM_H__ #define __CR_LSM_H__ #include "images/inventory.pb-c.h" #include "images/creds.pb-c.h" #include "images/fdinfo.pb-c.h" #define AA_SECURITYFS_PATH "/sys/kernel/security/apparmor" /* * Get the Lsmtype for the current host. */ extern Lsmtype host_lsm_type(void); /* * Initialize the Lsmtype for the current host */ extern void kerndat_lsm(void); int collect_and_suspend_lsm(void); int unsuspend_lsm(void); /* * Validate that the LSM profiles can be correctly applied (must happen after * pstree is set up). */ int validate_lsm(char *profile); /* * Render the profile name in the way that the LSM wants it written to * /proc//attr/current, according to whatever is in the images and * specified by --lsm-profile. 
*/ int render_lsm_profile(char *profile, char **val); extern int lsm_check_opts(void); #ifdef CONFIG_HAS_SELINUX int dump_xattr_security_selinux(int fd, FdinfoEntry *e); int run_setsockcreatecon(FdinfoEntry *e); int reset_setsockcreatecon(void); #else static inline int dump_xattr_security_selinux(int fd, FdinfoEntry *e) { return 0; } static inline int run_setsockcreatecon(FdinfoEntry *e) { return 0; } static inline int reset_setsockcreatecon(void) { return 0; } #endif #endif /* __CR_LSM_H__ */ crac-criu-1.5.0/criu/include/magic.h000066400000000000000000000120621471504326700172150ustar00rootroot00000000000000#ifndef __CR_MAGIC_H__ #define __CR_MAGIC_H__ /* * Basic multi-file images */ #define CRTOOLS_IMAGES_V1 1 /* * v1.1 has common magic in the head of each image file, * except for inventory */ #define CRTOOLS_IMAGES_V1_1 2 /* * Raw images are images in which data is stored in some * non-crtool format (ip tool dumps, tarballs, etc.) */ #define RAW_IMAGE_MAGIC 0x0 /* * Images have the IMG_COMMON_MAGIC in the head. Service files * such as stats and irmap-cache have the IMG_SERVICE_MAGIC. */ #define IMG_COMMON_MAGIC 0x54564319 /* Sarov (a.k.a. Arzamas-16) */ #define IMG_SERVICE_MAGIC 0x55105940 /* Zlatoust */ /* * The magic-s below correspond to coordinates * of various towns in the NNNNEEEE form. 
*/ #define INVENTORY_MAGIC 0x58313116 /* Veliky Novgorod */ #define PSTREE_MAGIC 0x50273030 /* Kyiv */ #define FDINFO_MAGIC 0x56213732 /* Dmitrov */ #define PAGEMAP_MAGIC 0x56084025 /* Vladimir */ #define SHMEM_PAGEMAP_MAGIC PAGEMAP_MAGIC #define PAGES_MAGIC RAW_IMAGE_MAGIC #define PAGES_COMP_MAGIC RAW_IMAGE_MAGIC #define CORE_MAGIC 0x55053847 /* Kolomna */ #define IDS_MAGIC 0x54432030 /* Konigsberg */ #define VMAS_MAGIC 0x54123737 /* Tula */ #define PIPES_MAGIC 0x56513555 /* Tver */ #define PIPES_DATA_MAGIC 0x56453709 /* Dubna */ #define FIFO_MAGIC 0x58364939 /* Kirov */ #define FIFO_DATA_MAGIC 0x59333054 /* Tosno */ #define SIGACT_MAGIC 0x55344201 /* Murom */ #define UNIXSK_MAGIC 0x54373943 /* Ryazan */ #define INETSK_MAGIC 0x56443851 /* Pereslavl */ #define PACKETSK_MAGIC 0x60454618 /* Veliky Ustyug */ #define ITIMERS_MAGIC 0x57464056 /* Kostroma */ #define POSIX_TIMERS_MAGIC 0x52603957 /* Lipetsk */ #define SK_QUEUES_MAGIC 0x56264026 /* Suzdal */ #define UTSNS_MAGIC 0x54473203 /* Smolensk */ #define CREDS_MAGIC 0x54023547 /* Kozelsk */ #define IPC_VAR_MAGIC 0x53115007 /* Samara */ #define IPCNS_SHM_MAGIC 0x46283044 /* Odessa */ #define IPCNS_MSG_MAGIC 0x55453737 /* Moscow */ #define IPCNS_SEM_MAGIC 0x59573019 /* St. 
Petersburg */ #define REG_FILES_MAGIC 0x50363636 /* Belgorod */ #define EXT_FILES_MAGIC 0x59255641 /* Usolye */ #define FS_MAGIC 0x51403912 /* Voronezh */ #define MM_MAGIC 0x57492820 /* Pskov */ #define REMAP_FPATH_MAGIC 0x59133954 /* Vologda */ #define GHOST_FILE_MAGIC 0x52583605 /* Oryol */ #define TCP_STREAM_MAGIC 0x51465506 /* Orenburg */ #define EVENTFD_FILE_MAGIC 0x44523722 /* Anapa */ #define EVENTPOLL_FILE_MAGIC 0x45023858 /* Krasnodar */ #define EVENTPOLL_TFD_MAGIC 0x44433746 /* Novorossiysk */ #define SIGNALFD_MAGIC 0x57323820 /* Uglich */ #define INOTIFY_FILE_MAGIC 0x48424431 /* Volgograd */ #define INOTIFY_WD_MAGIC 0x54562009 /* Svetlogorsk (Rauschen) */ #define MNTS_MAGIC 0x55563928 /* Petushki */ #define NETDEV_MAGIC 0x57373951 /* Yaroslavl */ #define NETNS_MAGIC 0x55933752 /* Dolgoprudny */ #define TTY_FILES_MAGIC 0x59433025 /* Pushkin */ #define TTY_INFO_MAGIC 0x59453036 /* Kolpino */ #define TTY_DATA_MAGIC 0x59413026 /* Pavlovsk */ #define FILE_LOCKS_MAGIC 0x54323616 /* Kaluga */ #define RLIMIT_MAGIC 0x57113925 /* Rostov */ #define FANOTIFY_FILE_MAGIC 0x55096122 /* Chelyabinsk */ #define FANOTIFY_MARK_MAGIC 0x56506035 /* Yekaterinburg */ #define SIGNAL_MAGIC 0x59255647 /* Berezniki */ #define PSIGNAL_MAGIC SIGNAL_MAGIC #define NETLINK_SK_MAGIC 0x58005614 /* Perm */ #define NS_FILES_MAGIC 0x61394011 /* Nyandoma */ #define TUNFILE_MAGIC 0x57143751 /* Kalyazin */ #define CGROUP_MAGIC 0x59383330 /* Tikhvin */ #define TIMERFD_MAGIC 0x50493712 /* Korocha */ #define CPUINFO_MAGIC 0x61404013 /* Nyandoma */ #define USERNS_MAGIC 0x55474906 /* Kazan */ #define SECCOMP_MAGIC 0x64413049 /* Kostomuksha */ #define BINFMT_MISC_MAGIC 0x67343323 /* Apatity */ #define AUTOFS_MAGIC 0x49353943 /* Sochi */ #define FILES_MAGIC 0x56303138 /* Toropets */ #define MEMFD_INODE_MAGIC 0x48453499 /* Dnipro */ #define TIMENS_MAGIC 0x43114433 /* Beslan */ #define PIDNS_MAGIC 0x61157326 /* Surgut */ #define BPFMAP_FILE_MAGIC 0x57506142 /* Alapayevsk */ #define BPFMAP_DATA_MAGIC 
0x64324033 /* Arkhangelsk */ #define APPARMOR_MAGIC 0x59423047 /* Nikolskoye */ #define IFADDR_MAGIC RAW_IMAGE_MAGIC #define ROUTE_MAGIC RAW_IMAGE_MAGIC #define ROUTE6_MAGIC RAW_IMAGE_MAGIC #define RULE_MAGIC RAW_IMAGE_MAGIC #define TMPFS_IMG_MAGIC RAW_IMAGE_MAGIC #define TMPFS_DEV_MAGIC RAW_IMAGE_MAGIC #define IPTABLES_MAGIC RAW_IMAGE_MAGIC #define IP6TABLES_MAGIC RAW_IMAGE_MAGIC #define NFTABLES_MAGIC RAW_IMAGE_MAGIC #define NETNF_CT_MAGIC RAW_IMAGE_MAGIC #define NETNF_EXP_MAGIC RAW_IMAGE_MAGIC #define PAGES_OLD_MAGIC PAGEMAP_MAGIC #define SHM_PAGES_OLD_MAGIC PAGEMAP_MAGIC #define BINFMT_MISC_OLD_MAGIC BINFMT_MISC_MAGIC /* * These are special files, not exactly images */ #define STATS_MAGIC 0x57093306 /* Ostashkov */ #define IRMAP_CACHE_MAGIC 0x57004059 /* Ivanovo */ /* * Main magic for kerndat_s structure. */ #define KDAT_MAGIC 0x57023458 /* Torzhok */ #endif /* __CR_MAGIC_H__ */ crac-criu-1.5.0/criu/include/mem.h000066400000000000000000000031011471504326700167050ustar00rootroot00000000000000#ifndef __CR_MEM_H__ #define __CR_MEM_H__ #include #include "int.h" #include "vma.pb-c.h" #include "pid.h" #include "proc_parse.h" #include "inventory.pb-c.h" struct parasite_ctl; struct vm_area_list; struct page_pipe; struct pstree_item; struct vma_area; struct mem_dump_ctl { bool pre_dump; bool lazy; struct proc_pid_stat *stat; InventoryEntry *parent_ie; }; extern bool vma_has_guard_gap_hidden(struct vma_area *vma); extern bool page_is_zero(u64 pme); extern bool page_in_parent(bool dirty); extern int prepare_mm_pid(struct pstree_item *i); extern void prepare_cow_vmas(void); extern int do_task_reset_dirty_track(int pid); extern unsigned long dump_pages_args_size(struct vm_area_list *vmas); extern int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vma_area_list, struct mem_dump_ctl *mdc, struct parasite_ctl *ctl); #define PME_PRESENT (1ULL << 63) #define PME_SWAP (1ULL << 62) #define PME_FILE (1ULL << 61) #define PME_SOFT_DIRTY (1ULL << 55) #define 
PME_PSHIFT_BITS (6) #define PME_STATUS_BITS (3) #define PME_STATUS_OFFSET (64 - PME_STATUS_BITS) #define PME_PSHIFT_OFFSET (PME_STATUS_OFFSET - PME_PSHIFT_BITS) #define PME_PFRAME_MASK ((1ULL << PME_PSHIFT_OFFSET) - 1) #define PME_PFRAME(x) ((x)&PME_PFRAME_MASK) struct task_restore_args; int open_vmas(struct pstree_item *t); int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); int unmap_guard_pages(struct pstree_item *t); int prepare_mappings(struct pstree_item *t); bool should_dump_page(VmaEntry *vmae, u64 pme); #endif /* __CR_MEM_H__ */ crac-criu-1.5.0/criu/include/memfd.h000066400000000000000000000015411471504326700172250ustar00rootroot00000000000000#ifndef __CR_MEMFD_H__ #define __CR_MEMFD_H__ #include #include #include "int.h" #include "common/config.h" struct fd_parms; struct file_desc; extern int is_memfd(dev_t dev); extern int dump_one_memfd_cond(int lfd, u32 *id, struct fd_parms *parms); extern const struct fdtype_ops memfd_dump_ops; extern int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap); extern struct collect_image_info memfd_cinfo; extern struct file_desc *collect_memfd(u32 id); extern int apply_memfd_seals(void); extern int prepare_memfd_inodes(void); #ifdef CONFIG_HAS_MEMFD_CREATE #include #else #include #include static inline int memfd_create(const char *name, unsigned int flags) { return syscall(SYS_memfd_create, name, flags); } #endif /* CONFIG_HAS_MEMFD_CREATE */ #endif /* __CR_MEMFD_H__ */ crac-criu-1.5.0/criu/include/mman.h000066400000000000000000000004461471504326700170700ustar00rootroot00000000000000#ifndef __CR_MMAN_H__ #define __CR_MMAN_H__ #ifndef MAP_HUGETLB #define MAP_HUGETLB 0x40000 #endif #ifndef MADV_HUGEPAGE #define MADV_HUGEPAGE 14 #endif #ifndef MADV_NOHUGEPAGE #define MADV_NOHUGEPAGE 15 #endif #ifndef MADV_DONTDUMP #define MADV_DONTDUMP 16 #endif #endif /* __CR_MMAN_H__ */ crac-criu-1.5.0/criu/include/mount-v2.h000066400000000000000000000052321471504326700176250ustar00rootroot00000000000000#ifndef 
__CR_MOUNT_V2_H__ #define __CR_MOUNT_V2_H__ #include "linux/mount.h" #include "linux/openat2.h" #include "common/list.h" #include #ifndef MOVE_MOUNT_SET_GROUP #define MOVE_MOUNT_SET_GROUP 0x00000100 /* Set sharing group instead */ #endif #ifndef MOVE_MOUNT_F_EMPTY_PATH #define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ #endif #ifndef MOVE_MOUNT_T_EMPTY_PATH #define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ #endif static inline int sys_move_mount(int from_dirfd, const char *from_pathname, int to_dirfd, const char *to_pathname, unsigned int flags) { return syscall(__NR_move_mount, from_dirfd, from_pathname, to_dirfd, to_pathname, flags); } #ifndef OPEN_TREE_CLONE #define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */ #endif #ifndef OPEN_TREE_CLOEXEC #define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */ #endif #ifndef AT_SYMLINK_NOFOLLOW #define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */ #endif #ifndef AT_NO_AUTOMOUNT #define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */ #endif #ifndef AT_EMPTY_PATH #define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */ #endif #ifndef AT_RECURSIVE #define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ #endif static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags) { return syscall(__NR_open_tree, dfd, filename, flags); } #ifndef RESOLVE_NO_XDEV #define RESOLVE_NO_XDEV 0x01 /* Block mount-point crossings (includes bind-mounts). 
*/ #endif static inline long sys_openat2(int dirfd, const char *pathname, struct open_how *how, size_t size) { return syscall(__NR_openat2, dirfd, pathname, how, size); } extern int check_mount_v2(void); struct sharing_group { /* This pair identifies the group */ int shared_id; int master_id; /* List of shared groups */ struct list_head list; /* List of mounts in this group */ struct list_head mnt_list; /* * List of dependent shared groups: * - all siblings have equal master_id * - the parent has shared_id equal to children's master_id * * This is a bit tricky: parent pointer indicates if there is one * parent sharing_group in list or only siblings. * So for traversal if parent pointer is set we can do: * list_for_each_entry(t, &sg->parent->children, siblings) * and otherwise we can do: * list_for_each_entry(t, &sg->siblings, siblings) */ struct list_head children; struct list_head siblings; struct sharing_group *parent; char *source; }; extern int resolve_shared_mounts_v2(void); extern int prepare_mnt_ns_v2(void); #endif /* __CR_MOUNT_V2_H__ */ crac-criu-1.5.0/criu/include/mount.h000066400000000000000000000173371471504326700173110ustar00rootroot00000000000000#ifndef __CR_MOUNT_H__ #define __CR_MOUNT_H__ #include #include "common/list.h" struct proc_mountinfo; struct pstree_item; struct fstype; struct ns_id; #define MS_PROPAGATE (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE | MS_SLAVE) /* * Here are a set of flags which we know how to handle for the one mount call. * All of them except MS_RDONLY are set only as mnt flags. * MS_RDONLY is set for both mnt and sb flags, so we can restore it for one * mount call only if it set for both masks. 
*/ #define MS_MNT_KNOWN_FLAGS (MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_RDONLY) #define BINFMT_MISC_HOME "proc/sys/fs/binfmt_misc" #define HELPER_MNT_ID 0 #define MOUNT_INVALID_DEV (0) #define MNT_UNREACHABLE INT_MIN /* * We have remounted these mount writable temporary, and we * should return it back to readonly at the end of file restore. */ #define REMOUNTED_RW 1 /* * We have remounted these mount writable in service mount namespace, * thus we shouldn't return it back to readonly, as service mntns * will be destroyed anyway. */ #define REMOUNTED_RW_SERVICE 2 struct rst_mount_info { int remounted_rw; }; struct mount_info { int mnt_id; int parent_mnt_id; unsigned int s_dev; unsigned int s_dev_rt; char *root; /* * During dump mountpoint contains path with dot at the * beginning. It allows to use openat, statat, etc without * creating a temporary copy of the path. * * On restore mountpoint is prepended with so called ns * root path -- it's a place in fs where the namespace * mount tree is constructed. Check mnt_roots for details. * The ns_mountpoint contains path w/o this prefix. 
*/ char *mountpoint; char *ns_mountpoint; /* Mount-v2 specific */ char *plain_mountpoint; int is_dir; int mp_fd_id; int mnt_fd_id; struct sharing_group *sg; struct list_head mnt_sharing; int fd; unsigned flags; unsigned sb_flags; int master_id; int shared_id; struct fstype *fstype; char *source; char *options; char *fsname; union { bool mounted; bool dumped; }; bool need_plugin; bool is_ns_root; bool deleted; int deleted_level; struct list_head deleted_list; struct mount_info *next; struct ns_id *nsid; char *external; bool internal_sharing; /* tree linkage */ struct mount_info *parent; struct mount_info *bind; struct list_head children; struct list_head siblings; struct list_head mnt_bind; /* circular list of derivatives of one real mount */ bool mnt_bind_is_populated; /* indicate that mnt_bind list is ready to use */ struct list_head mnt_share; /* circular list of shared mounts */ struct list_head mnt_slave_list; /* list of slave mounts */ struct list_head mnt_slave; /* slave list entry */ struct list_head mnt_ext_slave; /* external slave list entry */ struct mount_info *mnt_master; /* slave is on master->mnt_slave_list */ struct list_head mnt_propagate; /* circular list of mounts which propagate from each other */ struct list_head mnt_notprop; /* temporary list used in can_mount_now */ struct list_head mnt_unbindable; /* list of mounts with delayed unbindable */ struct list_head postpone; int is_overmounted; struct rst_mount_info *rmi; void *private; /* associated filesystem data */ }; extern struct mount_info *mntinfo; extern void mntinfo_add_list_before(struct mount_info **head, struct mount_info *new); /* * Put a : in here since those are invalid on * the cli, so we know it's autogenerated in * debugging. 
*/ #define AUTODETECTED_MOUNT "CRIU:AUTOGENERATED" #define EXTERNAL_DEV_MOUNT "CRIU:EXTERNAL_DEV" #define NO_ROOT_MOUNT "CRIU:NO_ROOT" static inline bool mnt_is_dev_external(struct mount_info *mi) { return mi->external && !strcmp(mi->external, EXTERNAL_DEV_MOUNT); } static inline bool mnt_is_nodev_external(struct mount_info *mi) { return mi->external && strcmp(mi->external, EXTERNAL_DEV_MOUNT); } extern struct ns_desc mnt_ns_desc; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED extern int collect_binfmt_misc(void); #else static inline int collect_binfmt_misc(void) { return 0; } #endif extern struct mount_info *mnt_entry_alloc(bool rst); extern void mnt_entry_free(struct mount_info *mi); extern int __mntns_get_root_fd(pid_t pid); extern int mntns_get_root_fd(struct ns_id *ns); extern int mntns_get_root_by_mnt_id(int mnt_id); extern struct ns_id *lookup_nsid_by_mnt_id(int mnt_id); extern int open_mount(unsigned int s_dev); extern int __check_mountpoint_fd(struct mount_info *pm, int mnt_fd, bool parse_mountinfo); extern int check_mountpoint_fd(struct mount_info *pm, int mnt_fd); extern int __open_mountpoint(struct mount_info *pm); extern int mnt_is_dir(struct mount_info *pm); extern int open_mountpoint(struct mount_info *pm); extern struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump); extern int prepare_mnt_ns(void); extern int pivot_root(const char *new_root, const char *put_old); extern struct mount_info *lookup_overlayfs(char *rpath, unsigned int s_dev, unsigned int st_ino, unsigned int mnt_id); extern struct mount_info *lookup_mnt_id(unsigned int id); extern struct mount_info *lookup_mnt_sdev(unsigned int s_dev); extern dev_t phys_stat_resolve_dev(struct ns_id *, dev_t st_dev, const char *path); extern bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev, struct ns_id *, const char *path); extern int restore_task_mnt_ns(struct pstree_item *current); extern void fini_restore_mntns(void); extern int depopulate_roots_yard(int mntns_root, bool clean_remaps); 
extern int rst_get_mnt_root(int mnt_id, char *path, int plen); extern int ext_mount_add(char *key, char *val); extern int ext_mount_parse_auto(char *key); extern int mntns_maybe_create_roots(void); extern int read_mnt_ns_img(void); extern void cleanup_mnt_ns(void); extern void clean_cr_time_mounts(void); extern char *get_plain_mountpoint(int mnt_id, char *name); extern bool add_skip_mount(const char *mountpoint); extern int get_sdev_from_fd(int fd, unsigned int *sdev, bool parse_mountinfo); extern struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump); extern int check_mnt_id(void); extern int remount_readonly_mounts(void); extern int try_remount_writable(struct mount_info *mi, bool ns); extern bool mnt_is_overmounted(struct mount_info *mi); extern struct mount_info *mnt_get_external_bind(struct mount_info *mi); extern bool mnt_is_external_bind(struct mount_info *mi); extern bool has_mounted_external_bind(struct mount_info *mi); extern bool rst_mnt_is_root(struct mount_info *mi); extern struct mount_info *mnt_get_root_bind(struct mount_info *mi); extern bool mnt_is_root_bind(struct mount_info *mi); extern struct mount_info *mnt_get_external_bind_nodev(struct mount_info *mi); extern struct mount_info *mnt_bind_pick(struct mount_info *mi, bool (*pick)(struct mount_info *mi, struct mount_info *bind)); extern int mnt_tree_for_each(struct mount_info *start, int (*fn)(struct mount_info *)); extern char *service_mountpoint(const struct mount_info *mi); extern int validate_mounts(struct mount_info *info, bool for_dump); extern __maybe_unused struct mount_info *add_cr_time_mount(struct mount_info *root, char *fsname, const char *path, unsigned int s_dev, bool rst); extern char *resolve_source(struct mount_info *mi); extern int fetch_rt_stat(struct mount_info *m, const char *where); extern int do_simple_mount(struct mount_info *mi, const char *src, const char *fstype, unsigned long mountflags); extern char *mnt_fsname(struct mount_info *mi); extern 
int apply_sb_flags(void *args, int fd, pid_t pid); extern int mount_root(void *args, int fd, pid_t pid); extern int restore_ext_mount(struct mount_info *mi); extern int cr_pivot_root(char *root); extern int print_ns_root(struct ns_id *ns, int remap_id, char *buf, int bs); extern struct mount_info *root_yard_mp; extern char *mnt_roots; #endif /* __CR_MOUNT_H__ */ crac-criu-1.5.0/criu/include/namespaces.h000066400000000000000000000151341471504326700202570ustar00rootroot00000000000000#ifndef __CR_NS_H__ #define __CR_NS_H__ #include #include "common/compiler.h" #include "files.h" #include "common/list.h" #include "images/netdev.pb-c.h" #ifndef CLONE_NEWNS #define CLONE_NEWNS 0x00020000 #endif #ifndef CLONE_NEWPID #define CLONE_NEWPID 0x20000000 #endif #ifndef CLONE_NEWUTS #define CLONE_NEWUTS 0x04000000 #endif #ifndef CLONE_NEWIPC #define CLONE_NEWIPC 0x08000000 #endif #ifndef CLONE_NEWNET #define CLONE_NEWNET 0x40000000 #endif #ifndef CLONE_NEWUSER #define CLONE_NEWUSER 0x10000000 #endif #ifndef CLONE_NEWCGROUP #define CLONE_NEWCGROUP 0x02000000 #endif #ifndef CLONE_NEWTIME #define CLONE_NEWTIME 0x00000080 #endif #define CLONE_ALLNS \ (CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWCGROUP | \ CLONE_NEWTIME) /* Nested namespaces are supported only for these types */ #define CLONE_SUBNS (CLONE_NEWNS | CLONE_NEWNET) #define EXTRA_SIZE 20 struct ns_desc { unsigned int cflag; char *str; size_t len; }; struct user_ns_extra { char *uid; char *gid; }; /* struct join_ns is used for storing parameters specified by --join-ns */ struct join_ns { struct list_head list; char *ns_file; struct ns_desc *nd; /* namespace descriptor */ int ns_fd; /* extra options of --join-ns, like uid&gid in user namespace */ union { struct user_ns_extra user_extra; char *common_extra; } extra_opts; }; enum ns_type { NS_UNKNOWN = 0, NS_CRIU, NS_ROOT, NS_OTHER, }; struct netns_id { unsigned target_ns_id; unsigned netnsid_value; struct list_head node; 
}; struct net_link { NetDeviceEntry *nde; bool created; struct list_head node; }; struct ns_id { unsigned int kid; unsigned int id; pid_t ns_pid; struct ns_desc *nd; struct ns_id *next; enum ns_type type; char *ext_key; /* * For mount namespaces on restore -- indicates that * the namespace in question is created (all mounts * are mounted) and other tasks may do setns on it * and proceed. */ bool ns_populated; union { struct { struct mount_info *mntinfo_list; struct mount_info *mntinfo_tree; int nsfd_id; int root_fd_id; } mnt; struct { /* * ns_fd is used when network namespaces are being * restored. On this stage we access these file * descriptors many times and it is more efficient to * have them opened rather than to get them from fdstore. * * nsfd_id is used to restore sockets. On this stage we * can't use random file descriptors to not conflict * with restored file descriptors. */ union { int nsfd_id; /* a namespace descriptor id in fdstore */ int ns_fd; /* a namespace file descriptor */ }; int nlsk; /* for sockets collection */ int seqsk; /* to talk to parasite daemons */ struct list_head ids; struct list_head links; NetnsEntry *netns; } net; }; }; extern struct ns_id *ns_ids; #define NS_DESC_ENTRY(_cflag, _str) \ { \ .cflag = _cflag, .str = _str, .len = sizeof(_str) - 1, \ } extern bool check_ns_proc(struct fd_link *link); extern struct ns_desc pid_ns_desc; extern struct ns_desc user_ns_desc; extern struct ns_desc time_ns_desc; extern unsigned long root_ns_mask; extern const struct fdtype_ops nsfile_dump_ops; extern struct collect_image_info nsfile_cinfo; extern int walk_namespaces(struct ns_desc *nd, int (*cb)(struct ns_id *, void *), void *oarg); extern int collect_namespaces(bool for_dump); extern int collect_mnt_namespaces(bool for_dump); extern int dump_mnt_namespaces(void); extern int dump_namespaces(struct pstree_item *item, unsigned int ns_flags); extern int prepare_namespace_before_tasks(void); extern int prepare_namespace(struct pstree_item *item, 
unsigned long clone_flags); extern int prepare_userns_creds(void); extern int switch_ns(int pid, struct ns_desc *nd, int *rst); extern int switch_mnt_ns(int pid, int *rst, int *cwd_fd); extern int switch_ns_by_fd(int nsfd, struct ns_desc *nd, int *rst); extern int restore_ns(int rst, struct ns_desc *nd); extern int restore_mnt_ns(int rst, int *cwd_fd); extern int dump_task_ns_ids(struct pstree_item *); extern int predump_task_ns_ids(struct pstree_item *); extern int rst_add_ns_id(unsigned int id, struct pstree_item *, struct ns_desc *nd); extern struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd); extern int collect_user_namespaces(bool for_dump); extern int prepare_userns(struct pstree_item *item); extern int stop_usernsd(void); extern uid_t userns_uid(uid_t uid); extern gid_t userns_gid(gid_t gid); extern int dump_user_ns(pid_t pid, int ns_id); extern void free_userns_maps(void); extern int join_ns_add(const char *type, char *ns_file, char *extra_opts); extern int check_namespace_opts(void); extern int join_namespaces(void); typedef int (*uns_call_t)(void *arg, int fd, pid_t pid); /* * Async call -- The call is guaranteed to be done till the * CR_STATE_COMPLETE happens. The function may return even * before the call starts. * W/o flag the call is synchronous -- this function returns * strictly after the call finishes. */ #define UNS_ASYNC 0x1 /* * The call returns an FD which should be sent back. Conflicts * with UNS_ASYNC. */ #define UNS_FDOUT 0x2 #define MAX_UNSFD_MSG_SIZE 8192 /* * When we're restoring inside user namespace, some things are * not allowed to be done there due to insufficient capabilities. * If the operation in question can be offloaded to another process, * this call allows to do that. * * In case we're not in userns, just call the callback immediately * in the context of calling task. 
*/ extern int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, size_t arg_size, int fd); #define userns_call(__call, __flags, __arg, __arg_size, __fd) \ __userns_call(__stringify(__call), __call, __flags, __arg, __arg_size, __fd) extern int add_ns_shared_cb(int (*actor)(void *data), void *data); extern struct ns_id *get_socket_ns(int lfd); extern struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd); struct unsc_msg { struct msghdr h; /* * 0th is the call address * 1st is the flags * 2nd is the optional (NULL in response) arguments */ struct iovec iov[3]; char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; }; extern void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid); extern void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd); extern int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)); #endif /* __CR_NS_H__ */ crac-criu-1.5.0/criu/include/net.h000066400000000000000000000027721471504326700167320ustar00rootroot00000000000000#ifndef __CR_NET_H__ #define __CR_NET_H__ #include #include "common/list.h" #include "external.h" #ifndef RTM_GETNSID #define RTM_GETNSID 90 #endif struct cr_imgset; struct ns_id; extern int dump_net_ns(struct ns_id *ns); extern int prepare_net_namespaces(void); extern void fini_net_namespaces(void); extern int netns_keep_nsfd(void); struct pstree_item; extern int restore_task_net_ns(struct pstree_item *current); struct veth_pair { struct list_head node; char *inside; char *outside; char *bridge; }; extern int collect_net_namespaces(bool for_dump); extern int network_lock(void); extern void network_unlock(void); extern int network_lock_internal(void); extern struct ns_desc net_ns_desc; #include "images/netdev.pb-c.h" extern int write_netdev_img(NetDeviceEntry *nde, struct cr_imgset *fds, struct nlattr **info); extern int read_ns_sys_file(char *path, char *buf, int len); struct net_link; extern int 
restore_link_parms(struct net_link *link, int nlsk); extern int veth_pair_add(char *in, char *out); extern int macvlan_ext_add(struct external *ext); extern int move_veth_to_bridge(void); extern int kerndat_has_newifindex(void); extern int kerndat_link_nsid(void); extern int net_get_nsid(int rtsk, int fd, int *nsid); extern struct ns_id *net_get_root_ns(void); extern void check_has_netns_ioc(int fd, bool *kdat_val, const char *name); extern int net_set_ext(struct ns_id *ns); extern struct ns_id *get_root_netns(void); extern int read_net_ns_img(void); #endif /* __CR_NET_H__ */ crac-criu-1.5.0/criu/include/netfilter.h000066400000000000000000000014571471504326700201370ustar00rootroot00000000000000#ifndef __CR_NETFILTER_H__ #define __CR_NETFILTER_H__ struct inet_sk_desc; extern int iptables_lock_connection(struct inet_sk_desc *); extern int iptables_unlock_connection(struct inet_sk_desc *); struct inet_sk_info; extern int iptables_unlock_connection_info(struct inet_sk_info *); extern void preload_netfilter_modules(void); extern int nftables_init_connection_lock(void); extern int nftables_lock_connection(struct inet_sk_desc *); extern int nftables_get_table(char *table, int n); #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) #define NFT_RUN_CMD(nft, cmd) nft_run_cmd_from_buffer(nft, cmd, strlen(cmd)) #elif defined(CONFIG_HAS_NFTABLES_LIB_API_1) #define NFT_RUN_CMD(nft, cmd) nft_run_cmd_from_buffer(nft, cmd) #else #define NFT_RUN_CMD(nft, cmd) BUILD_BUG_ON(1) #endif #endif /* __CR_NETFILTER_H__ */ crac-criu-1.5.0/criu/include/netlink_diag.h000066400000000000000000000014441471504326700205670ustar00rootroot00000000000000#ifndef __CR_NETLINK_DIAG_H__ #define __CR_NETLINK_DIAG_H__ #include struct netlink_diag_req { __u8 sdiag_family; __u8 sdiag_protocol; __u16 pad; __u32 ndiag_ino; __u32 ndiag_show; __u32 ndiag_cookie[2]; }; struct netlink_diag_msg { __u8 ndiag_family; __u8 ndiag_type; __u8 ndiag_protocol; __u8 ndiag_state; __u32 ndiag_portid; __u32 ndiag_dst_portid; __u32 
ndiag_dst_group; __u32 ndiag_ino; __u32 ndiag_cookie[2]; }; enum { NETLINK_DIAG_MEMINFO, NETLINK_DIAG_GROUPS, __NETLINK_DIAG_MAX, }; #define NETLINK_DIAG_MAX (__NETLINK_DIAG_MAX - 1) #define NDIAG_PROTO_ALL ((__u8)~0) #define NDIAG_SHOW_MEMINFO 0x00000001 /* show memory info of a socket */ #define NDIAG_SHOW_GROUPS 0x00000002 /* show groups of a netlink socket */ #endif /* __CR_NETLINK_DIAG_H__ */ crac-criu-1.5.0/criu/include/packet_diag.h000066400000000000000000000026411471504326700203720ustar00rootroot00000000000000#ifndef __CR_PACKET_DIAG_H__ #define __CR_PACKET_DIAG_H__ #include struct packet_diag_req { __u8 sdiag_family; __u8 sdiag_protocol; __u16 pad; __u32 pdiag_ino; __u32 pdiag_show; __u32 pdiag_cookie[2]; }; #define PACKET_SHOW_INFO 0x00000001 /* Basic packet_sk information */ #define PACKET_SHOW_MCLIST 0x00000002 /* A set of packet_diag_mclist-s */ #define PACKET_SHOW_RING_CFG 0x00000004 /* Rings configuration parameters */ #define PACKET_SHOW_FANOUT 0x00000008 struct packet_diag_msg { __u8 pdiag_family; __u8 pdiag_type; __u16 pdiag_num; __u32 pdiag_ino; __u32 pdiag_cookie[2]; }; enum { PACKET_DIAG_INFO, PACKET_DIAG_MCLIST, PACKET_DIAG_RX_RING, PACKET_DIAG_TX_RING, PACKET_DIAG_FANOUT, PACKET_DIAG_MAX, }; struct packet_diag_info { __u32 pdi_index; __u32 pdi_version; __u32 pdi_reserve; __u32 pdi_copy_thresh; __u32 pdi_tstamp; __u32 pdi_flags; #define PDI_RUNNING 0x1 #define PDI_AUXDATA 0x2 #define PDI_ORIGDEV 0x4 #define PDI_VNETHDR 0x8 #define PDI_LOSS 0x10 }; #ifndef MAX_ADDR_LEN #define MAX_ADDR_LEN 32 #endif struct packet_diag_mclist { __u32 pdmc_index; __u32 pdmc_count; __u16 pdmc_type; __u16 pdmc_alen; __u8 pdmc_addr[MAX_ADDR_LEN]; }; struct packet_diag_ring { __u32 pdr_block_size; __u32 pdr_block_nr; __u32 pdr_frame_size; __u32 pdr_frame_nr; __u32 pdr_retire_tmo; __u32 pdr_sizeof_priv; __u32 pdr_features; }; #endif /* __CR_PACKET_DIAG_H__ */ 
crac-criu-1.5.0/criu/include/page-pipe.h000066400000000000000000000116341471504326700200100ustar00rootroot00000000000000#ifndef __CR_PAGE_PIPE_H__ #define __CR_PAGE_PIPE_H__ #include #include "common/list.h" #define PAGE_ALLOC_COSTLY_ORDER 3 /* from the kernel source code */ struct kernel_pipe_buffer { struct page *page; unsigned int offset, len; const struct pipe_buf_operations *ops; unsigned int flags; unsigned long private; }; /* * The kernel allocates the linear chunk of memory for pipe buffers. * Allocation of chunks with size more than PAGE_ALLOC_COSTLY_ORDER * fails very often, so we need to restrict the pipe capacity to not * allocate big chunks. */ #define PIPE_MAX_SIZE ((1 << PAGE_ALLOC_COSTLY_ORDER) * PAGE_SIZE / sizeof(struct kernel_pipe_buffer)) /* The number of pipes for one chunk */ #define NR_PIPES_PER_CHUNK 8 /* * page_pipe is a descriptor of task's virtual memory * with pipes, containing pages. * * A page-pipe may contain holes -- these are pagemap * entries without pages. Holes are stored in separate * array to optimize paged iovs feed into vmsplice -- * they will be sent there in one go. * * A hole is a pagemap entry that doesn't have pages * in it, since they are present in previous (parent) * snapshot. * * * This page-pipe vs holes vs task vmem vs image layout * is described below. * * Task memory: (+ present, - not present pages) * 0 0 0 0 1 1 1 * 0 3 6 B 1 8 C * ---+++-----++++++-------++++---- * * Page-pipe iovs: * * bufs = 03:3,0B:6,18:4 * holes = * * The pagemap.img would purely contain page-pipe bufs. * * Pages image will contain pages at * * 03,04,05,0B,0C,0D,0E,0F,10,18,19,1A,1B * * stored one by one. 
* * Not let's imagine task touches some pages and its mem * looks like: (+ present, = old present, - non present) * * 0 0 0 0 11 11 1 * 0 3 6 B 12 78 C * ---==+-----====+++-----++===---- * * (not new pages at 11 and 17 vaddrs) * * The new --snapshot'ed page-pipe would look like * * bufs = 05:1,0F:3,17:2 * holes = 03:2,0B:4,19:3 * * So the pagemap.img would look like * * 03:2:P,05:1,0B:4:P,0F:3,17:2,19:3:P * * (the page_xfer_dump_pages generates one) * * where P means "in parent", i.e. respective pages should * be looked up in the parent pagemap (not pages.img, but * the pagemap, and then the offset in previous pages.img * should be calculated, see the read_pagemap_page routine). * * New pages.img file would contain only pages for * * 05,0F,10,11,17,18 */ struct page_pipe_buf { int p[2]; /* pipe with pages */ unsigned int pipe_size; /* how many pages can be fit into pipe */ unsigned int pipe_off; /* where this buf is started in a pipe */ unsigned int pages_in; /* how many pages are there */ unsigned int nr_segs; /* how many iov-s are busy */ #define PPB_LAZY (1 << 0) unsigned int flags; struct iovec *iov; /* vaddr:len map */ struct list_head l; /* links into page_pipe->bufs */ }; /* * Page pipe buffers with different flags cannot share the same pipe. * We track the last ppb that was used for each type separately in the * prev[] array in the struct page_pipe (below). * Currently we have 2 types: the buffers that are always stored in * the images and the buffers that are lazily migrated */ #define PP_PIPE_TYPES 2 #define PP_HOLE_PARENT (1 << 0) struct page_pipe { unsigned int nr_pipes; /* how many page_pipe_bufs in there */ struct list_head bufs; /* list of bufs */ struct list_head free_bufs; /* list of bufs */ struct page_pipe_buf *prev[PP_PIPE_TYPES]; /* last ppb of each type for pipe sharing */ unsigned int nr_iovs; /* number of iovs */ unsigned int free_iov; /* first free iov */ struct iovec *iovs; /* iovs. 
They are provided into create_page_pipe and all bufs have their iov-s in there */ unsigned int nr_holes; /* number of holes allocated */ unsigned int free_hole; /* number of holes in use */ struct iovec *holes; /* holes */ unsigned int *hole_flags; unsigned int flags; /* PP_FOO flags below */ }; #define PP_CHUNK_MODE 0x1 /* Restrict the maximum buffer size of pipes and dump memory for a few iterations */ #define PP_OWN_IOVS 0x4 /* create_page_pipe allocated IOVs memory */ struct page_pipe *create_page_pipe(unsigned int nr_segs, struct iovec *iovs, unsigned flags); extern void destroy_page_pipe(struct page_pipe *p); extern int page_pipe_add_page(struct page_pipe *p, unsigned long addr, unsigned int flags); extern int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr, unsigned int flags); extern void debug_show_page_pipe(struct page_pipe *pp); void page_pipe_reinit(struct page_pipe *pp); extern void page_pipe_destroy_ppb(struct page_pipe_buf *ppb); struct pipe_read_dest { int p[2]; int sink_fd; }; extern int pipe_read_dest_init(struct pipe_read_dest *prd); extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, unsigned int ppb_flags); #endif /* __CR_PAGE_PIPE_H__ */ crac-criu-1.5.0/criu/include/page-xfer.h000066400000000000000000000046051471504326700200170ustar00rootroot00000000000000#ifndef __CR_PAGE_XFER__H__ #define __CR_PAGE_XFER__H__ #include "pagemap.h" struct ps_info { int pid; unsigned short port; }; extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd); /* User buffer for read-mode pre-dump*/ #define PIPE_MAX_BUFFER_SIZE (PIPE_MAX_SIZE << PAGE_SHIFT) /* * page_xfer -- transfer pages into image file. * Two images backends are implemented -- local image file * and page-server image file. 
*/ struct page_xfer { /* transfers one vaddr:len entry */ int (*write_pagemap)(struct page_xfer *self, struct iovec *iov, u32 flags); /* transfers pages related to previous pagemap */ int (*write_pages)(struct page_xfer *self, int pipe, unsigned long len); void (*close)(struct page_xfer *self); /* * In case we need to dump pagemaps not as-is, but * relative to some address. Used, e.g. by shmem. */ unsigned long offset; bool transfer_lazy; /* private data for every page-xfer engine */ union { struct /* local */ { struct cr_img *pmi; /* pagemaps */ struct cr_img *pi; /* pages */ }; struct /* page-server */ { int sk; u64 dst_id; }; }; struct page_read *parent; }; extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id); struct page_pipe; extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *); extern int page_xfer_predump_pages(int pid, struct page_xfer *, struct page_pipe *); extern int connect_to_page_server_to_send(void); extern int connect_to_page_server_to_recv(int epfd); extern int disconnect_from_page_server(void); extern int check_parent_page_xfer(int fd_type, unsigned long id); /* * The post-copy migration makes it necessary to receive pages from * remote dump. 
The protocol we use for that is quite simple: * - lazy-pages sends request containing PS_IOV_GET(nr_pages, vaddr, pid) * - dump-side page server responds with PS_IOV_ADD(nr_pages, vaddr, pid) or PS_IOV_ADD(0, 0, 0) if it failed to locate the required pages * - dump-side page server sends the raw page data */ /* async request/receive of remote pages */ extern int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages); typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, int nr_pages, void *); extern int page_server_start_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); #endif /* __CR_PAGE_XFER__H__ */ crac-criu-1.5.0/criu/include/page.h000066400000000000000000000001241471504326700170450ustar00rootroot00000000000000#ifndef __CR_INC_PAGE_H__ #define __CR_INC_PAGE_H__ #include "common/page.h" #endif crac-criu-1.5.0/criu/include/pagemap-cache.h000066400000000000000000000014651471504326700206150ustar00rootroot00000000000000#ifndef __CR_PAGEMAP_H__ #define __CR_PAGEMAP_H__ #include #include "int.h" #include "common/list.h" struct vma_area; #define PAGEMAP_PFN_OFF(addr) (PAGE_PFN(addr) * sizeof(u64)) typedef struct { pid_t pid; /* which process it belongs */ unsigned long start; /* start of area */ unsigned long end; /* end of area */ const struct list_head *vma_head; /* list head of VMAs we're serving */ u64 *map; /* local buffer */ size_t map_len; /* length of a buffer */ int fd; /* file to read PMs from */ } pmc_t; #define PMC_INIT \ (pmc_t) \ { \ } extern int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size); extern u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma); extern void pmc_fini(pmc_t *pmc); #endif /* __CR_PAGEMAP_H__ */ crac-criu-1.5.0/criu/include/pagemap.h000066400000000000000000000104411471504326700175460ustar00rootroot00000000000000#ifndef __CR_PAGE_READ_H__ #define __CR_PAGE_READ_H__ #include "common/list.h" #include 
"images/pagemap.pb-c.h" #include "page.h" /* * page_read -- engine, that reads pages from image file(s) * * Several page-read's can be arranged in a chain to read * pages from a series of snapshot. * * A task's address space vs pagemaps+page image pairs can * look like this (taken from comment in page-pipe.h): * * task: * * 0 0 0 0 1 1 1 * 0 3 6 B 2 7 C * ---+++-----+++++++-----+++++---- * pm1: ---+++-----++++++-------++++---- * pm2: ---==+-----====+++-----++===---- * * Here + is present page, - is non prsent, = is present, * but is not modified from last snapshot. * * Thus pagemap.img and pages.img entries are * * pm1: 03:3,0B:6,18:4 * pm2: 03:2:P,05:1,0B:4:P,0F:3,17:2,19:3:P * * where P means "page is in parent pagemap". * * pg1: 03,04,05,0B,0C,0D,0E,0F,10,18,19,1A,1B * pg2: 05,0F,10,11,17,18 * * When trying to restore from these 4 files we'd have * to carefully scan pagemap.img's one by one and read or * skip pages from pages.img where appropriate. * * All this is implemented in read_pagemap_page. 
*/ struct page_read { /* reads page from current pagemap */ int (*read_pages)(struct page_read *, unsigned long vaddr, int nr, void *, unsigned flags); /* Advance page_read to the next entry */ int (*advance)(struct page_read *pr); void (*close)(struct page_read *); void (*skip_pages)(struct page_read *, unsigned long len); int (*sync)(struct page_read *pr); int (*seek_pagemap)(struct page_read *pr, unsigned long vaddr); void (*reset)(struct page_read *pr); int (*io_complete)(struct page_read *, unsigned long vaddr, int nr); int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags); /* Whether or not pages can be read in PIE code */ bool pieok; /* Private data of reader */ struct cr_img *pmi; struct cr_img *pi; u32 pages_img_id; PagemapEntry *pe; /* current pagemap we are on */ struct page_read *parent; /* parent pagemap (if ->in_parent pagemap is met in image, * then go to this guy for page, see read_pagemap_page */ unsigned long cvaddr; /* vaddr we are on */ off_t pi_off; /* current offset in pages file */ struct iovec bunch; /* record consequent neighbour iovecs to punch together */ unsigned id; /* for logging */ unsigned long img_id; /* pagemap image file ID */ PagemapEntry **pmes; int nr_pmes; int curr_pme; struct list_head async; }; /* flags for ->read_pages */ #define PR_ASYNC 0x1 /* may exit w/o data in the buffer */ #define PR_ASAP 0x2 /* PR_ASYNC, but start the IO right now */ /* flags for open_page_read */ #define PR_SHMEM 0x1 #define PR_TASK 0x2 #define PR_TYPE_MASK 0x3 #define PR_MOD 0x4 /* Will need to modify */ #define PR_REMOTE 0x8 /* * -1 -- error * 0 -- no images * 1 -- opened */ extern int open_page_read(unsigned long id, struct page_read *, int pr_flags); extern int open_page_read_at(int dfd, unsigned long id, struct page_read *pr, int pr_flags); struct task_restore_args; int pagemap_enqueue_iovec(struct page_read *pr, void *buf, unsigned long len, struct list_head *to); int pagemap_render_iovec(struct 
list_head *from, struct task_restore_args *ta); /* * Create a shallow copy of page_read object. * The new object shares the pagemap structures with the original, but * maintains its own set of references to those structures. */ extern void dup_page_read(struct page_read *src, struct page_read *dst); extern int dedup_one_iovec(struct page_read *pr, unsigned long base, unsigned long len); static inline unsigned long pagemap_len(PagemapEntry *pe) { return pe->nr_pages * PAGE_SIZE; } static inline bool page_read_has_parent(struct page_read *pr) { return pr->parent != NULL; } /* Pagemap flags */ #define PE_PARENT (1 << 0) /* pages are in parent snapshot */ #define PE_LAZY (1 << 1) /* pages can be lazily restored */ #define PE_PRESENT (1 << 2) /* pages are present in pages*img */ static inline bool pagemap_in_parent(PagemapEntry *pe) { return !!(pe->flags & PE_PARENT); } static inline bool pagemap_lazy(PagemapEntry *pe) { return !!(pe->flags & PE_LAZY); } static inline bool pagemap_present(PagemapEntry *pe) { return !!(pe->flags & PE_PRESENT); } #endif /* __CR_PAGE_READ_H__ */ crac-criu-1.5.0/criu/include/parasite-syscall.h000066400000000000000000000040111471504326700214100ustar00rootroot00000000000000#ifndef __CR_PARASITE_SYSCALL_H__ #define __CR_PARASITE_SYSCALL_H__ #include "pid.h" #include "common/list.h" #include "common/config.h" #include "asm/parasite-syscall.h" struct parasite_dump_thread; struct parasite_dump_misc; struct parasite_drain_fd; struct vm_area_list; struct pstree_item; struct list_head; struct cr_imgset; struct fd_opts; struct pid; struct parasite_dump_cgroup_args; struct rt_sigframe; struct parasite_ctl; struct parasite_thread_ctl; extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *); struct proc_posix_timers_stat; extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, 
struct pstree_item *); extern int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc); extern int parasite_dump_creds(struct parasite_ctl *ctl, CredsEntry *ce); extern int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core); extern int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, struct parasite_ctl *ctl, int id, struct pid *tid, CoreEntry *core); extern int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *dt); extern int parasite_drain_fds_seized(struct parasite_ctl *ctl, struct parasite_drain_fd *dfds, int nr_fds, int off, int *lfds, struct fd_opts *flags); extern int parasite_get_proc_fd_seized(struct parasite_ctl *ctl); extern struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, struct vm_area_list *vma_area_list); extern void parasite_ensure_args_size(unsigned long sz); extern unsigned long get_exec_start(struct vm_area_list *); extern int parasite_dump_cgroup(struct parasite_ctl *ctl, struct parasite_dump_cgroup_args *cgroup); extern struct parasite_tty_args *parasite_dump_tty(struct parasite_ctl *ctl, int fd, int type); #endif /* __CR_PARASITE_SYSCALL_H__ */ crac-criu-1.5.0/criu/include/parasite-vdso.h000066400000000000000000000056411471504326700207230ustar00rootroot00000000000000#ifndef __CR_PARASITE_VDSO_H__ #define __CR_PARASITE_VDSO_H__ #include "common/config.h" #include "util-vdso.h" #include "images/vma.pb-c.h" struct parasite_ctl; struct vm_area_list; /* Check if symbol present in symtable */ static inline bool vdso_symbol_empty(struct vdso_symbol *s) { return s->offset == VDSO_BAD_ADDR && s->name[0] == '\0'; } /* * Special mark which allows to identify runtime vdso (rt-vdso) where * calls from proxy (original) vdso are redirected. This mark usually * placed at the start of vdso area where Elf header lives. 
* Since such runtime vdso is solely used by the proxy and * nobody else is supposed to access it, it's more-less * safe to screw the Elf header with @signature and * vvar/vdso addresses for next dumping. * * The @orig_addr deserves a few comments. When we redirect the calls * from the original vdso to runtime vdso, on next checkpoint it won't * be possible to find original vdso/vvar pair, thus we save their * addresses in the member. * * As on the following dumps we need to drop rt-{vvar,vdso} pair * from list of VMAs to save in images, we save rt-vvar address also. */ struct vdso_mark { u64 signature; unsigned long orig_vdso_addr; unsigned long version; unsigned long orig_vvar_addr; unsigned long rt_vvar_addr; }; #define VDSO_MARK_SIGNATURE_V1 (0x6f73647675697263ULL) /* Magic number (criuvdso) */ #define VDSO_MARK_SIGNATURE_V2 (0x4f53447675697263ULL) /* Magic number (criuvDSO) */ #define VDSO_MARK_SIGNATURE_V3 (0x4f53447655495243ULL) /* Magic number (CRIUvDSO) */ #define VDSO_MARK_CUR_VERSION (3) static inline void vdso_put_mark(void *where, unsigned long rt_vvar_addr, unsigned long orig_vdso_addr, unsigned long orig_vvar_addr) { struct vdso_mark *m = where; m->signature = VDSO_MARK_SIGNATURE_V3; m->orig_vdso_addr = orig_vdso_addr; m->version = VDSO_MARK_CUR_VERSION; m->orig_vvar_addr = orig_vvar_addr; m->rt_vvar_addr = rt_vvar_addr; } static inline bool is_vdso_mark(void *addr) { struct vdso_mark *m = addr; switch (m->signature) { case VDSO_MARK_SIGNATURE_V3: return true; /* * Old formats -- simply extend the mark up * to the version we support. 
*/ case VDSO_MARK_SIGNATURE_V2: vdso_put_mark(m, VVAR_BAD_ADDR, m->orig_vdso_addr, m->orig_vvar_addr); return true; case VDSO_MARK_SIGNATURE_V1: vdso_put_mark(m, VVAR_BAD_ADDR, m->orig_vdso_addr, VVAR_BAD_ADDR); return true; } return false; } extern void vdso_update_gtod_addr(struct vdso_maps *rt); extern int vdso_do_park(struct vdso_maps *rt, unsigned long park_at, unsigned long park_size); extern int vdso_map_compat(unsigned long map_at); extern int vdso_proxify(struct vdso_maps *rt, bool *added_proxy, VmaEntry *vmas, size_t nr_vmas, bool compat_vdso, bool force_trampolines); extern int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *to, struct vdso_symtable *from, bool compat_vdso); #endif /* __CR_PARASITE_VDSO_H__ */ crac-criu-1.5.0/criu/include/parasite.h000066400000000000000000000126751471504326700177570ustar00rootroot00000000000000#ifndef __CR_PARASITE_H__ #define __CR_PARASITE_H__ #define PARASITE_MAX_SIZE (64 << 10) #ifndef __ASSEMBLY__ #include #include #include #include #include "linux/rseq.h" #include "image.h" #include "util-pie.h" #include "common/lock.h" #include "infect-rpc.h" #include "images/vma.pb-c.h" #include "images/tty.pb-c.h" #define __head __used __section(.head.text) enum { PARASITE_CMD_DUMP_THREAD = PARASITE_USER_CMDS, PARASITE_CMD_MPROTECT_VMAS, PARASITE_CMD_DUMPPAGES, PARASITE_CMD_DUMP_SIGACTS, PARASITE_CMD_DUMP_ITIMERS, PARASITE_CMD_DUMP_POSIX_TIMERS, PARASITE_CMD_DUMP_MISC, PARASITE_CMD_DRAIN_FDS, PARASITE_CMD_GET_PROC_FD, PARASITE_CMD_DUMP_TTY, PARASITE_CMD_CHECK_VDSO_MARK, PARASITE_CMD_CHECK_AIOS, PARASITE_CMD_DUMP_CGROUP, PARASITE_CMD_MAX, }; struct parasite_vma_entry { unsigned long start; unsigned long len; int prot; }; struct parasite_vdso_vma_entry { unsigned long start; unsigned long len; unsigned long orig_vdso_addr; unsigned long orig_vvar_addr; unsigned long rt_vvar_addr; int is_marked; bool try_fill_symtable; bool is_vdso; }; struct parasite_dump_pages_args { unsigned int nr_vmas; 
unsigned int add_prot; unsigned int off; unsigned int nr_segs; unsigned int nr_pages; }; static inline struct parasite_vma_entry *pargs_vmas(struct parasite_dump_pages_args *a) { return (struct parasite_vma_entry *)(a + 1); } static inline struct iovec *pargs_iovs(struct parasite_dump_pages_args *a) { return (struct iovec *)(pargs_vmas(a) + a->nr_vmas); } struct parasite_dump_sa_args { rt_sigaction_t sas[SIGMAX]; }; struct parasite_dump_itimers_args { struct itimerval real; struct itimerval virt; struct itimerval prof; }; struct posix_timer { int it_id; struct itimerspec val; int overrun; }; struct parasite_dump_posix_timers_args { int timer_n; struct posix_timer timer[0]; }; struct parasite_aio { unsigned long ctx; unsigned int size; }; struct parasite_check_aios_args { unsigned nr_rings; struct parasite_aio ring[0]; }; static inline int posix_timers_dump_size(int timer_n) { return sizeof(int) + sizeof(struct posix_timer) * timer_n; } /* * Misc sfuff, that is too small for separate file, but cannot * be read w/o using parasite */ struct parasite_dump_misc { bool has_membarrier_get_registrations; /* this is sent from criu to parasite. 
*/ unsigned long brk; u32 pid; u32 sid; u32 pgid; u32 umask; int dumpable; int thp_disabled; int child_subreaper; int membarrier_registration_mask; }; /* * Calculate how long we can make the groups array in parasite_dump_creds * and still fit the struct in one page */ #define PARASITE_MAX_GROUPS \ ((PAGE_SIZE - sizeof(struct parasite_dump_thread) - offsetof(struct parasite_dump_creds, groups)) / \ sizeof(unsigned int)) /* groups */ struct parasite_dump_creds { unsigned int cap_last_cap; u32 cap_inh[CR_CAP_SIZE]; u32 cap_prm[CR_CAP_SIZE]; u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; int uids[4]; int gids[4]; int no_new_privs; unsigned int secbits; unsigned int ngroups; /* * FIXME -- this structure is passed to parasite code * through parasite args area so in parasite_dump_creds() * call we check for size of this data fits the size of * the area. Unfortunately, we _actually_ use more bytes * than the sizeof() -- we put PARASITE_MAX_GROUPS int-s * in there, so the size check is not correct. * * However, all this works simply because we make sure * the PARASITE_MAX_GROUPS is so, that the total amount * of memory in use doesn't exceed the PAGE_SIZE and the * args area is at least one page (PARASITE_ARG_SIZE_MIN). 
*/ unsigned int groups[0]; }; struct parasite_check_rseq { bool has_rseq; bool has_ptrace_get_rseq_conf; /* no need to check if supported */ bool rseq_inited; }; struct parasite_dump_thread { unsigned int *tid_addr; pid_t tid; tls_t tls; struct parasite_check_rseq rseq; stack_t sas; int pdeath_sig; char comm[TASK_COMM_LEN]; struct parasite_dump_creds creds[0]; }; static inline void copy_sas(ThreadSasEntry *dst, const stack_t *src) { dst->ss_sp = encode_pointer(src->ss_sp); dst->ss_size = (u64)src->ss_size; dst->ss_flags = src->ss_flags; } /* * How many descriptors can be transferred from parasite: * * 1) struct parasite_drain_fd + all descriptors should fit into one page * 2) The value should be a multiple of CR_SCM_MAX_FD, because descriptors * are transferred with help of send_fds and recv_fds. * 3) criu should work with a default value of the file limit (1024) */ #define PARASITE_MAX_FDS CR_SCM_MAX_FD * 3 struct parasite_drain_fd { int nr_fds; int fds[0]; }; struct fd_opts { char flags; struct { uint32_t uid; uint32_t euid; uint32_t signum; uint32_t pid_type; uint32_t pid; } fown; }; static inline int drain_fds_size(struct parasite_drain_fd *dfds) { int nr_fds = min((int)PARASITE_MAX_FDS, dfds->nr_fds); return sizeof(*dfds) + nr_fds * (sizeof(dfds->fds[0]) + sizeof(struct fd_opts)); } struct parasite_tty_args { int fd; int type; int sid; int pgrp; bool hangup; int st_pckt; int st_lock; int st_excl; }; struct parasite_dump_cgroup_args { /* * 4K should be enough for most cases. * * The string is null terminated. */ char contents[(1 << 12) - 32]; /* * Contains the path to thread cgroup procfs. 
* "self/task//cgroup" */ char thread_cgrp[32]; }; #endif /* !__ASSEMBLY__ */ #endif /* __CR_PARASITE_H__ */ crac-criu-1.5.0/criu/include/path.h000066400000000000000000000020371471504326700170720ustar00rootroot00000000000000#ifndef __CR_PATH_H__ #define __CR_PATH_H__ #include "namespaces.h" #include "pstree.h" /* Absolute paths are used on dump and relative paths are used on restore */ static inline int is_root(char *p) { return (!strcmp(p, "/")); } /* True for the root mount (the topmost one) */ static inline int is_root_mount(struct mount_info *mi) { return mi->parent == NULL && mi->nsid->id == root_item->ids->mnt_ns_id; } /* * True if the mountpoint target is root on its FS. * * This is used to determine whether we need to postpone * mounting. E.g. one can bind mount some subdir from a * disk, and in this case we'll have to get the root disk * mount first, then bind-mount it. See do_mount_one(). */ static inline int fsroot_mounted(struct mount_info *mi) { return is_root(mi->root); } char *cut_root_for_bind(char *target_root, char *source_root); /* * Get a mount point for a sibling of m if m->parent and p are in the same * shared group. */ char *mnt_get_sibling_path(struct mount_info *m, struct mount_info *p, char *buf, int len); #endif crac-criu-1.5.0/criu/include/pid.h000066400000000000000000000030761471504326700167160ustar00rootroot00000000000000#ifndef __CR_PID_H__ #define __CR_PID_H__ #include #include "stdbool.h" #include "rbtree.h" /* * Task states, used in e.g. struct pid's state. 
*/ enum __criu_task_state { /* Values shared with compel */ TASK_ALIVE = COMPEL_TASK_ALIVE, TASK_DEAD = COMPEL_TASK_DEAD, TASK_STOPPED = COMPEL_TASK_STOPPED, TASK_ZOMBIE = COMPEL_TASK_ZOMBIE, /* Own internal states */ TASK_HELPER = COMPEL_TASK_MAX + 1, TASK_THREAD, /* new values are to be added before this line */ TASK_UNDEF = 0xff }; struct pid { struct pstree_item *item; /* * The @real pid is used to fetch tasks during dumping stage, * This is a global pid seen from the context where the dumping * is running. */ pid_t real; int state; /* TASK_XXX constants */ /* If an item is in stopped state it has a signal number * that caused task to stop. */ int stop_signo; /* * The @virt pid is one which used in the image itself and keeps * the pid value to be restored. This pid fetched from the * dumpee context, because the dumpee might have own pid namespace. */ struct { pid_t virt; struct rb_node node; } ns[1]; /* Must be at the end of struct pid */ }; /* * When we have to restore a shared resource, we mush select which * task should do it, and make other(s) wait for it. In order to * avoid deadlocks, always make task with lower pid be the restorer. 
*/ static inline bool pid_rst_prio(unsigned pid_a, unsigned pid_b) { return pid_a < pid_b; } static inline bool pid_rst_prio_eq(unsigned pid_a, unsigned pid_b) { return pid_a <= pid_b; } #endif /* __CR_PID_H__ */ crac-criu-1.5.0/criu/include/pidfd-store.h000066400000000000000000000005121471504326700203520ustar00rootroot00000000000000#ifndef __CR_PIDFD_STORE_H__ #define __CR_PIDFD_STORE_H__ #include int init_pidfd_store_sk(pid_t pid, int fd); int init_pidfd_store_hash(void); void free_pidfd_store(void); int pidfd_store_add(pid_t pid); int pidfd_store_check_pid_reuse(pid_t pid); bool pidfd_store_ready(void); #endif /* __CR_PIDFD_STORE_H__ */ crac-criu-1.5.0/criu/include/pipes.h000066400000000000000000000032321471504326700172540ustar00rootroot00000000000000#ifndef __CR_PIPES_H__ #define __CR_PIPES_H__ #include "images/pipe-data.pb-c.h" #include "images/pipe.pb-c.h" extern struct collect_image_info pipe_cinfo; extern struct collect_image_info pipe_data_cinfo; extern const struct fdtype_ops pipe_dump_ops; static inline u32 pipe_id(const struct fd_parms *p) { return p->stat.st_ino; } #define NR_PIPES_WITH_DATA 1024 struct pipe_data_dump { int img_type; unsigned int nr; u32 ids[NR_PIPES_WITH_DATA]; }; extern int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct fd_parms *p); struct pipe_data_rst { PipeDataEntry *pde; void *data; struct pipe_data_rst *next; }; #define PIPE_DATA_HASH_BITS 5 #define PIPE_DATA_HASH_SIZE (1 << PIPE_DATA_HASH_BITS) #define PIPE_DATA_HASH_MASK (PIPE_DATA_HASH_SIZE - 1) extern int do_collect_pipe_data(struct pipe_data_rst *, ProtobufCMessage *, struct cr_img *, struct pipe_data_rst **hash); extern int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst **hash); /* * The sequence of objects which should be restored: * pipe -> files struct-s -> fd-s. * pipe_entry describes pipe's file structs-s. * A pipe doesn't have own properties, so it has no object. 
*/ #include "images/pipe.pb-c.h" struct pipe_info { PipeEntry *pe; struct list_head pipe_list; /* All pipe_info with the same pipe_id * This is pure circular list without head */ struct list_head list; /* global list of pipes */ struct file_desc d; unsigned int create : 1, reopen : 1; }; extern int collect_one_pipe_ops(void *o, ProtobufCMessage *base, struct file_desc_ops *ops); extern int open_pipe(struct file_desc *d, int *new_fd); #endif /* __CR_PIPES_H__ */ crac-criu-1.5.0/criu/include/plugin.h000066400000000000000000000040301471504326700174270ustar00rootroot00000000000000#ifndef __CR_PLUGIN_H__ #define __CR_PLUGIN_H__ #include "criu-plugin.h" #include "common/compiler.h" #include "common/list.h" #ifndef CR_PLUGIN_DEFAULT #define CR_PLUGIN_DEFAULT "/usr/lib/criu/" #endif void cr_plugin_fini(int stage, int err); int cr_plugin_init(int stage); typedef struct { struct list_head head; struct list_head hook_chain[CR_PLUGIN_HOOK__MAX]; } cr_plugin_ctl_t; extern cr_plugin_ctl_t cr_plugin_ctl; typedef struct { cr_plugin_desc_t *d; struct list_head list; void *dlhandle; struct list_head link[CR_PLUGIN_HOOK__MAX]; } plugin_desc_t; #define run_plugins(__hook, ...) 
\ ({ \ plugin_desc_t *this; \ int __ret = -ENOTSUP; \ \ list_for_each_entry(this, &cr_plugin_ctl.hook_chain[CR_PLUGIN_HOOK__##__hook], \ link[CR_PLUGIN_HOOK__##__hook]) { \ pr_debug("plugin: `%s' hook %u -> %p\n", this->d->name, CR_PLUGIN_HOOK__##__hook, \ this->d->hooks[CR_PLUGIN_HOOK__##__hook]); \ __ret = ((CR_PLUGIN_HOOK__##__hook##_t *)this->d->hooks[CR_PLUGIN_HOOK__##__hook])( \ __VA_ARGS__); \ if (__ret == -ENOTSUP) \ continue; \ break; \ } \ __ret; \ }) #endif crac-criu-1.5.0/criu/include/posix-timer.h000066400000000000000000000010671471504326700204200ustar00rootroot00000000000000#ifndef __CR_PROC_POSIX_TIMER_H__ #define __CR_PROC_POSIX_TIMER_H__ #include "common/list.h" struct str_posix_timer { long it_id; int clock_id; int si_signo; int it_sigev_notify; int notify_thread_id; void *sival_ptr; }; struct proc_posix_timer { struct list_head list; struct str_posix_timer spt; }; struct proc_posix_timers_stat { int timer_n; struct list_head timers; }; extern int parse_posix_timers(pid_t pid, struct proc_posix_timers_stat *args); void free_posix_timers(struct proc_posix_timers_stat *st); #endif /* __CR_PROC_POSIX_TIMER_H__ */ crac-criu-1.5.0/criu/include/prctl.h000066400000000000000000000032741471504326700172660ustar00rootroot00000000000000#ifndef __CR_PRCTL_H__ #define __CR_PRCTL_H__ #include "int.h" #ifndef PR_SET_NAME #define PR_SET_NAME 15 #endif #ifndef PR_GET_NAME #define PR_GET_NAME 16 #endif #ifndef PR_SET_SECCOMP #define PR_SET_SECCOMP 22 #endif #ifndef PR_CAPBSET_READ #define PR_CAPBSET_READ 23 #endif #ifndef PR_CAPBSET_DROP #define PR_CAPBSET_DROP 24 #endif #ifndef PR_GET_SECUREBITS #define PR_GET_SECUREBITS 27 #endif #ifndef PR_SET_SECUREBITS #define PR_SET_SECUREBITS 28 #endif #ifndef PR_GET_DUMPABLE #define PR_GET_DUMPABLE 3 #endif #ifndef PR_SET_DUMPABLE #define PR_SET_DUMPABLE 4 #endif #ifndef PR_GET_NO_NEW_PRIVS #define PR_GET_NO_NEW_PRIVS 39 #endif #ifndef PR_SET_NO_NEW_PRIVS #define PR_SET_NO_NEW_PRIVS 38 #endif #ifndef PR_SET_MM #define 
PR_SET_MM 35 #define PR_SET_MM_START_CODE 1 #define PR_SET_MM_END_CODE 2 #define PR_SET_MM_START_DATA 3 #define PR_SET_MM_END_DATA 4 #define PR_SET_MM_START_STACK 5 #define PR_SET_MM_START_BRK 6 #define PR_SET_MM_BRK 7 #define PR_SET_MM_ARG_START 8 #define PR_SET_MM_ARG_END 9 #define PR_SET_MM_ENV_START 10 #define PR_SET_MM_ENV_END 11 #define PR_SET_MM_AUXV 12 #define PR_SET_MM_EXE_FILE 13 #endif #ifndef PR_SET_MM_MAP #define PR_SET_MM_MAP 14 #define PR_SET_MM_MAP_SIZE 15 struct prctl_mm_map { u64 start_code; u64 end_code; u64 start_data; u64 end_data; u64 start_brk; u64 brk; u64 start_stack; u64 arg_start; u64 arg_end; u64 env_start; u64 env_end; u64 *auxv; u32 auxv_size; u32 exe_fd; }; #endif #ifndef PR_GET_TID_ADDRESS #define PR_GET_TID_ADDRESS 40 #endif #ifndef PR_SET_THP_DISABLE #define PR_SET_THP_DISABLE 41 #endif #ifndef PR_GET_THP_DISABLE #define PR_GET_THP_DISABLE 42 #endif #endif /* __CR_PRCTL_H__ */ crac-criu-1.5.0/criu/include/proc_parse.h000066400000000000000000000047111471504326700202740ustar00rootroot00000000000000#ifndef __CR_PROC_PARSE_H__ #define __CR_PROC_PARSE_H__ #include #include "compel/infect.h" #define PROC_TASK_COMM_LEN 32 #define PROC_TASK_COMM_LEN_FMT "(%31s" struct proc_pid_stat { int pid; char comm[PROC_TASK_COMM_LEN]; char state; int ppid; int pgid; int sid; int tty_nr; int tty_pgrp; unsigned int flags; unsigned long min_flt; unsigned long cmin_flt; unsigned long maj_flt; unsigned long cmaj_flt; unsigned long utime; unsigned long stime; long cutime; long cstime; long priority; long nice; int num_threads; int zero0; unsigned long long start_time; unsigned long vsize; long mm_rss; unsigned long rsslim; unsigned long start_code; unsigned long end_code; unsigned long start_stack; unsigned long esp; unsigned long eip; unsigned long sig_pending; unsigned long sig_blocked; unsigned long sig_ignored; unsigned long sig_handled; unsigned long wchan; unsigned long zero1; unsigned long zero2; int exit_signal; int task_cpu; unsigned int 
rt_priority; unsigned int policy; unsigned long long delayacct_blkio_ticks; unsigned long gtime; long cgtime; unsigned long start_data; unsigned long end_data; unsigned long start_brk; unsigned long arg_start; unsigned long arg_end; unsigned long env_start; unsigned long env_end; int exit_code; }; #define PROC_CAP_SIZE 2 struct proc_status_creds { struct seize_task_status s; unsigned int uids[4]; unsigned int gids[4]; u32 last_filter; /* * Keep them at the end of structure * for fast comparison reason. */ u32 cap_inh[PROC_CAP_SIZE]; u32 cap_prm[PROC_CAP_SIZE]; u32 cap_eff[PROC_CAP_SIZE]; u32 cap_bnd[PROC_CAP_SIZE]; }; #define INVALID_UID ((uid_t)-1) extern int parse_pid_stat(pid_t pid, struct proc_pid_stat *s); extern unsigned int parse_pid_loginuid(pid_t pid, int *err, bool ignore_noent); extern int parse_pid_oom_score_adj(pid_t pid, int *err); extern int prepare_loginuid(unsigned int value); extern int parse_pid_status(pid_t pid, struct seize_task_status *, void *data); extern int parse_file_locks(void); extern int get_fd_mntid(int fd, int *mnt_id); struct pid; extern int parse_threads(int pid, struct pid **_t, int *_n); int parse_children(pid_t pid, pid_t **_c, int *_n); extern bool is_vma_range_fmt(char *line); extern void parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf); extern int parse_uptime(uint64_t *upt); extern int parse_timens_offsets(struct timespec *boff, struct timespec *moff); #endif /* __CR_PROC_PARSE_H__ */ crac-criu-1.5.0/criu/include/protobuf-desc.h000066400000000000000000000033611471504326700207130ustar00rootroot00000000000000#ifndef __CR_PROTOBUF_DESC_H__ #define __CR_PROTOBUF_DESC_H__ #include #include enum { /* PB_AUTOGEN_START */ PB_INVENTORY, /* 0 */ PB_STATS, PB_FDINFO, PB_CORE, PB_MM, PB_VMA, PB_ITIMER, PB_POSIX_TIMER, PB_CREDS, PB_FS, PB_UTSNS, /* 10 */ PB_IPC_VAR, PB_IPC_SHM, PB_IPC_SEM, PB_MNT, PB_PSTREE, PB_GHOST_FILE, PB_TCP_STREAM, PB_REG_FILE, PB_EXT_FILE, PB_NS_FILE, /* 20 */ PB_INET_SK, PB_UNIX_SK, PB_PACKET_SOCK, 
PB_NETLINK_SK, PB_PIPE, PB_FIFO, PB_PIPE_DATA, PB_EVENTFD_FILE, PB_EVENTPOLL_FILE, PB_EVENTPOLL_TFD, /* 30 */ PB_SIGNALFD, PB_INOTIFY_FILE, PB_INOTIFY_WD, PB_FANOTIFY_FILE, PB_FANOTIFY_MARK, PB_TTY_FILE, PB_TTY_INFO, PB_FILE_LOCK, PB_RLIMIT, PB_PAGEMAP, /* 40 */ PB_SIGINFO, PB_TUNFILE, PB_IRMAP_CACHE, PB_CGROUP, PB_SECCOMP, PB_TIMERFD, PB_CPUINFO, PB_USERNS, PB_NETNS, PB_BINFMT_MISC, /* 50 */ PB_TTY_DATA, PB_AUTOFS, PB_GHOST_CHUNK, PB_FILE, PB_MEMFD_FILE, PB_MEMFD_INODE, PB_TIMENS, PB_IMG_STREAMER_REQUEST, PB_IMG_STREAMER_REPLY, PB_PIDNS, PB_BPFMAP_FILE, PB_BPFMAP_DATA, PB_APPARMOR, /* PB_AUTOGEN_STOP */ PB_PAGEMAP_HEAD, PB_IDS, PB_SIGACT, PB_NETDEV, PB_REMAP_FPATH, PB_SK_QUEUES, PB_IPCNS_MSG, PB_IPCNS_MSG_ENT, PB_MAX, }; typedef size_t (*pb_getpksize_t)(void *obj); typedef size_t (*pb_pack_t)(void *obj, void *where); typedef void *(*pb_unpack_t)(void *allocator, size_t size, void *from); typedef void (*pb_free_t)(void *obj, void *allocator); struct cr_pb_message_desc { pb_getpksize_t getpksize; pb_pack_t pack; pb_unpack_t unpack; pb_free_t free; const ProtobufCMessageDescriptor *pb_desc; }; extern void cr_pb_init(void); extern struct cr_pb_message_desc cr_pb_descs[PB_MAX]; #endif /* __CR_PROTOBUF_DESC_H__ */ crac-criu-1.5.0/criu/include/protobuf.h000066400000000000000000000032271471504326700200000ustar00rootroot00000000000000#ifndef __CR_PROTOBUF_H__ #define __CR_PROTOBUF_H__ #include #include "protobuf-desc.h" #include "common/compiler.h" #include "util.h" struct cr_img; extern int do_pb_read_one(struct cr_img *, void **objp, int type, bool eof); #define pb_read_one(fd, objp, type) do_pb_read_one(fd, (void **)objp, type, false) #define pb_read_one_eof(fd, objp, type) do_pb_read_one(fd, (void **)objp, type, true) extern int pb_write_one(struct cr_img *, void *obj, int type); #define pb_pksize(__obj, __proto_message_name) (__proto_message_name##__get_packed_size(__obj) + sizeof(u32)) #define pb_repeated_size(__obj, __member) ((size_t)(sizeof(*(__obj)->__member) * 
(__obj)->n_##__member)) #define pb_msg(__base, __type) container_of(__base, __type, base) #include struct collect_image_info { int fd_type; int pb_type; unsigned int priv_size; int (*collect)(void *, ProtobufCMessage *, struct cr_img *); unsigned flags; }; #define COLLECT_SHARED 0x1 /* use shared memory for obj-s */ #define COLLECT_NOFREE 0x2 /* don't free entry after callback */ #define COLLECT_HAPPENED 0x4 /* image was opened and collected */ extern int collect_image(struct collect_image_info *); extern int collect_entry(ProtobufCMessage *base, struct collect_image_info *cinfo); static inline int collect_images(struct collect_image_info **array, unsigned size) { int i; for (i = 0; i < size; i++) { if (collect_image(array[i])) return -1; } return 0; } /* * To speed up reading of packed objects * by providing space on stack, this should * be more than enough for most objects. */ #define PB_PKOBJ_LOCAL_SIZE 1024 #endif /* __CR_PROTOBUF_H__ */ crac-criu-1.5.0/criu/include/pstree.h000066400000000000000000000071501471504326700174410ustar00rootroot00000000000000#ifndef __CR_PSTREE_H__ #define __CR_PSTREE_H__ #include "common/list.h" #include "common/lock.h" #include "pid.h" #include "xmalloc.h" #include "images/core.pb-c.h" /* * That's the init process which usually inherit * all orphaned children in the system. 
*/ #define INIT_PID (1) struct pstree_item { struct pstree_item *parent; struct list_head children; /* list of my children */ struct list_head sibling; /* linkage in my parent's children list */ struct pid *pid; pid_t pgid; pid_t sid; pid_t born_sid; int nr_threads; /* number of threads */ struct pid *threads; /* array of threads */ CoreEntry **core; TaskKobjIdsEntry *ids; union { futex_t task_st; unsigned long task_st_le_bits; }; }; static inline pid_t vpid(const struct pstree_item *i) { return i->pid->ns[0].virt; } enum { FDS_EVENT_BIT = 0, }; #define FDS_EVENT (1 << FDS_EVENT_BIT) extern struct pstree_item *current; struct rst_info; /* See alloc_pstree_item() for details */ static inline struct rst_info *rsti(struct pstree_item *i) { return (struct rst_info *)(i + 1); } struct thread_lsm { char *profile; char *sockcreate; }; struct ns_id; struct dmp_info { struct ns_id *netns; struct page_pipe *mem_pp; struct parasite_ctl *parasite_ctl; struct parasite_thread_ctl **thread_ctls; uint64_t *thread_sp; struct criu_rseq_cs *thread_rseq_cs; /* * Although we don't support dumping different struct creds in general, * we do for threads. Let's keep track of their profiles here; a NULL * entry means there was no LSM profile for this thread. 
*/ struct thread_lsm **thread_lsms; }; static inline struct dmp_info *dmpi(const struct pstree_item *i) { return (struct dmp_info *)(i + 1); } /* ids is allocated and initialized for all alive tasks */ static inline int shared_fdtable(struct pstree_item *item) { return (item->parent && item->ids->files_id == item->parent->ids->files_id); } static inline bool is_alive_state(int state) { return (state == TASK_ALIVE) || (state == TASK_STOPPED); } static inline bool task_alive(struct pstree_item *i) { return is_alive_state(i->pid->state); } extern void free_pstree(struct pstree_item *root_item); extern struct pstree_item *__alloc_pstree_item(bool rst); #define alloc_pstree_item() __alloc_pstree_item(false) extern int init_pstree_helper(struct pstree_item *ret); extern struct pstree_item *lookup_create_item(pid_t pid); extern void pstree_insert_pid(struct pid *pid_node); extern struct pid *pstree_pid_by_virt(pid_t pid); extern struct pstree_item *root_item; extern struct pstree_item *pstree_item_next(struct pstree_item *item); #define for_each_pstree_item(pi) for (pi = root_item; pi != NULL; pi = pstree_item_next(pi)) extern bool restore_before_setsid(struct pstree_item *child); extern int prepare_pstree(void); extern int prepare_dummy_pstree(void); extern int dump_pstree(struct pstree_item *root_item); struct pstree_item *pstree_item_by_real(pid_t virt); struct pstree_item *pstree_item_by_virt(pid_t virt); extern int pid_to_virt(pid_t pid); struct task_entries; extern struct task_entries *task_entries; extern int prepare_task_entries(void); extern int prepare_dummy_task_state(struct pstree_item *pi); extern int get_task_ids(struct pstree_item *); extern TaskKobjIdsEntry *root_ids; extern void core_entry_free(CoreEntry *core); extern CoreEntry *core_entry_alloc(int alloc_thread_info, int alloc_tc); extern int pstree_alloc_cores(struct pstree_item *item); extern void pstree_free_cores(struct pstree_item *item); extern int collect_pstree_ids(void); extern int 
preorder_pstree_traversal(struct pstree_item *item, int (*f)(struct pstree_item *)); #endif /* __CR_PSTREE_H__ */ crac-criu-1.5.0/criu/include/ptrace-compat.h000066400000000000000000000005261471504326700206760ustar00rootroot00000000000000#ifndef __CR_PTRACE_H__ #define __CR_PTRACE_H__ #include #include #include "common/config.h" #ifndef CONFIG_HAS_PTRACE_PEEKSIGINFO struct ptrace_peeksiginfo_args { __u64 off; /* from which siginfo to start */ __u32 flags; __u32 nr; /* how may siginfos to take */ }; #endif #endif /* __CR_PTRACE_H__ */ crac-criu-1.5.0/criu/include/rbtree.h000066400000000000000000000052601471504326700174220ustar00rootroot00000000000000/* * RBtree implementation adopted from the Linux kernel sources. */ #ifndef __CR_RBTREE_H__ #define __CR_RBTREE_H__ #include #include "common/compiler.h" #define RB_RED 0 #define RB_BLACK 1 #define RB_MASK 3 struct rb_node { unsigned long rb_parent_color; /* Keeps both parent anc color */ struct rb_node *rb_right; struct rb_node *rb_left; } __aligned(sizeof(long)); struct rb_root { struct rb_node *rb_node; }; #define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~RB_MASK)) #define rb_color(r) ((r)->rb_parent_color & RB_BLACK) #define rb_is_red(r) (!rb_color(r)) #define rb_is_black(r) (rb_color(r)) #define rb_set_red(r) \ do { \ (r)->rb_parent_color &= ~RB_BLACK; \ } while (0) #define rb_set_black(r) \ do { \ (r)->rb_parent_color |= RB_BLACK; \ } while (0) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->rb_parent_color = (rb->rb_parent_color & RB_MASK) | (unsigned long)p; } static inline void rb_set_color(struct rb_node *rb, int color) { rb->rb_parent_color = (rb->rb_parent_color & ~RB_BLACK) | color; } #define RB_ROOT \ (struct rb_root) \ { \ NULL, \ } #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) #define RB_EMPTY_NODE(node) (rb_parent(node) == node) #define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) 
static inline void rb_init_node(struct rb_node *node) { *node = (struct rb_node){}; RB_CLEAR_NODE(node); } extern void rb_insert_color(struct rb_node *node, struct rb_root *root); extern void rb_erase(struct rb_node *node, struct rb_root *root); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_first(const struct rb_root *root); extern struct rb_node *rb_last(const struct rb_root *root); extern struct rb_node *rb_next(const struct rb_node *node); extern struct rb_node *rb_prev(const struct rb_node *node); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } static inline void rb_link_and_balance(struct rb_root *root, struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { rb_link_node(node, parent, rb_link); rb_insert_color(node, root); } #endif /* __CR_RBTREE_H__ */ crac-criu-1.5.0/criu/include/restore.h000066400000000000000000000002731471504326700176210ustar00rootroot00000000000000#ifndef __CR_INC_RESTORE_H__ #define __CR_INC_RESTORE_H__ #include "pid.h" #include "types.h" #include "asm/restore.h" extern int arch_set_thread_regs_nosigrt(struct pid *pid); #endif crac-criu-1.5.0/criu/include/restorer.h000066400000000000000000000200321471504326700177760ustar00rootroot00000000000000#ifndef __CR_RESTORER_H__ #define __CR_RESTORER_H__ #include #include #include #include #include "common/config.h" #include "types.h" #include "int.h" #include "types.h" #include "common/compiler.h" #include #include "common/lock.h" #include "util.h" #include "asm/restorer.h" #include "posix-timer.h" #include "timerfd.h" #include "shmem.h" #include "parasite-vdso.h" #include "fault-injection.h" #include #include 
"images/mm.pb-c.h" /* * These *must* be power of two values. */ #define RESTORE_ARGS_SIZE (512) #define RESTORE_STACK_REDZONE (128) #define RESTORE_STACK_SIZE (KILO(32)) struct restore_mem_zone { u8 redzone[RESTORE_STACK_REDZONE]; u8 stack[RESTORE_STACK_SIZE]; u8 rt_sigframe[RESTORE_STACK_SIGFRAME]; } __stack_aligned__; struct rst_sched_param { int policy; int nice; int prio; }; struct rst_rseq_param { u64 rseq_abi_pointer; u32 rseq_abi_size; u32 signature; }; struct restore_posix_timer { struct str_posix_timer spt; struct itimerspec val; int overrun; }; /* * We should be able to construct fpu sigframe in sigreturn_prep_fpu_frame, * so the mem_zone.rt_sigframe should be 64-bytes aligned. To make things * simpler, force both _args alignment be 64 bytes. */ struct thread_creds_args { CredsEntry creds; unsigned int cap_last_cap; u32 cap_inh[CR_CAP_SIZE]; u32 cap_prm[CR_CAP_SIZE]; u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; char *lsm_profile; unsigned int *groups; char *lsm_sockcreate; unsigned long mem_lsm_profile_pos; unsigned long mem_lsm_sockcreate_pos; unsigned long mem_groups_pos; unsigned long mem_pos_next; }; struct thread_seccomp_filter { struct sock_fprog sock_fprog; unsigned int flags; }; struct thread_restore_args { struct restore_mem_zone *mz; int pid; UserRegsEntry gpregs; u64 clear_tid_addr; u64 futex_rla; u32 futex_rla_len; struct rst_sched_param sp; struct task_restore_args *ta; tls_t tls; struct rst_rseq_param rseq; siginfo_t *siginfo; unsigned int siginfo_n; int pdeath_sig; struct thread_creds_args *creds_args; int seccomp_mode; unsigned long seccomp_filters_pos; struct thread_seccomp_filter *seccomp_filters; void *seccomp_filters_data; unsigned int seccomp_filters_n; bool seccomp_force_tsync; char comm[TASK_COMM_LEN]; int cg_set; int cgroupd_sk; } __aligned(64); typedef long (*thread_restore_fcall_t)(struct thread_restore_args *args); struct restore_vma_io { int nr_iovs; loff_t off; struct iovec iovs[0]; }; #define RIO_SIZE(niovs) 
(sizeof(struct restore_vma_io) + (niovs) * sizeof(struct iovec)) struct task_restore_args { struct thread_restore_args *t; /* thread group leader */ int fd_exe_link; /* opened self->exe file */ int logfd; unsigned int loglevel; struct timeval logstart; int uffd; bool thp_disabled; /* threads restoration */ int nr_threads; /* number of threads */ thread_restore_fcall_t clone_restore_fn; /* helper address for clone() call */ struct thread_restore_args *thread_args; /* array of thread arguments */ struct task_entries *task_entries; void *rst_mem; unsigned long rst_mem_size; /* Below arrays get remapped from RM_PRIVATE in sigreturn_restore */ VmaEntry *vmas; unsigned int vmas_n; int vma_ios_fd; struct restore_vma_io *vma_ios; unsigned int vma_ios_n; struct restore_posix_timer *posix_timers; unsigned int posix_timers_n; struct restore_timerfd *timerfd; unsigned int timerfd_n; siginfo_t *siginfo; unsigned int siginfo_n; struct rst_tcp_sock *tcp_socks; unsigned int tcp_socks_n; struct rst_aio_ring *rings; unsigned int rings_n; struct rlimit64 *rlims; unsigned int rlims_n; pid_t *helpers /* the TASK_HELPERS to wait on at the end of restore */; unsigned int helpers_n; pid_t *zombies; unsigned int zombies_n; int *inotify_fds; /* fds to cleanup inotify events at CR_STATE_RESTORE_SIGCHLD stage */ unsigned int inotify_fds_n; /* * * * * * * * * * * * * * * * * * * * */ unsigned long task_size; unsigned long premmapped_addr; unsigned long premmapped_len; rt_sigaction_t sigchld_act; void *bootstrap_start; unsigned long bootstrap_len; struct itimerval itimers[3]; MmEntry mm; auxv_t mm_saved_auxv[AT_VECTOR_SIZE]; u32 mm_saved_auxv_size; char comm[TASK_COMM_LEN]; /* * proc_fd is a handle to /proc that the restorer blob can use to open * files there, because some of them can't be opened before the * restorer blob is called. 
*/ int proc_fd; int seccomp_mode; bool compatible_mode; bool can_map_vdso; bool auto_dedup; unsigned long vdso_rt_size; struct vdso_maps vdso_maps_rt; /* runtime vdso symbols */ unsigned long vdso_rt_parked_at; /* safe place to keep vdso */ void **breakpoint; enum faults fault_strategy; #ifdef ARCH_HAS_LONG_PAGES unsigned page_size; #endif int lsm_type; int child_subreaper; int membarrier_registration_mask; bool has_clone3_set_tid; bool mmap_page_image; bool ptrace_allowed; /* * info about rseq from libc used to * unregister it before memory restoration procedure */ struct rst_rseq_param libc_rseq; uid_t uid; u32 cap_eff[CR_CAP_SIZE]; } __aligned(64); /* * For arm64 stack needs to aligned to 16 bytes. * Hence align to 16 bytes for all */ #define RESTORE_ALIGN_STACK(start, size) (ALIGN((start) + (size)-16, 16)) static inline unsigned long restorer_stack(struct restore_mem_zone *mz) { return RESTORE_ALIGN_STACK((long)&mz->stack, RESTORE_STACK_SIZE); } enum { /* * Restore stages. The stage is started by criu process, then * confirmed by all tasks involved in it. Then criu does some * actions and starts the next stage. * * The first stated stage is CR_STATE_ROOT_TASK which is started * right before calling fork_with_pid() for the root_item. */ CR_STATE_FAIL = -1, /* * Root task is created and does some pre-checks. * After the stage ACT_SETUP_NS scripts are performed. */ CR_STATE_ROOT_TASK = 0, /* * The prepare_namespace() is called. * After the stage criu opens root task's mntns and * calls ACT_POST_SETUP_NS scripts. */ CR_STATE_PREPARE_NAMESPACES, /* * All tasks fork and call open_transport_socket(). * Stage is needed to make sure they all have the socket. * Also this stage is a sync point after which the * fini_restore_mntns() can be called. * * This stage is a little bit special. 
Normally all stages * are controlled by criu process, but when this stage * starts criu process starts waiting for the tasks to * finish it, but by the time it gets woken up the stage * finished is CR_STATE_RESTORE. The forking stage is * barrier-ed by the root task, this task is also the one * that switches the stage (into restoring). * * The above is done to lower the amount of context * switches from root task to criu and back, since the * separate forking stage is not needed by criu, it's * purely to make sure all tasks be in sync. */ CR_STATE_FORKING, /* * Main restore stage. By the end of it all tasks are * almost ready and what's left is: * pick up zombies and helpers * restore sigchild handlers used to detect restore errors * restore credentials, seccomp, dumpable and pdeath_sig */ CR_STATE_RESTORE, /* * Tasks restore sigchild handlers. * Stage is needed to synchronize the change in error * propagation via sigchild. */ CR_STATE_RESTORE_SIGCHLD, /* * Final stage. * For security reason processes can be resumed only when all * credentials are restored. Otherwise someone can attach to a * process, which are not restored credentials yet and execute * some code. * Seccomp needs to be restored after creds. * Dumpable and pdeath signal are restored after seccomp. */ CR_STATE_RESTORE_CREDS, CR_STATE_COMPLETE }; #define restore_finish_stage(__v, __stage) \ ({ \ futex_dec_and_wake(&(__v)->nr_in_progress); \ futex_wait_while(&(__v)->start, __stage); \ (s32) futex_get(&(__v)->start); \ }) #define __r_sym(name) restorer_sym##name #define restorer_sym(rblob, name) (void *)(rblob + __r_sym(name)) #endif /* __CR_RESTORER_H__ */ crac-criu-1.5.0/criu/include/rst-malloc.h000066400000000000000000000051771471504326700202230ustar00rootroot00000000000000#ifndef __CR_RST_MALLOC__H__ #define __CR_RST_MALLOC__H__ /* * On restore we need differetn types of memory allocation. * Here's an engine that tries to generalize them all. 
The * main difference is in how the buffer with objects is being * grown up. * * Buffers, that are to be used by restorer will be remapped * into restorer address space with rst_mem_remap() call. Thus * we have to either keep track of all the buffers and objects, * or keep objects one-by-one in a plain linear buffer. The * engine uses the 2nd approach. */ enum { /* * Shared non-remapable allocations. These can happen only * in "global" context, i.e. when objects are allocated to * be used by any process to be restored. The objects are * not going to be used in restorer blob, thus allocation * engine grows buffers in a simple manner. */ RM_SHARED, /* * Shared objects, that are about to be used in restorer * blob. For these the *_remap_* stuff below is used to get * the actual pointer on any object. Growing a buffer is * done with mremap, so that we don't have to keep track * of all the buffer chunks and can remap them in restorer * in one call. */ RM_SHREMAP, /* * Privately used objects. Buffer grow and remap is the * same as for SHREMAP, but memory regions are MAP_PRIVATE. */ RM_PRIVATE, RST_MEM_TYPES, }; /* * Disables SHARED and SHREMAP allocations, turns on PRIVATE */ extern void rst_mem_switch_to_private(void); /* * Reports a cookie of a current shared buffer position, that * can later be used in rst_mem_remap_ptr() to find out the object * pointer in the restorer blob. */ extern unsigned long rst_mem_align_cpos(int type); extern void *rst_mem_remap_ptr(unsigned long pos, int type); #define RST_MEM_FIXUP_PPTR(ptr) \ do { \ ptr = rst_mem_remap_ptr((unsigned long)ptr, RM_PRIVATE); \ } while (0) /* * Allocate and free objects. We don't need to free arbitrary * object, thus allocation is simple (linear) and only the * last object can be freed (pop-ed from buffer). */ extern void *rst_mem_alloc(unsigned long size, int type); extern void rst_mem_free_last(int type); /* Word-align the current freelist pointer for the next allocation. 
If we don't * align pointers, some futex and atomic operations can fail. */ extern void rst_mem_align(int type); /* * Routines to remap SHREMAP and PRIVATE into restorer address space */ extern unsigned long rst_mem_lock(void); extern int rst_mem_remap(void *to); extern void *shmalloc(size_t bytes); extern void shfree_last(void *ptr); #endif /* __CR_RST_MALLOC__H__ */ crac-criu-1.5.0/criu/include/rst_info.h000066400000000000000000000034011471504326700177550ustar00rootroot00000000000000#ifndef __CR_RST_INFO_H__ #define __CR_RST_INFO_H__ #include "common/lock.h" #include "common/list.h" #include "vma.h" #include "kerndat.h" #include "images/mm.pb-c.h" #include "images/core.pb-c.h" struct task_entries { int nr_threads, nr_tasks, nr_helpers; futex_t nr_in_progress; futex_t start; atomic_t cr_err; mutex_t userns_sync_lock; mutex_t last_pid_mutex; }; struct fdt { int nr; /* How many tasks share this fd table */ pid_t pid; /* Who should restore this fd table */ /* * The fd table is ready for restoing, if fdt_lock is equal to nr * The fdt table was restrored, if fdt_lock is equal to nr + 1 */ futex_t fdt_lock; }; struct rst_rseq { uint64_t rseq_abi_pointer; uint64_t rseq_cs_pointer; }; struct rst_info { struct list_head fds; void *premmapped_addr; unsigned long premmapped_len; unsigned long clone_flags; void *munmap_restorer; int service_fd_id; struct fdt *fdt; struct vm_area_list vmas; MmEntry *mm; struct list_head vma_io; unsigned int pages_img_id; u32 cg_set; union { struct pstree_item *pgrp_leader; futex_t pgrp_set; }; struct file_desc *cwd; struct file_desc *root; bool has_umask; u32 umask; /* * We set this flag when process has seccomp filters * so that we know to suspend them before we unmap the * restorer blob. */ bool has_seccomp; /* * To be compatible with old images where filters * are bound to group leader and we need to use tsync flag. 
*/ bool has_old_seccomp_filter; struct rst_rseq *rseqe; void *breakpoint; }; extern struct task_entries *task_entries; static inline void lock_last_pid(void) { mutex_lock(&task_entries->last_pid_mutex); } static inline void unlock_last_pid(void) { mutex_unlock(&task_entries->last_pid_mutex); } #endif /* __CR_RST_INFO_H__ */ crac-criu-1.5.0/criu/include/sched.h000066400000000000000000000014361471504326700172260ustar00rootroot00000000000000#ifndef __CR_SCHED_H__ #define __CR_SCHED_H__ #include #ifndef ptr_to_u64 #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr))) #endif #ifndef u64_to_ptr #define u64_to_ptr(x) ((void *)(uintptr_t)x) #endif /* * This structure is needed by clone3(). The kernel * calls it 'struct clone_args'. As CRIU will always * need at least this part of the structure (VER1) * to be able to test if clone3() with set_tid works, * the structure is defined here as 'struct _clone_args'. */ struct _clone_args { __aligned_u64 flags; __aligned_u64 pidfd; __aligned_u64 child_tid; __aligned_u64 parent_tid; __aligned_u64 exit_signal; __aligned_u64 stack; __aligned_u64 stack_size; __aligned_u64 tls; __aligned_u64 set_tid; __aligned_u64 set_tid_size; }; #endif /* __CR_SCHED_H__ */ crac-criu-1.5.0/criu/include/seccomp.h000066400000000000000000000035711471504326700175730ustar00rootroot00000000000000#ifndef __CR_SECCOMP_H__ #define __CR_SECCOMP_H__ #include #include #include "images/seccomp.pb-c.h" #include "images/core.pb-c.h" #ifndef SECCOMP_MODE_DISABLED #define SECCOMP_MODE_DISABLED 0 #endif #ifndef SECCOMP_MODE_STRICT #define SECCOMP_MODE_STRICT 1 #endif #ifndef SECCOMP_MODE_FILTER #define SECCOMP_MODE_FILTER 2 #endif #ifndef SECCOMP_SET_MODE_FILTER #define SECCOMP_SET_MODE_FILTER 1 #endif #ifndef SECCOMP_FILTER_FLAG_TSYNC #define SECCOMP_FILTER_FLAG_TSYNC 1 #endif struct thread_restore_args; struct task_restore_args; struct pstree_item; struct rb_node; /* * seccomp filters are bound to @current->seccomp.filter * in the kernel, ie they are per thread 
structures. * * If filter is assigned then every subsequent call * to fork() makes a copy of this @current->seccomp.filter * pointer into child process. * * The thread group can share a filter if the filter * is assigned with SECCOMP_FILTER_FLAG_TSYNC on group * which has no filters yet. */ struct seccomp_filter_chain { struct seccomp_filter_chain *prev; SeccompFilter filter; }; struct seccomp_entry { struct rb_node node; struct seccomp_entry *next; pid_t tid_real; size_t img_filter_pos; unsigned int mode; struct seccomp_filter_chain *chain; size_t nr_chains; }; extern struct seccomp_entry *seccomp_lookup(pid_t tid_real, bool create, bool mandatory); #define seccomp_find_entry(tid_real) seccomp_lookup(tid_real, false, true) extern int seccomp_collect_entry(pid_t tid_real, unsigned int mode); extern void seccomp_free_entries(void); extern int seccomp_dump_thread(pid_t tid_real, ThreadCoreEntry *thread_core); extern int seccomp_collect_dump_filters(void); extern int seccomp_read_image(void); extern int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta); extern void seccomp_rst_reloc(struct thread_restore_args *thread_arg); #endif crac-criu-1.5.0/criu/include/seize.h000066400000000000000000000005401471504326700172520ustar00rootroot00000000000000#ifndef __CR_SEIZE_H__ #define __CR_SEIZE_H__ extern int collect_pstree(void); extern void pstree_switch_state(struct pstree_item *root_item, int st); extern const char *get_real_freezer_state(void); extern bool alarm_timeouted(void); extern char *task_comm_info(pid_t pid, char *comm, size_t size); extern char *__task_comm_info(pid_t pid); #endif crac-criu-1.5.0/criu/include/servicefd.h000066400000000000000000000025661471504326700201170ustar00rootroot00000000000000#ifndef __CR_SERVICE_FD_H__ #define __CR_SERVICE_FD_H__ #include #include #include #include #include #include "criu-log.h" enum sfd_type { SERVICE_FD_MIN, LOG_FD_OFF, IMG_FD_OFF, IMG_STREAMER_FD_OFF, PROC_FD_OFF, /* fd with /proc for all 
proc_ calls */ PROC_PID_FD_OFF, PROC_SELF_FD_OFF, CR_PROC_FD_OFF, /* some other's proc fd: * - For dump -- target ns' proc * - For restore -- CRIU ns' proc */ ROOT_FD_OFF, /* Root of the namespace we dump/restore */ CGROUP_YARD, CGROUPD_SK, /* Socket for cgroupd to fix up thread's cgroup controller */ USERNSD_SK, /* Socket for usernsd */ NS_FD_OFF, /* Node's net namespace fd */ TRANSPORT_FD_OFF, /* to transfer file descriptors */ RPC_SK_OFF, FDSTORE_SK_OFF, SERVICE_FD_MAX }; struct pstree_item; extern bool sfds_protected; extern const char *sfd_type_name(enum sfd_type type); extern int init_service_fd(void); extern int get_service_fd(enum sfd_type type); extern bool is_any_service_fd(int fd); extern bool is_service_fd(int fd, enum sfd_type type); extern int service_fd_min_fd(struct pstree_item *item); extern int install_service_fd(enum sfd_type type, int fd); extern int close_service_fd(enum sfd_type type); extern void __close_service_fd(enum sfd_type type); extern int clone_service_fd(struct pstree_item *me); #endif /* __CR_SERVICE_FD_H__ */ crac-criu-1.5.0/criu/include/setproctitle.h000066400000000000000000000003321471504326700206530ustar00rootroot00000000000000#ifndef __CR_SETPROCTITLE_H__ #define __CR_SETPROCTITLE_H__ extern void __setproctitle_init(int argc, char *argv[], char *envp[]); extern void __setproctitle(const char *fmt, ...); #endif /* __CR_SETPROCTITLE_H__ */ crac-criu-1.5.0/criu/include/shmem.h000066400000000000000000000014571471504326700172540ustar00rootroot00000000000000#ifndef __CR_SHMEM_H__ #define __CR_SHMEM_H__ #include "int.h" #include "common/lock.h" #include "images/vma.pb-c.h" struct vma_area; extern int collect_shmem(int pid, struct vma_area *vma); extern int collect_sysv_shmem(unsigned long shmid, unsigned long size); extern int cr_dump_shmem(void); extern int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map); extern int fixup_sysv_shmems(void); extern int dump_one_memfd_shmem(int fd, unsigned long shmid, unsigned long size); extern 
int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid); extern int restore_sysv_shmem_content(void *addr, unsigned long size, unsigned long shmid); extern int restore_memfd_shmem_content(int fd, unsigned long shmid, unsigned long size); #define SYSV_SHMEM_SKIP_FD (0x7fffffff) #endif /* __CR_SHMEM_H__ */ crac-criu-1.5.0/criu/include/sigframe.h000066400000000000000000000005031471504326700177270ustar00rootroot00000000000000/* * Generic sigframe bits. */ #ifndef __CR_SIGFRAME_H__ #define __CR_SIGFRAME_H__ #include #include "images/core.pb-c.h" extern int construct_sigframe(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe, k_rtsigset_t *blkset, CoreEntry *core); #endif /* __CR_SIGFRAME_H__ */ crac-criu-1.5.0/criu/include/signalfd.h000066400000000000000000000004041471504326700177210ustar00rootroot00000000000000#ifndef __CR_SIGNALFD_H__ #define __CR_SIGNALFD_H__ struct cr_imgset; struct fd_parms; extern int is_signalfd_link(char *link); extern const struct fdtype_ops signalfd_dump_ops; extern struct collect_image_info signalfd_cinfo; #endif /* __CR_SIGNALFD_H__ */ crac-criu-1.5.0/criu/include/sizes.h000066400000000000000000000021521471504326700172710ustar00rootroot00000000000000#ifndef __CR_SIZES_H__ #define __CR_SIZES_H__ /* * Copied from the Linux kernel header include/linux/sizes.h */ #define SZ_1 0x00000001 #define SZ_2 0x00000002 #define SZ_4 0x00000004 #define SZ_8 0x00000008 #define SZ_16 0x00000010 #define SZ_32 0x00000020 #define SZ_64 0x00000040 #define SZ_128 0x00000080 #define SZ_256 0x00000100 #define SZ_512 0x00000200 #define SZ_1K 0x00000400 #define SZ_2K 0x00000800 #define SZ_4K 0x00001000 #define SZ_8K 0x00002000 #define SZ_16K 0x00004000 #define SZ_32K 0x00008000 #define SZ_64K 0x00010000 #define SZ_128K 0x00020000 #define SZ_256K 0x00040000 #define SZ_512K 0x00080000 #define SZ_1M 0x00100000 #define SZ_2M 0x00200000 #define SZ_4M 0x00400000 #define SZ_8M 0x00800000 #define SZ_16M 0x01000000 #define SZ_32M 0x02000000 
#define SZ_64M 0x04000000 #define SZ_128M 0x08000000 #define SZ_256M 0x10000000 #define SZ_512M 0x20000000 #define SZ_1G 0x40000000 #define SZ_2G 0x80000000 #define SZ_4G 0x100000000ULL #define SZ_8G 0x200000000ULL #define SZ_16G 0x400000000ULL #define SZ_32G 0x800000000ULL #define SZ_64T 0x400000000000ULL #endif /* __CR_SIZES_H__ */ crac-criu-1.5.0/criu/include/sk-inet.h000066400000000000000000000046331471504326700175140ustar00rootroot00000000000000#ifndef __CR_SK_INET_H__ #define __CR_SK_INET_H__ #include #include "sockets.h" #include "files.h" #include "common/list.h" #include "images/sk-inet.pb-c.h" #define INET_ADDR_LEN 48 /* max of INET_ADDRSTRLEN and INET6_ADDRSTRLEN */ #ifndef TCP_REPAIR #define TCP_REPAIR 19 /* TCP sock is under repair right now */ #define TCP_REPAIR_QUEUE 20 #define TCP_QUEUE_SEQ 21 #define TCP_REPAIR_OPTIONS 22 #endif #ifndef IP_HDRINCL #define IP_HDRINCL 3 #endif #ifndef IP_NODEFRAG #define IP_NODEFRAG 22 #endif #ifndef IPV6_HDRINCL #define IPV6_HDRINCL 36 #endif struct inet_sk_desc { struct socket_desc sd; unsigned int type; unsigned int src_port; unsigned int dst_port; unsigned int state; unsigned int rqlen; unsigned int wqlen; /* sent + unsent data */ unsigned int uwqlen; /* unsent data */ unsigned int src_addr[4]; unsigned int dst_addr[4]; unsigned short shutdown; bool cork; int rfd; int cpt_reuseaddr; struct list_head rlist; void *priv; }; struct inet_port; struct inet_sk_info { InetSkEntry *ie; struct file_desc d; struct inet_port *port; struct list_head port_list; /* * This is an fd by which the socket is opened. * It will be carried down to restorer code to * repair-off the socket at the very end. */ int sk_fd; struct list_head rlist; }; extern int inet_bind(int sk, struct inet_sk_info *); extern int inet_connect(int sk, struct inet_sk_info *); #ifdef CR_NOGLIBC #define setsockopt sys_setsockopt #define pr_perror(fmt, ...) 
pr_err(fmt ": errno %d\n", ##__VA_ARGS__, -ret) #endif static inline void tcp_repair_off(int fd) { int aux = 0, ret; ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux)); if (ret < 0) pr_perror("Failed to turn off repair mode on socket %d", fd); } extern void tcp_locked_conn_add(struct inet_sk_info *); extern void rst_unlock_tcp_connections(void); extern void cpt_unlock_tcp_connections(void); extern int dump_one_tcp(int sk, struct inet_sk_desc *sd, SkOptsEntry *soe); extern int restore_one_tcp(int sk, struct inet_sk_info *si); #define SK_EST_PARAM "tcp-established" #define SK_INFLIGHT_PARAM "skip-in-flight" #define SK_CLOSE_PARAM "tcp-close" struct task_restore_args; int prepare_tcp_socks(struct task_restore_args *); struct rst_tcp_sock { int sk; bool reuseaddr; }; union libsoccr_addr; int restore_sockaddr(union libsoccr_addr *sa, int family, u32 pb_port, u32 *pb_addr, u32 ifindex); #endif /* __CR_SK_INET_H__ */ crac-criu-1.5.0/criu/include/sk-packet.h000066400000000000000000000015001471504326700200120ustar00rootroot00000000000000#ifndef __CR_SK_PACKET_H__ #define __CR_SK_PACKET_H__ #ifndef PACKET_TIMESTAMP #define PACKET_TIMESTAMP 17 #endif struct cr_imgset; struct fd_parms; struct vma_area; extern struct collect_image_info packet_sk_cinfo; extern int dump_socket_map(struct vma_area *vma); extern int collect_socket_map(struct vma_area *); struct nlmsghdr; extern int packet_receive_one(struct nlmsghdr *h, struct ns_id *ns, void *arg); #ifndef PACKET_VNET_HDR #define PACKET_VNET_HDR 15 #endif #ifndef PACKET_FANOUT #define PACKET_FANOUT 18 #endif #ifndef TPACKET3_HDRLEN struct tpacket_req3 { unsigned int tp_block_size; unsigned int tp_block_nr; unsigned int tp_frame_size; unsigned int tp_frame_nr; unsigned int tp_retire_blk_tov; unsigned int tp_sizeof_priv; unsigned int tp_feature_req_word; }; #endif #endif /* __CR_SK_PACKET_H__ */ crac-criu-1.5.0/criu/include/sk-queue.h000066400000000000000000000003661471504326700177000ustar00rootroot00000000000000#ifndef 
__CR_SK_QUEUE_H__ #define __CR_SK_QUEUE_H__ extern struct collect_image_info sk_queues_cinfo; extern int dump_sk_queue(int sock_fd, int sock_id); extern int restore_sk_queue(int fd, unsigned int peer_id); #endif /* __CR_SK_QUEUE_H__ */ crac-criu-1.5.0/criu/include/sockets.h000066400000000000000000000110101471504326700176000ustar00rootroot00000000000000#ifndef __CR_SOCKETS_H__ #define __CR_SOCKETS_H__ #include #include #include #include "images/sk-opts.pb-c.h" #include "images/fdinfo.pb-c.h" struct fdinfo_list_entry; struct sk_opts_entry; struct file_desc; struct fd_parms; struct cr_imgset; struct nlmsghdr; struct cr_img; struct socket_desc { unsigned int family; unsigned int ino; struct socket_desc *next; struct ns_id *sk_ns; int already_dumped; }; extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *); extern int dump_socket_opts(int sk, SkOptsEntry *soe); extern int restore_socket_opts(int sk, SkOptsEntry *soe); extern int sk_setbufs(int sk, uint32_t *bufs); extern void release_skopts(SkOptsEntry *); extern int restore_prepare_socket(int sk); extern void preload_socket_modules(void); extern bool socket_test_collect_bit(unsigned int family, unsigned int proto); extern int sk_collect_one(unsigned ino, int family, struct socket_desc *d, struct ns_id *ns); struct ns_id; extern int collect_sockets(struct ns_id *); extern struct collect_image_info inet_sk_cinfo; extern struct collect_image_info unix_sk_cinfo; extern int add_fake_unix_queuers(void); extern int fix_external_unix_sockets(void); extern int prepare_scms(void); extern int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids); extern struct collect_image_info netlink_sk_cinfo; extern struct socket_desc *lookup_socket_ino(unsigned int ino, int family); extern struct socket_desc *lookup_socket(unsigned int ino, int family, int proto); extern const struct fdtype_ops unix_dump_ops; extern const struct fdtype_ops inet_dump_ops; extern const struct fdtype_ops inet6_dump_ops; extern 
const struct fdtype_ops netlink_dump_ops; extern const struct fdtype_ops packet_dump_ops; extern int inet_collect_one(struct nlmsghdr *h, int family, int type, struct ns_id *ns); extern int unix_receive_one(struct nlmsghdr *h, struct ns_id *ns, void *); extern int netlink_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg); extern int unix_sk_id_add(unsigned int ino); extern int unix_sk_ids_parse(char *optarg); extern int unix_prepare_root_shared(void); extern void init_sk_info_hash(void); extern int do_dump_opt(int sk, int level, int name, void *val, int len); #define dump_opt(s, l, n, f) do_dump_opt(s, l, n, f, sizeof(*f)) extern int do_restore_opt(int sk, int level, int name, void *val, int len); #define restore_opt(s, l, n, f) do_restore_opt(s, l, n, f, sizeof(*f)) #define sk_encode_shutdown(img, mask) \ do { \ /* \ * protobuf SK_SHUTDOWN__ bits match those \ * reported by kernel \ */ \ (img)->shutdown = mask; \ if ((img)->shutdown != SK_SHUTDOWN__NONE) \ (img)->has_shutdown = true; \ } while (0) static inline int sk_decode_shutdown(int val) { static const int hows[] = { -1, SHUT_RD, SHUT_WR, SHUT_RDWR }; return hows[val]; } #define USK_EXT_PARAM "ext-unix-sk" #ifndef NETLINK_SOCK_DIAG #define NETLINK_SOCK_DIAG NETLINK_INET_DIAG #endif extern int set_netns(uint32_t ns_id); #ifndef SIOCGSKNS #define SIOCGSKNS 0x894C /* get socket network namespace */ #endif extern int kerndat_socket_netns(void); extern int kerndat_socket_unix_file(void); extern const char *tcp_state_name(unsigned int state, char *nm, size_t size); extern const char *socket_type_name(unsigned int type, char *nm, size_t size); extern const char *socket_family_name(unsigned int family, char *nm, size_t size); extern const char *socket_proto_name(unsigned int proto, char *nm, size_t size); #define __tcp_state_name(state, a) tcp_state_name(state, a, sizeof(a)) #define __socket_type_name(type, a) socket_type_name(type, a, sizeof(a)) #define __socket_family_name(family, a) 
socket_family_name(family, a, sizeof(a)) #define __socket_proto_name(proto, a) socket_proto_name(proto, a, sizeof(a)) #define __socket_info_helper(__h, __v) \ ({ \ char *__nm = alloca(32); \ const char *__r = __h(__v, __nm, 32); \ __r; \ }) #define ___tcp_state_name(state) __socket_info_helper(tcp_state_name, state) #define ___socket_type_name(type) __socket_info_helper(socket_type_name, type) #define ___socket_family_name(family) __socket_info_helper(socket_family_name, family) #define ___socket_proto_name(proto) __socket_info_helper(socket_proto_name, proto) #ifndef SO_BUF_LOCK #define SO_BUF_LOCK 72 #endif #endif /* __CR_SOCKETS_H__ */ crac-criu-1.5.0/criu/include/stats.h000066400000000000000000000015351471504326700172760ustar00rootroot00000000000000#ifndef __CR_STATS_H__ #define __CR_STATS_H__ enum { TIME_FREEZING, TIME_FROZEN, TIME_MEMDUMP, TIME_MEMWRITE, TIME_IRMAP_RESOLVE, DUMP_TIME_NR_STATS, }; enum { TIME_FORK, TIME_RESTORE, RESTORE_TIME_NS_STATS, }; extern void timing_start(int t); extern void timing_stop(int t); enum { CNT_PAGES_SCANNED, CNT_PAGES_SKIPPED_PARENT, CNT_PAGES_WRITTEN, CNT_PAGES_LAZY, CNT_PAGE_PIPES, CNT_PAGE_PIPE_BUFS, CNT_SHPAGES_SCANNED, CNT_SHPAGES_SKIPPED_PARENT, CNT_SHPAGES_WRITTEN, DUMP_CNT_NR_STATS, }; enum { CNT_PAGES_COMPARED, CNT_PAGES_SKIPPED_COW, CNT_PAGES_RESTORED, RESTORE_CNT_NR_STATS, }; extern void cnt_add(int c, unsigned long val); extern void cnt_sub(int c, unsigned long val); #define DUMP_STATS 1 #define RESTORE_STATS 2 extern int init_stats(int what); extern void write_stats(int what); #endif /* __CR_STATS_H__ */ crac-criu-1.5.0/criu/include/string.h000066400000000000000000000004121471504326700174370ustar00rootroot00000000000000#ifndef __CR_STRING_H__ #define __CR_STRING_H__ #include #include "common/config.h" extern size_t __strlcpy(char *dest, const char *src, size_t size); extern size_t __strlcat(char *dest, const char *src, size_t count); #endif /* __CR_STRING_H__ */ 
crac-criu-1.5.0/criu/include/syscall.h000066400000000000000000000007701471504326700176120ustar00rootroot00000000000000#ifndef __CR_SYSCALL_H__ #define __CR_SYSCALL_H__ static inline int sys_fsopen(const char *fsname, unsigned int flags) { return syscall(__NR_fsopen, fsname, flags); } static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) { return syscall(__NR_fsconfig, fd, cmd, key, value, aux); } static inline int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) { return syscall(__NR_fsmount, fd, flags, attr_flags); } #endif /* __CR_SYSCALL_H__ */crac-criu-1.5.0/criu/include/sysctl.h000066400000000000000000000017411471504326700174600ustar00rootroot00000000000000#ifndef __CR_SYSCTL_H__ #define __CR_SYSCTL_H__ struct sysctl_req { char *name; void *arg; int type; int flags; }; extern int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns); enum { CTL_READ, CTL_WRITE, }; #define CTL_SHIFT 4 /* Up to 16 types */ #define CTL_U32 1 /* Single u32 */ #define CTL_U64 2 /* Single u64 */ #define __CTL_U32A 3 /* Array of u32 */ #define __CTL_U64A 4 /* Array of u64 */ #define __CTL_STR 5 /* String */ #define CTL_32 6 /* Single s32 */ #define CTL_U32A(n) (__CTL_U32A | ((n) << CTL_SHIFT)) #define CTL_U64A(n) (__CTL_U64A | ((n) << CTL_SHIFT)) #define CTL_STR(len) (__CTL_STR | ((len) << CTL_SHIFT)) #define CTL_LEN(t) ((t) >> CTL_SHIFT) #define CTL_TYPE(t) ((t) & ((1 << CTL_SHIFT) - 1)) /* * Some entries might be missing mark them as optional. 
*/ #define CTL_FLAGS_OPTIONAL 1 #define CTL_FLAGS_HAS 2 #define CTL_FLAGS_READ_EIO_SKIP 4 #define CTL_FLAGS_IPC_EACCES_SKIP 5 #endif /* __CR_SYSCTL_H__ */ crac-criu-1.5.0/criu/include/sysfs_parse.h000066400000000000000000000010421471504326700204720ustar00rootroot00000000000000#ifndef __CR_SYSFS_PARSE_H__ #define __CR_SYSFS_PARSE_H__ #define SYSFS_AUFS "/sys/fs/aufs/" #define SBINFO_LEN (3 + 16 + 1) /* si_%lx */ #define SBINFO_PATH_LEN (sizeof SYSFS_AUFS + SBINFO_LEN) /* /sys/fs/aufs/ */ #define AUFSBR_PATH_LEN (SBINFO_PATH_LEN + 6 + 1) /* /sys/fs/aufs//br%3d */ struct mount_info; struct vma_area; extern int parse_aufs_branches(struct mount_info *mi); extern int fixup_aufs_vma_fd(struct vma_area *vma, int vm_file_fd); extern void free_aufs_branches(void); #endif /* __CR_SYSFS_PARSE_H__ */ crac-criu-1.5.0/criu/include/timens.h000066400000000000000000000003601471504326700174320ustar00rootroot00000000000000#ifndef __CR_TIME_NS_H__ #define __CR_TIME_NS_H__ extern int dump_time_ns(int ns_id); extern int prepare_timens(int pid); extern struct ns_desc time_ns_desc; extern struct ns_desc time_for_children_ns_desc; #endif /* __CR_TIME_NS_H__ */ crac-criu-1.5.0/criu/include/timerfd.h000066400000000000000000000017251471504326700175730ustar00rootroot00000000000000#ifndef __CR_TIMERFD_H__ #define __CR_TIMERFD_H__ #include #include #include "files.h" #include "images/timerfd.pb-c.h" struct pstree_item; struct restore_timerfd { int id; int fd; int clockid; int settime_flags; unsigned long ticks; struct itimerspec val; }; extern const struct fdtype_ops timerfd_dump_ops; extern struct collect_image_info timerfd_cinfo; struct task_restore_args; int prepare_timerfds(struct task_restore_args *); extern int check_timerfd(void); extern int is_timerfd_link(char *link); #ifndef TFD_TIMER_ABSTIME #define TFD_TIMER_ABSTIME (1 << 0) #endif #ifndef TFD_IOC_SET_TICKS #define TFD_IOC_SET_TICKS _IOW('T', 0, u64) #endif static inline int verify_timerfd(TimerfdEntry *tfe) { if (tfe->clockid != 
CLOCK_REALTIME && tfe->clockid != CLOCK_BOOTTIME && tfe->clockid != CLOCK_MONOTONIC) { pr_err("Unknown clock type %d for %#x\n", tfe->clockid, tfe->id); return -1; } return 0; } #endif /* __CR_TIMERFD_H__ */ crac-criu-1.5.0/criu/include/tls.h000066400000000000000000000013031471504326700167330ustar00rootroot00000000000000#ifndef __CR_TLS_H__ #define __CR_TLS_H__ #ifdef CONFIG_GNUTLS int tls_x509_init(int sockfd, bool is_server); void tls_terminate_session(bool async); ssize_t tls_send(const void *buf, size_t len, int flags); ssize_t tls_recv(void *buf, size_t len, int flags); int tls_send_data_from_fd(int fd, unsigned long len); int tls_recv_data_to_fd(int fd, unsigned long len); #else /* CONFIG_GNUTLS */ #define tls_x509_init(sockfd, is_server) (0) #define tls_send(buf, len, flags) (-1) #define tls_recv(buf, len, flags) (-1) #define tls_send_data_from_fd(fd, len) (-1) #define tls_recv_data_to_fd(fd, len) (-1) #define tls_terminate_session(async) #endif /* CONFIG_HAS_GNUTLS */ #endif /* __CR_TLS_H__ */ crac-criu-1.5.0/criu/include/tty.h000066400000000000000000000017361471504326700167630ustar00rootroot00000000000000#ifndef __CR_TTY_H__ #define __CR_TTY_H__ #include #include #include "files.h" /* Kernel's limit */ #define TERMIOS_NCC 19 /* Popular serial console's majors, which not defined in */ #define USB_SERIAL_MAJOR 188 #define LOW_DENSE_SERIAL_MAJOR 204 extern const struct fdtype_ops tty_dump_ops; struct tty_driver; struct tty_driver *get_tty_driver(dev_t rdev, dev_t dev); static inline int is_tty(dev_t rdev, dev_t dev) { return get_tty_driver(rdev, dev) != NULL; } extern int tty_post_actions(void); extern int dump_verify_tty_sids(void); extern struct collect_image_info tty_info_cinfo; extern struct collect_image_info tty_cinfo; extern struct collect_image_info tty_cdata; struct mount_info; extern int devpts_restore(struct mount_info *pm); extern int tty_prep_fds(void); extern int tty_init_restore(void); extern int devpts_check_bindmount(struct mount_info *m); 
#define OPT_SHELL_JOB "shell-job" #endif /* __CR_TTY_H__ */ crac-criu-1.5.0/criu/include/tun.h000066400000000000000000000011001471504326700167320ustar00rootroot00000000000000#ifndef __CR_TUN_H__ #define __CR_TUN_H__ #ifndef TUN_MINOR #define TUN_MINOR 200 #endif extern struct ns_id *ns; #include #include "images/netdev.pb-c.h" extern const struct fdtype_ops tunfile_dump_ops; extern int dump_tun_link(NetDeviceEntry *nde, struct cr_imgset *fds, struct nlattr **info); struct net_link; extern int restore_one_tun(struct ns_id *ns, struct net_link *link, int nlsk); extern struct collect_image_info tunfile_cinfo; extern int check_tun_cr(int no_tun_err); extern int check_tun_netns_cr(bool *result); #endif /* __CR_TUN_H__ */ crac-criu-1.5.0/criu/include/types.h000066400000000000000000000001711471504326700172770ustar00rootroot00000000000000#ifndef __CR_INC_TYPES_H__ #define __CR_INC_TYPES_H__ #include #include "asm/types.h" #endif crac-criu-1.5.0/criu/include/uffd.h000066400000000000000000000006261471504326700170640ustar00rootroot00000000000000#ifndef __CR_UFFD_H_ #define __CR_UFFD_H_ struct task_restore_args; extern int uffd_open(int flags, unsigned long *features, int *err); extern bool uffd_noncooperative(void); extern int setup_uffd(int pid, struct task_restore_args *task_args); extern int lazy_pages_setup_zombie(int pid); extern int prepare_lazy_pages_socket(void); extern int lazy_pages_finish_restore(void); #endif /* __CR_UFFD_H_ */ crac-criu-1.5.0/criu/include/unix_diag.h000066400000000000000000000023351471504326700201060ustar00rootroot00000000000000#ifndef __CR_UNIX_DIAG_H__ #define __CR_UNIX_DIAG_H__ struct unix_diag_req { u8 sdiag_family; u8 sdiag_protocol; u16 pad; u32 udiag_states; u32 udiag_ino; u32 udiag_show; u32 udiag_cookie[2]; }; #define UDIAG_SHOW_NAME 0x00000001 /* show name (not path) */ #define UDIAG_SHOW_VFS 0x00000002 /* show VFS inode info */ #define UDIAG_SHOW_PEER 0x00000004 /* show peer socket info */ #define UDIAG_SHOW_ICONS 0x00000008 /* show 
pending connections */ #define UDIAG_SHOW_RQLEN 0x00000010 /* show skb receive queue len */ #define UDIAG_SHOW_MEMINFO 0x00000020 /* show memory info of a socket */ struct unix_diag_msg { u8 udiag_family; u8 udiag_type; u8 udiag_state; u8 pad; u32 udiag_ino; u32 udiag_cookie[2]; }; enum { SK_MEMINFO_RMEM_ALLOC, SK_MEMINFO_RCVBUF, SK_MEMINFO_WMEM_ALLOC, SK_MEMINFO_SNDBUF, SK_MEMINFO_FWD_ALLOC, SK_MEMINFO_WMEM_QUEUED, SK_MEMINFO_OPTMEM, SK_MEMINFO_VARS, }; enum { UNIX_DIAG_NAME, UNIX_DIAG_VFS, UNIX_DIAG_PEER, UNIX_DIAG_ICONS, UNIX_DIAG_RQLEN, UNIX_DIAG_MEMINFO, UNIX_DIAG_SHUTDOWN, UNIX_DIAG_MAX, }; struct unix_diag_vfs { u32 udiag_vfs_ino; u32 udiag_vfs_dev; }; struct unix_diag_rqlen { u32 udiag_rqueue; u32 udiag_wqueue; }; #endif /* __CR_UNIX_DIAG_H__ */ crac-criu-1.5.0/criu/include/util-caps.h000066400000000000000000000022731471504326700200410ustar00rootroot00000000000000#ifndef __CR_UTIL_CAPS_H__ #define __CR_UTIL_CAPS_H__ #include #ifndef CAP_CHECKPOINT_RESTORE #define CAP_CHECKPOINT_RESTORE 40 #endif static inline bool has_capability(int cap, u32 *cap_eff) { int mask = CAP_TO_MASK(cap); int index = CAP_TO_INDEX(cap); u32 effective; effective = cap_eff[index]; if (!(mask & effective)) { pr_debug("Effective capability %d missing\n", cap); return false; } return true; } static inline bool has_cap_checkpoint_restore(u32 *cap_eff) { /* * Everything guarded by CAP_CHECKPOINT_RESTORE is also * guarded by CAP_SYS_ADMIN. Check for both capabilities. 
*/ if (has_capability(CAP_CHECKPOINT_RESTORE, cap_eff) || has_capability(CAP_SYS_ADMIN, cap_eff)) return true; return false; } static inline bool has_cap_net_admin(u32 *cap_eff) { return has_capability(CAP_NET_ADMIN, cap_eff); } static inline bool has_cap_sys_chroot(u32 *cap_eff) { return has_capability(CAP_SYS_CHROOT, cap_eff); } static inline bool has_cap_setuid(u32 *cap_eff) { return has_capability(CAP_SETUID, cap_eff); } static inline bool has_cap_sys_resource(u32 *cap_eff) { return has_capability(CAP_SYS_RESOURCE, cap_eff); } #endif /* __CR_UTIL_CAPS_H__ */ crac-criu-1.5.0/criu/include/util-pie.h000066400000000000000000000005661471504326700176730ustar00rootroot00000000000000#ifndef __CR_UTIL_NET_H__ #define __CR_UTIL_NET_H__ #include #include #ifndef UNIX_PATH_MAX #define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - (size_t)((struct sockaddr_un *)0)->sun_path) #endif #ifndef SO_PEEK_OFF #define SO_PEEK_OFF 42 #endif #include "common/scm.h" extern int open_detach_mount(char *dir); #endif /* __CR_UTIL_NET_H__ */ crac-criu-1.5.0/criu/include/util-vdso.h000066400000000000000000000050441471504326700200650ustar00rootroot00000000000000#ifndef __CR_UTIL_VDSO_H__ #define __CR_UTIL_VDSO_H__ /* * VDSO management common definitions. * * This header file is included by the criu main code and the parasite code. * It contains definitions shared by these 2 parts. * * This file should not be included except in pie/util-vdso.c, include/vdso.h * and include/parasite-vdso.h */ #include /* * Each architecture must export: * VDSO_SYMBOL_MAX, the number of vDSO symbols to manage * ARCH_VDSO_SYMBOLS, a table of string containing the vDSO symbol names * vdso_redirect_calls, a service called to redirect the vDSO symbols in * the parasite code. 
*/ #include "asm/vdso.h" struct vdso_symbol { char name[32]; unsigned long offset; }; struct vdso_symtable { unsigned long vdso_size; unsigned long vvar_size; struct vdso_symbol symbols[VDSO_SYMBOL_MAX]; bool vdso_before_vvar; /* order of vdso/vvar pair */ }; struct vdso_maps { unsigned long vdso_start; unsigned long vvar_start; struct vdso_symtable sym; bool compatible; }; static inline bool vdso_is_present(struct vdso_maps *m) { return m->vdso_start != VDSO_BAD_ADDR; } #define VDSO_SYMBOL_INIT \ { \ .offset = VDSO_BAD_ADDR, \ } #define VDSO_SYMTABLE_INIT \ { \ .vdso_size = VDSO_BAD_SIZE, \ .vvar_size = VVAR_BAD_SIZE, \ .symbols = { \ [0 ... VDSO_SYMBOL_MAX - 1] = \ (struct vdso_symbol)VDSO_SYMBOL_INIT, \ }, \ .vdso_before_vvar = false, \ } #define VDSO_MAPS_INIT \ { \ .vdso_start = VDSO_BAD_ADDR, .vvar_start = VVAR_BAD_ADDR, .sym = VDSO_SYMTABLE_INIT, \ } #ifdef CONFIG_VDSO_32 #define Ehdr_t Elf32_Ehdr #define Sym_t Elf32_Sym #define Phdr_t Elf32_Phdr #define Word_t Elf32_Word #define Dyn_t Elf32_Dyn #ifndef ELF_ST_TYPE #define ELF_ST_TYPE ELF32_ST_TYPE #endif #ifndef ELF_ST_BIND #define ELF_ST_BIND ELF32_ST_BIND #endif #define vdso_fill_symtable vdso_fill_symtable_compat #else /* CONFIG_VDSO_32 */ #define Ehdr_t Elf64_Ehdr #define Sym_t Elf64_Sym #define Phdr_t Elf64_Phdr #define Word_t Elf64_Word #define Dyn_t Elf64_Dyn #ifndef ELF_ST_TYPE #define ELF_ST_TYPE ELF64_ST_TYPE #endif #ifndef ELF_ST_BIND #define ELF_ST_BIND ELF64_ST_BIND #endif #endif /* CONFIG_VDSO_32 */ extern int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t); #endif /* __CR_UTIL_VDSO_H__ */ crac-criu-1.5.0/criu/include/util.h000066400000000000000000000335431471504326700171210ustar00rootroot00000000000000#ifndef __CR_UTIL_H__ #define __CR_UTIL_H__ /* * Some bits are stolen from perf and kvm tools */ #include #include #include #include #include #include #include #include #include #include "int.h" #include "common/compiler.h" #include "xmalloc.h" #include "common/bug.h" 
#include "log.h" #include "common/err.h" #define PREF_SHIFT_OP(pref, op, size) ((size)op(pref##BYTES_SHIFT)) #define KBYTES_SHIFT 10 #define MBYTES_SHIFT 20 #define GBYTES_SHIFT 30 #define KBYTES(size) PREF_SHIFT_OP(K, >>, size) #define MBYTES(size) PREF_SHIFT_OP(M, >>, size) #define GBYTES(size) PREF_SHIFT_OP(G, >>, size) #define KILO(size) PREF_SHIFT_OP(K, <<, size) #define MEGA(size) PREF_SHIFT_OP(M, <<, size) #define GIGA(size) PREF_SHIFT_OP(G, <<, size) struct vma_area; struct list_head; extern int service_fd_rlim_cur; extern void pr_vma(const struct vma_area *vma_area); #define pr_info_vma(vma_area) pr_vma(vma_area) #define pr_vma_list(head) \ do { \ struct vma_area *vma; \ list_for_each_entry(vma, head, list) \ pr_vma(vma); \ } while (0) #define pr_info_vma_list(head) pr_vma_list(head) extern int move_fd_from(int *img_fd, int want_fd); extern int close_safe(int *fd); extern int reopen_fd_as_safe(char *file, int line, int new_fd, int old_fd, bool allow_reuse_fd); #define reopen_fd_as(new_fd, old_fd) reopen_fd_as_safe(__FILE__, __LINE__, new_fd, old_fd, false) #define reopen_fd_as_nocheck(new_fd, old_fd) reopen_fd_as_safe(__FILE__, __LINE__, new_fd, old_fd, true) extern void close_proc(void); extern int open_pid_proc(pid_t pid); extern int close_pid_proc(void); extern int set_proc_fd(int fd); extern pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *parent_tid, void *child_tid, unsigned long newtls); /* * Values for pid argument of the proc opening routines below. * SELF would open file under /proc/self * GEN would open a file under /proc itself * NONE is internal, don't use it ;) */ #define PROC_SELF 0 #define PROC_GEN -1 #define PROC_NONE -2 extern int do_open_proc(pid_t pid, int flags, const char *fmt, ...) __attribute__((__format__(__printf__, 3, 4))); #define __open_proc(pid, ier, flags, fmt, ...) 
\ ({ \ int __fd = do_open_proc(pid, flags, fmt, ##__VA_ARGS__); \ if (__fd < 0 && (errno != (ier))) \ pr_perror("Can't open %d/" fmt " on procfs", pid, ##__VA_ARGS__); \ \ __fd; \ }) /* int open_proc(pid_t pid, const char *fmt, ...); */ #define open_proc(pid, fmt, ...) __open_proc(pid, 0, O_RDONLY, fmt, ##__VA_ARGS__) /* int open_proc_rw(pid_t pid, const char *fmt, ...); */ #define open_proc_rw(pid, fmt, ...) __open_proc(pid, 0, O_RDWR, fmt, ##__VA_ARGS__) #define open_proc_path(pid, fmt, ...) __open_proc(pid, 0, O_PATH, fmt, ##__VA_ARGS__) /* DIR *opendir_proc(pid_t pid, const char *fmt, ...); */ #define opendir_proc(pid, fmt, ...) \ ({ \ int __fd = open_proc(pid, fmt, ##__VA_ARGS__); \ DIR *__d = NULL; \ \ if (__fd >= 0) { \ __d = fdopendir(__fd); \ if (__d == NULL) \ pr_perror("Can't fdopendir %d " \ "(%d/" fmt " on procfs)", \ __fd, pid, ##__VA_ARGS__); \ } \ __d; \ }) /* FILE *fopen_proc(pid_t pid, const char *fmt, ...); */ #define fopen_proc(pid, fmt, ...) \ ({ \ int __fd = open_proc(pid, fmt, ##__VA_ARGS__); \ FILE *__f = NULL; \ \ if (__fd >= 0) { \ __f = fdopen(__fd, "r"); \ if (__f == NULL) \ pr_perror("Can't fdopen %d " \ "(%d/" fmt " on procfs)", \ __fd, pid, ##__VA_ARGS__); \ } \ __f; \ }) #define DEVZERO (makedev(1, 5)) #define KDEV_MINORBITS 20 #define KDEV_MINORMASK ((1UL << KDEV_MINORBITS) - 1) #define MKKDEV(ma, mi) (((ma) << KDEV_MINORBITS) | (mi)) static inline u32 kdev_major(u32 kdev) { return kdev >> KDEV_MINORBITS; } static inline u32 kdev_minor(u32 kdev) { return kdev & KDEV_MINORMASK; } static inline dev_t kdev_to_odev(u32 kdev) { /* * New kernels encode devices in a new form. * See kernel's fs/stat.c for details, there * choose_32_64 helpers which are the key. 
*/ unsigned major = kdev_major(kdev); unsigned minor = kdev_minor(kdev); return makedev(major, minor); } extern int copy_file(int fd_in, int fd_out, size_t bytes); extern int is_anon_link_type(char *link, char *type); #define is_hex_digit(c) (((c) >= '0' && (c) <= '9') || ((c) >= 'a' && (c) <= 'f') || ((c) >= 'A' && (c) <= 'F')) #define CRS_CAN_FAIL 0x1 /* cmd can validly exit with non zero code */ extern int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned flags); extern int cr_system_userns(int in, int out, int err, char *cmd, char *const argv[], unsigned flags, int userns_pid); extern pid_t fork_and_ptrace_attach(int (*child_setup)(void)); extern int cr_daemon(int nochdir, int noclose, int close_fd); extern int status_ready(void); extern int is_root_user(void); extern int set_proc_self_fd(int fd); static inline bool dir_dots(const struct dirent *de) { return !strcmp(de->d_name, ".") || !strcmp(de->d_name, ".."); } extern int is_empty_dir(int dirfd); /* * Size of buffer to carry the worst case or /proc/self/fd/N * path. Since fd is an integer, we can easily estimate one :) */ #define PSFDS (sizeof("/proc/self/fd/2147483647")) extern int read_fd_link(int lfd, char *buf, size_t size); #define USEC_PER_SEC 1000000L #define NSEC_PER_SEC 1000000000L int vaddr_to_pfn(int fd, unsigned long vaddr, u64 *pfn); /* * Check whether @str starts with @sub and report the * next character of @str in @end */ static inline bool strstartswith2(const char *str, const char *sub, char *end) { while (1) { if (*sub == '\0') /* end of sub -- match */ { if (end) { if (*(sub - 1) == '/') /* "/", "./" or "path/" */ *end = '/'; else *end = *str; } return true; } if (*str == '\0') /* end of str, sub is NOT ended -- miss */ return false; if (*str != *sub) return false; str++; sub++; } } static inline bool strstartswith(const char *str, const char *sub) { return strstartswith2(str, sub, NULL); } /* * Checks whether the @path has @sub_path as a sub path, i.e. 
* sub_path is the beginning of path and the last component * match is full (next character terminates path component). * * Paths shouldn't contain excessive /-s, i.e. only one slash * between path components and no slash at the end (except for * the "/" path. This is pretty good assumption to what paths * are used by criu. */ static inline bool issubpath(const char *path, const char *sub_path) { char end; return strstartswith2(path, sub_path, &end) && (end == '/' || end == '\0'); } extern char *get_relative_path(char *path, char *sub_path); extern bool is_sub_path(char *path, char *sub_path); extern bool is_same_path(char *path1, char *path2); int strip_deleted(char *path, int len); int cut_path_ending(char *path, char *sub_path); /* * mkdir -p */ int mkdirpat(int fd, const char *path, int mode); /* * Tests whether a path is a prefix of another path. This is different than * strstartswith because "/foo" is _not_ a path prefix of "/foobar", since they * refer to different directories. */ bool is_path_prefix(const char *path, const char *prefix); FILE *fopenat(int dirfd, char *path, char *cflags); void split(char *str, char token, char ***out, int *n); int cr_fchown(int fd, uid_t new_uid, gid_t new_gid); int cr_fchperm(int fd, uid_t new_uid, gid_t new_gid, mode_t new_mode); int cr_fchpermat(int dirfd, const char *path, uid_t new_uid, gid_t new_gid, mode_t new_mode, int flags); int fd_has_data(int lfd); int make_yard(char *path); static inline int sk_wait_data(int sk) { struct pollfd pfd = { sk, POLLIN, 0 }; return poll(&pfd, 1, -1); } void fd_set_nonblocking(int fd, bool on); const char *ns_to_string(unsigned int ns); int xatol(const char *string, long *number); int xatoi(const char *string, int *number); char *xstrcat(char *str, const char *fmt, ...) __attribute__((__format__(__printf__, 2, 3))); char *xsprintf(const char *fmt, ...) 
__attribute__((__format__(__printf__, 1, 2))); int setup_tcp_server(char *type, char *addr, unsigned short *port); int run_tcp_server(bool daemon_mode, int *ask, int cfd, int sk); int setup_tcp_client(char *hostname); /* path should be writable and no more than PATH_MAX long */ int rmrf(char *path); #define LAST_PID_PATH "sys/kernel/ns_last_pid" #define PID_MAX_PATH "sys/kernel/pid_max" #define block_sigmask(saved_mask, sig_mask) \ ({ \ sigset_t ___blocked_mask; \ int ___ret = 0; \ sigemptyset(&___blocked_mask); \ sigaddset(&___blocked_mask, sig_mask); \ if (sigprocmask(SIG_BLOCK, &___blocked_mask, saved_mask) == -1) { \ pr_perror("Can not set mask of blocked signals"); \ ___ret = -1; \ } \ ___ret; \ }) #define restore_sigmask(saved_mask) \ ({ \ int ___ret = 0; \ if (sigprocmask(SIG_SETMASK, saved_mask, NULL) == -1) { \ pr_perror("Can not unset mask of blocked signals"); \ ___ret = -1; \ } \ ___ret; \ }) /* * Helpers to organize asynchronous reading from a bunch * of file descriptors. */ #include struct epoll_rfd { int fd; /* * EPOLLIN notification. The data is available for read in * rfd->fd. * @return 0 to resume polling, 1 to stop polling or a * negative error code */ int (*read_event)(struct epoll_rfd *); /* * EPOLLHUP | EPOLLRDHUP notification. The remote side has * close the connection for rfd->fd. 
* @return 0 to resume polling, 1 to stop polling or a * negative error code */ int (*hangup_event)(struct epoll_rfd *); }; extern int epoll_add_rfd(int epfd, struct epoll_rfd *); extern int epoll_del_rfd(int epfd, struct epoll_rfd *rfd); extern int epoll_run_rfds(int epfd, struct epoll_event *evs, int nr_fds, int tmo); extern int epoll_prepare(int nr_events, struct epoll_event **evs); extern void rlimit_unlimit_nofile(void); extern int call_in_child_process(int (*fn)(void *), void *arg); #ifdef __GLIBC__ extern void print_stack_trace(pid_t pid); #else static inline void print_stack_trace(pid_t pid) { } #endif #define block_sigmask(saved_mask, sig_mask) \ ({ \ sigset_t ___blocked_mask; \ int ___ret = 0; \ sigemptyset(&___blocked_mask); \ sigaddset(&___blocked_mask, sig_mask); \ if (sigprocmask(SIG_BLOCK, &___blocked_mask, saved_mask) == -1) { \ pr_perror("Can not set mask of blocked signals"); \ ___ret = -1; \ } \ ___ret; \ }) #define restore_sigmask(saved_mask) \ ({ \ int ___ret = 0; \ if (sigprocmask(SIG_SETMASK, saved_mask, NULL) == -1) { \ pr_perror("Can not unset mask of blocked signals"); \ ___ret = -1; \ } \ ___ret; \ }) extern int mount_detached_fs(const char *fsname); extern char *get_legacy_iptables_bin(bool ipv6, bool restore); extern int set_opts_cap_eff(void); extern ssize_t read_all(int fd, void *buf, size_t size); extern ssize_t write_all(int fd, const void *buf, size_t size); #define cleanup_free __attribute__((cleanup(cleanup_freep))) static inline void cleanup_freep(void *p) { void **pp = (void **)p; free(*pp); } extern int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args); /* * criu_run_id is a unique value of the current run. It can be used to * generate resource ID-s to avoid conflicts with other CRIU processes. 
*/ extern uint64_t criu_run_id; extern void util_init(void); extern char *resolve_mountpoint(char *path); #endif /* __CR_UTIL_H__ */ crac-criu-1.5.0/criu/include/uts_ns.h000066400000000000000000000002711471504326700174470ustar00rootroot00000000000000#ifndef __CR_UTS_NS_H__ #define __CR_UTS_NS_H__ extern int dump_uts_ns(int ns_id); extern int prepare_utsns(int pid); extern struct ns_desc uts_ns_desc; #endif /* __CR_UTS_NS_H__ */ crac-criu-1.5.0/criu/include/vdso.h000066400000000000000000000012211471504326700171030ustar00rootroot00000000000000#ifndef __CR_VDSO_H__ #define __CR_VDSO_H__ #include #include #include "common/config.h" #include "util-vdso.h" extern struct vdso_maps vdso_maps; extern struct vdso_maps vdso_maps_compat; extern int vdso_init_dump(void); extern int vdso_init_restore(void); extern int kerndat_vdso_fill_symtable(void); extern int kerndat_vdso_preserves_hint(void); extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid, struct vm_area_list *vma_area_list); #ifdef CONFIG_COMPAT extern void compat_vdso_helper(struct vdso_maps *native, int pipe_fd, int err_fd, void *vdso_buf, size_t buf_size); #endif #endif /* __CR_VDSO_H__ */ crac-criu-1.5.0/criu/include/vma.h000066400000000000000000000077061471504326700167310ustar00rootroot00000000000000#ifndef __CR_VMA_H__ #define __CR_VMA_H__ #include "image.h" #include "common/list.h" #include "images/vma.pb-c.h" #include #include struct vm_area_list { struct list_head h; /* list of VMAs */ unsigned nr; /* nr of all VMAs in the list */ unsigned int nr_aios; /* nr of AIOs VMAs in the list */ union { unsigned long nr_priv_pages; /* dmp: nr of pages in private VMAs */ unsigned long rst_priv_size; /* rst: size of private VMAs */ }; unsigned long nr_priv_pages_longest; /* nr of pages in longest private VMA */ unsigned long nr_shared_pages_longest; /* nr of pages in longest shared VMA */ }; static inline void vm_area_list_init(struct vm_area_list *vml) { memset(vml, 0, sizeof(*vml)); 
INIT_LIST_HEAD(&vml->h); } struct file_desc; struct vma_area { struct list_head list; VmaEntry *e; union { struct /* for dump */ { int vm_socket_id; char *aufs_rpath; /* path from aufs root */ char *aufs_fpath; /* full path from global root */ /* * When several subsequent vmas have the same * dev:ino pair all 'tail' ones set this to true * and the vmst points to the head's stat buf. */ bool file_borrowed; struct stat *vmst; int mnt_id; }; struct /* for restore */ { int (*vm_open)(int pid, struct vma_area *vma); struct file_desc *vmfd; struct vma_area *pvma; /* parent for inherited VMAs */ unsigned long *page_bitmap; /* existent pages */ unsigned long premmaped_addr; /* restore only */ /* * Some notes about pvma, page_bitmap and premmaped_addr bits * above. * * The pvma is set in prepare_cow_vmas() when we resolve which * VMAs _may_ inherit pages from each other. * The page_bitmap and premmaped_addr are set in prepare_mappings() * when the respective VMAs get mmap-ed or mremap-ed. * These VMAs are then inherited during fork_with_pid()-s * called from create_children_and_session(). */ }; }; }; #define VMA_COW_ROOT ((struct vma_area *)1) typedef int (*dump_filemap_t)(struct vma_area *vma_area, int fd); extern struct vma_area *alloc_vma_area(void); extern int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t cb); extern void free_mappings(struct vm_area_list *vma_area_list); extern int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t cb); extern int parse_self_maps_lite(struct vm_area_list *vms); #define vma_area_is(vma_area, s) vma_entry_is((vma_area)->e, s) #define vma_area_len(vma_area) vma_entry_len((vma_area)->e) #define vma_entry_is(vma, s) (((vma)->status & (s)) == (s)) #define vma_entry_len(vma) ((vma)->end - (vma)->start) /* * vma_premmaped_start() can be used only in restorer. * In other cases vma_area->premmaped_addr must be used. 
* This hack is required, because vma_area isn't transferred in restorer and * shmid is used to determine which vma-s are cowed. */ #define vma_premmaped_start(vma) ((vma)->shmid) static inline int in_vma_area(struct vma_area *vma, unsigned long addr) { return addr >= (unsigned long)vma->e->start && addr < (unsigned long)vma->e->end; } static inline bool vma_entry_is_private(VmaEntry *entry, unsigned long task_size) { return (vma_entry_is(entry, VMA_AREA_REGULAR) && (vma_entry_is(entry, VMA_ANON_PRIVATE) || vma_entry_is(entry, VMA_FILE_PRIVATE)) && (entry->end <= task_size)) || vma_entry_is(entry, VMA_AREA_AIORING); } static inline bool vma_area_is_private(struct vma_area *vma, unsigned long task_size) { return vma_entry_is_private(vma->e, task_size); } static inline struct vma_area *vma_next(struct vma_area *vma) { return list_entry(vma->list.next, struct vma_area, list); } static inline bool vma_entry_can_be_lazy(VmaEntry *e) { return ((e->flags & MAP_ANONYMOUS) && (e->flags & MAP_PRIVATE) && !(e->flags & MAP_LOCKED) && !(vma_entry_is(e, VMA_AREA_VDSO)) && !(vma_entry_is(e, VMA_AREA_VVAR)) && !(vma_entry_is(e, VMA_AREA_VSYSCALL)) && !(e->flags & MAP_HUGETLB)); } #endif /* __CR_VMA_H__ */ crac-criu-1.5.0/criu/include/xmalloc.h000066400000000000000000000000551471504326700175730ustar00rootroot00000000000000#include "log.h" #include "common/xmalloc.h" crac-criu-1.5.0/criu/ipc_ns.c000066400000000000000000000520101471504326700157550ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "util.h" #include "cr_options.h" #include "imgset.h" #include "namespaces.h" #include "sysctl.h" #include "ipc_ns.h" #include "shmem.h" #include "types.h" #include "protobuf.h" #include "images/ipc-var.pb-c.h" #include "images/ipc-shm.pb-c.h" #include "images/ipc-sem.pb-c.h" #include "images/ipc-msg.pb-c.h" #if defined(__GLIBC__) && __GLIBC__ >= 2 #define KEY __key #else #define KEY key #endif #ifndef MSGMAX #define MSGMAX 
8192 #endif #ifndef MSG_COPY #define MSG_COPY 040000 #endif static void pr_ipc_desc_entry(const IpcDescEntry *desc) { pr_info("id: %-10d key: %#08x uid: %-10d gid: %-10d " "cuid: %-10d cgid: %-10d mode: %-10o ", desc->id, desc->key, desc->uid, desc->gid, desc->cuid, desc->cgid, desc->mode); } static void fill_ipc_desc(int id, IpcDescEntry *desc, const struct ipc_perm *ipcp) { desc->id = id; desc->key = ipcp->KEY; desc->uid = userns_uid(ipcp->uid); desc->gid = userns_gid(ipcp->gid); desc->cuid = userns_uid(ipcp->cuid); desc->cgid = userns_gid(ipcp->cgid); desc->mode = ipcp->mode; } static void pr_ipc_sem_array(int nr, u16 *values) { while (nr--) pr_info(" %-5d", values[nr]); // no \n pr_info("\n"); } #define pr_info_ipc_sem_array(nr, values) pr_ipc_sem_array(nr, values) static void pr_info_ipc_sem_entry(const IpcSemEntry *sem) { pr_ipc_desc_entry(sem->desc); pr_info("nsems: %-10d\n", sem->nsems); } static int dump_ipc_sem_set(struct cr_img *img, const IpcSemEntry *sem) { size_t rounded; int ret, size; u16 *values; size = sizeof(u16) * sem->nsems; rounded = round_up(size, sizeof(u64)); values = xmalloc(rounded); if (values == NULL) { pr_err("Failed to allocate memory for semaphore set values\n"); ret = -ENOMEM; goto out; } ret = semctl(sem->desc->id, 0, GETALL, values); if (ret < 0) { pr_perror("Failed to get semaphore set values"); ret = -errno; goto out; } pr_info_ipc_sem_array(sem->nsems, values); memzero((void *)values + size, rounded - size); ret = write_img_buf(img, values, rounded); if (ret < 0) { pr_err("Failed to write IPC message data\n"); goto out; } out: xfree(values); return ret; } static int dump_ipc_sem_desc(struct cr_img *img, int id, const struct semid_ds *ds) { IpcSemEntry sem = IPC_SEM_ENTRY__INIT; IpcDescEntry desc = IPC_DESC_ENTRY__INIT; int ret; sem.desc = &desc; sem.nsems = ds->sem_nsems; fill_ipc_desc(id, sem.desc, &ds->sem_perm); pr_info_ipc_sem_entry(&sem); ret = pb_write_one(img, &sem, PB_IPC_SEM); if (ret < 0) { pr_err("Failed to write IPC 
semaphores set\n"); return ret; } return dump_ipc_sem_set(img, &sem); } static int dump_ipc_sem(struct cr_img *img) { int i, maxid; struct seminfo info; int slot; maxid = semctl(0, 0, SEM_INFO, &info); if (maxid < 0) { pr_perror("semctl failed"); return -errno; } pr_info("IPC semaphore sets: %d\n", info.semusz); for (i = 0, slot = 0; i <= maxid; i++) { struct semid_ds ds; int id, ret; id = semctl(i, 0, SEM_STAT, &ds); if (id < 0) { if (errno == EINVAL) continue; pr_perror("Failed to get stats for IPC semaphore set"); break; } ret = dump_ipc_sem_desc(img, id, &ds); if (!ret) slot++; } if (slot != info.semusz) { pr_err("Failed to collect %d (only %d succeeded)\n", info.semusz, slot); return -EFAULT; } return info.semusz; } static void pr_info_ipc_msg(int nr, const IpcMsg *msg) { pr_info(" %-5d: type: %-20" PRId64 " size: %-10d\n", nr++, msg->mtype, msg->msize); } static void pr_info_ipc_msg_entry(const IpcMsgEntry *msg) { pr_ipc_desc_entry(msg->desc); pr_info("qbytes: %-10d qnum: %-10d\n", msg->qbytes, msg->qnum); } static int dump_ipc_msg_queue_messages(struct cr_img *img, const IpcMsgEntry *msq, unsigned int msg_nr) { struct msgbuf *message = NULL; unsigned int msgmax; int ret, msg_cnt = 0; struct sysctl_req req[] = { { "kernel/msgmax", &msgmax, CTL_U32 }, }; ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, CLONE_NEWIPC); if (ret < 0) { pr_err("Failed to read max IPC message size\n"); goto err; } msgmax += sizeof(struct msgbuf); message = xmalloc(round_up(msgmax, sizeof(u64))); if (message == NULL) { pr_err("Failed to allocate memory for IPC message\n"); return -ENOMEM; } for (msg_cnt = 0; msg_cnt < msg_nr; msg_cnt++) { IpcMsg msg = IPC_MSG__INIT; size_t rounded; ret = msgrcv(msq->desc->id, message, msgmax, msg_cnt, IPC_NOWAIT | MSG_COPY); if (ret < 0) { pr_perror("Failed to copy IPC message"); goto err; } msg.msize = ret; msg.mtype = message->mtype; pr_info_ipc_msg(msg_cnt, &msg); ret = pb_write_one(img, &msg, PB_IPCNS_MSG); if (ret < 0) { pr_err("Failed to write 
IPC message header\n"); break; } rounded = round_up(msg.msize, sizeof(u64)); memzero(((void *)message->mtext + msg.msize), rounded - msg.msize); ret = write_img_buf(img, message->mtext, rounded); if (ret < 0) { pr_err("Failed to write IPC message data\n"); break; } } ret = 0; err: xfree(message); return ret; } static int dump_ipc_msg_queue(struct cr_img *img, int id, const struct msqid_ds *ds) { IpcMsgEntry msg = IPC_MSG_ENTRY__INIT; IpcDescEntry desc = IPC_DESC_ENTRY__INIT; int ret; msg.desc = &desc; fill_ipc_desc(id, msg.desc, &ds->msg_perm); msg.qbytes = ds->msg_qbytes; msg.qnum = ds->msg_qnum; pr_info_ipc_msg_entry(&msg); ret = pb_write_one(img, &msg, PB_IPCNS_MSG_ENT); if (ret < 0) { pr_err("Failed to write IPC message queue\n"); return ret; } return dump_ipc_msg_queue_messages(img, &msg, ds->msg_qnum); } static int dump_ipc_msg(struct cr_img *img) { int i, maxid; struct msginfo info; int slot; maxid = msgctl(0, MSG_INFO, (struct msqid_ds *)&info); if (maxid < 0) { pr_perror("msgctl failed"); return -errno; } pr_info("IPC message queues: %d\n", info.msgpool); for (i = 0, slot = 0; i <= maxid; i++) { struct msqid_ds ds; int id, ret; id = msgctl(i, MSG_STAT, &ds); if (id < 0) { if (errno == EINVAL) continue; pr_perror("Failed to get stats for IPC message queue"); break; } ret = dump_ipc_msg_queue(img, id, &ds); if (!ret) slot++; } if (slot != info.msgpool) { pr_err("Failed to collect %d message queues (only %d succeeded)\n", info.msgpool, slot); return -EFAULT; } return info.msgpool; } static void pr_info_ipc_shm(const IpcShmEntry *shm) { pr_ipc_desc_entry(shm->desc); pr_info("size: %-10" PRIu64 "\n", shm->size); } #define NR_MANDATORY_IPC_SYSCTLS 9 static int ipc_sysctl_req(IpcVarEntry *e, int op) { int i; struct sysctl_req req[] = { { "kernel/sem", e->sem_ctls, CTL_U32A(e->n_sem_ctls) }, { "kernel/msgmax", &e->msg_ctlmax, CTL_U32 }, { "kernel/msgmnb", &e->msg_ctlmnb, CTL_U32 }, { "kernel/auto_msgmni", &e->auto_msgmni, CTL_U32 }, { "kernel/msgmni", 
&e->msg_ctlmni, CTL_U32 }, { "kernel/shmmax", &e->shm_ctlmax, CTL_U64 }, { "kernel/shmall", &e->shm_ctlall, CTL_U64 }, { "kernel/shmmni", &e->shm_ctlmni, CTL_U32 }, { "kernel/shm_rmid_forced", &e->shm_rmid_forced, CTL_U32 }, /* We have 9 mandatory sysctls above and 8 optional below */ { "fs/mqueue/queues_max", &e->mq_queues_max, CTL_U32 }, { "fs/mqueue/msg_max", &e->mq_msg_max, CTL_U32 }, { "fs/mqueue/msgsize_max", &e->mq_msgsize_max, CTL_U32 }, { "fs/mqueue/msg_default", &e->mq_msg_default, CTL_U32 }, { "fs/mqueue/msgsize_default", &e->mq_msgsize_default, CTL_U32 }, { "kernel/msg_next_id", &e->msg_next_id, CTL_U32 }, { "kernel/sem_next_id", &e->sem_next_id, CTL_U32 }, { "kernel/shm_next_id", &e->shm_next_id, CTL_U32 }, }; int nr = NR_MANDATORY_IPC_SYSCTLS; /* Skip sysctls which can't be set or haven't existed on dump */ if (access("/proc/sys/fs/mqueue", X_OK)) pr_info("Mqueue sysctls are missing\n"); else { nr += 3; if (e->has_mq_msg_default) { req[nr++] = req[12]; req[nr++] = req[13]; } } if (e->has_msg_next_id) req[nr++] = req[14]; if (e->has_sem_next_id) req[nr++] = req[15]; if (e->has_shm_next_id) req[nr++] = req[16]; for (i = 0; i < nr; i++) req[i].flags = CTL_FLAGS_IPC_EACCES_SKIP; return sysctl_op(req, nr, op, CLONE_NEWIPC); } static int dump_ipc_shm_pages(const IpcShmEntry *shm) { int ret; void *data; data = shmat(shm->desc->id, NULL, SHM_RDONLY); if (data == (void *)-1) { pr_perror("Failed to attach IPC shared memory"); return -errno; } ret = dump_one_sysv_shmem(data, shm->size, shm->desc->id); if (shmdt(data)) { pr_perror("Failed to detach IPC shared memory"); return -errno; } return ret; } static int dump_shm_hugetlb_flag(IpcShmEntry *shm, int id, unsigned long size) { void *addr; int ret, hugetlb_flag, exit_code = -1; struct stat st; char path[64]; addr = shmat(id, NULL, SHM_RDONLY); if (addr == (void *)-1) { pr_perror("Failed to attach shm"); return -1; } /* The shm segment size may not be aligned, * we need to align it up to next page size */ size = 
(size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); snprintf(path, sizeof(path), "/proc/self/map_files/%lx-%lx", (unsigned long)addr, (unsigned long)addr + size); ret = stat(path, &st); if (ret < 0) { pr_perror("Can't stat map_files"); goto detach; } if (is_hugetlb_dev(st.st_dev, &hugetlb_flag)) { shm->has_hugetlb_flag = true; shm->hugetlb_flag = hugetlb_flag | SHM_HUGETLB; } exit_code = 0; detach: shmdt(addr); return exit_code; } static int dump_ipc_shm_seg(struct cr_img *img, int id, const struct shmid_ds *ds) { IpcShmEntry shm = IPC_SHM_ENTRY__INIT; IpcDescEntry desc = IPC_DESC_ENTRY__INIT; int ret; shm.desc = &desc; shm.size = ds->shm_segsz; shm.has_in_pagemaps = true; shm.in_pagemaps = true; if (dump_shm_hugetlb_flag(&shm, id, ds->shm_segsz)) return -1; fill_ipc_desc(id, shm.desc, &ds->shm_perm); pr_info_ipc_shm(&shm); ret = pb_write_one(img, &shm, PB_IPC_SHM); if (ret < 0) { pr_err("Failed to write IPC shared memory segment\n"); return ret; } return dump_ipc_shm_pages(&shm); } static int dump_ipc_shm(struct cr_img *img) { int i, maxid, slot; struct shm_info info; maxid = shmctl(0, SHM_INFO, (void *)&info); if (maxid < 0) { pr_perror("shmctl(SHM_INFO) failed"); return -errno; } pr_info("IPC shared memory segments: %d\n", info.used_ids); for (i = 0, slot = 0; i <= maxid; i++) { struct shmid_ds ds; int id, ret; id = shmctl(i, SHM_STAT, &ds); if (id < 0) { if (errno == EINVAL) continue; pr_perror("Failed to get stats for IPC shared memory"); break; } ret = dump_ipc_shm_seg(img, id, &ds); if (ret < 0) return ret; slot++; } if (slot != info.used_ids) { pr_err("Failed to collect %d (only %d succeeded)\n", info.used_ids, slot); return -EFAULT; } return 0; } static int dump_ipc_var(struct cr_img *img) { IpcVarEntry var = IPC_VAR_ENTRY__INIT; int ret = -1; var.n_sem_ctls = 4; var.sem_ctls = xmalloc(pb_repeated_size(&var, sem_ctls)); if (!var.sem_ctls) goto err; var.has_mq_msg_default = true; var.has_mq_msgsize_default = true; var.has_msg_next_id = true; var.has_sem_next_id = 
true; var.has_shm_next_id = true; ret = ipc_sysctl_req(&var, CTL_READ); if (ret < 0) { pr_err("Failed to read IPC variables\n"); goto err; } /* * One can not write to msg_next_xxx sysctls -1, * which is their initial value */ if (var.msg_next_id == -1) var.has_msg_next_id = false; if (var.sem_next_id == -1) var.has_sem_next_id = false; if (var.shm_next_id == -1) var.has_shm_next_id = false; ret = pb_write_one(img, &var, PB_IPC_VAR); if (ret < 0) { pr_err("Failed to write IPC variables\n"); goto err; } err: xfree(var.sem_ctls); return ret; } static int dump_ipc_data(const struct cr_imgset *imgset) { int ret; ret = dump_ipc_var(img_from_set(imgset, CR_FD_IPC_VAR)); if (ret < 0) return ret; ret = dump_ipc_shm(img_from_set(imgset, CR_FD_IPCNS_SHM)); if (ret < 0) return ret; ret = dump_ipc_msg(img_from_set(imgset, CR_FD_IPCNS_MSG)); if (ret < 0) return ret; ret = dump_ipc_sem(img_from_set(imgset, CR_FD_IPCNS_SEM)); if (ret < 0) return ret; return 0; } int dump_ipc_ns(int ns_id) { int ret; struct cr_imgset *imgset; imgset = cr_imgset_open(ns_id, IPCNS, O_DUMP); if (imgset == NULL) return -1; ret = dump_ipc_data(imgset); if (ret < 0) { pr_err("Failed to write IPC namespace data\n"); goto err; } err: close_cr_imgset(&imgset); return ret < 0 ? 
-1 : 0; } static int prepare_ipc_sem_values(struct cr_img *img, const IpcSemEntry *sem) { int ret, size; u16 *values; size = round_up(sizeof(u16) * sem->nsems, sizeof(u64)); values = xmalloc(size); if (values == NULL) { pr_err("Failed to allocate memory for semaphores set values\n"); ret = -ENOMEM; goto out; } ret = read_img_buf(img, values, size); if (ret < 0) { pr_err("Failed to allocate memory for semaphores set values\n"); ret = -ENOMEM; goto out; } pr_info_ipc_sem_array(sem->nsems, values); ret = semctl(sem->desc->id, 0, SETALL, values); if (ret < 0) { pr_perror("Failed to set semaphores set values"); ret = -errno; } out: xfree(values); return ret; } static int prepare_ipc_sem_desc(struct cr_img *img, const IpcSemEntry *sem) { int ret, id; struct sysctl_req req[] = { { "kernel/sem_next_id", &sem->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct semid_ds semid; ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC); if (ret < 0) { pr_err("Failed to set desired IPC sem ID\n"); return ret; } id = semget(sem->desc->key, sem->nsems, sem->desc->mode | IPC_CREAT | IPC_EXCL); if (id == -1) { pr_perror("Failed to create sem set"); return -errno; } if (id != sem->desc->id) { pr_err("Failed to restore sem id (%d instead of %d)\n", id, sem->desc->id); return -EFAULT; } ret = semctl(id, sem->nsems, IPC_STAT, &semid); if (ret == -1) { pr_err("Failed to get sem stat structure\n"); return -EFAULT; } semid.sem_perm.uid = sem->desc->uid; semid.sem_perm.gid = sem->desc->gid; ret = semctl(id, sem->nsems, IPC_SET, &semid); if (ret == -1) { pr_err("Failed to set sem uid and gid\n"); return -EFAULT; } ret = prepare_ipc_sem_values(img, sem); if (ret < 0) { pr_err("Failed to update sem pages\n"); return ret; } return 0; } static int prepare_ipc_sem(int pid) { int ret; struct cr_img *img; pr_info("Restoring IPC semaphores sets\n"); img = open_image(CR_FD_IPCNS_SEM, O_RSTR, pid); if (!img) return -1; while (1) { IpcSemEntry *sem; ret = pb_read_one_eof(img, &sem, 
PB_IPC_SEM); if (ret < 0) { ret = -EIO; goto err; } if (ret == 0) break; pr_info_ipc_sem_entry(sem); ret = prepare_ipc_sem_desc(img, sem); ipc_sem_entry__free_unpacked(sem, NULL); if (ret < 0) { pr_err("Failed to prepare semaphores set\n"); goto err; } } close_image(img); return 0; err: close_image(img); return ret; } static int prepare_ipc_msg_queue_messages(struct cr_img *img, const IpcMsgEntry *msq) { IpcMsg *msg = NULL; int msg_nr = 0; int ret = 0; while (msg_nr < msq->qnum) { struct msgbuf { long mtype; char mtext[MSGMAX]; } data; ret = pb_read_one(img, &msg, PB_IPCNS_MSG); if (ret <= 0) return -EIO; pr_info_ipc_msg(msg_nr, msg); if (msg->msize > MSGMAX) { ret = -1; pr_err("Unsupported message size: %d (MAX: %d)\n", msg->msize, MSGMAX); break; } ret = read_img_buf(img, data.mtext, round_up(msg->msize, sizeof(u64))); if (ret < 0) { pr_err("Failed to read IPC message data\n"); break; } data.mtype = msg->mtype; ret = msgsnd(msq->desc->id, &data, msg->msize, IPC_NOWAIT); if (ret < 0) { pr_perror("Failed to send IPC message"); ret = -errno; break; } msg_nr++; } if (msg) ipc_msg__free_unpacked(msg, NULL); return ret; } static int prepare_ipc_msg_queue(struct cr_img *img, const IpcMsgEntry *msq) { int ret, id; struct sysctl_req req[] = { { "kernel/msg_next_id", &msq->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct msqid_ds msqid; ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC); if (ret < 0) { pr_err("Failed to set desired IPC msg ID\n"); return ret; } id = msgget(msq->desc->key, msq->desc->mode | IPC_CREAT | IPC_EXCL); if (id == -1) { pr_perror("Failed to create msg set"); return -errno; } if (id != msq->desc->id) { pr_err("Failed to restore msg id (%d instead of %d)\n", id, msq->desc->id); return -EFAULT; } ret = msgctl(id, IPC_STAT, &msqid); if (ret == -1) { pr_err("Failed to get msq stat structure\n"); return -EFAULT; } msqid.msg_perm.uid = msq->desc->uid; msqid.msg_perm.gid = msq->desc->gid; ret = msgctl(id, IPC_SET, &msqid); if (ret == 
-1) { pr_err("Failed to set msq queue uid and gid\n"); return -EFAULT; } ret = prepare_ipc_msg_queue_messages(img, msq); if (ret < 0) { pr_err("Failed to update message queue messages\n"); return ret; } return 0; } static int prepare_ipc_msg(int pid) { int ret; struct cr_img *img; pr_info("Restoring IPC message queues\n"); img = open_image(CR_FD_IPCNS_MSG, O_RSTR, pid); if (!img) return -1; while (1) { IpcMsgEntry *msq; ret = pb_read_one_eof(img, &msq, PB_IPCNS_MSG_ENT); if (ret < 0) { pr_err("Failed to read IPC messages queue\n"); ret = -EIO; goto err; } if (ret == 0) break; pr_info_ipc_msg_entry(msq); ret = prepare_ipc_msg_queue(img, msq); ipc_msg_entry__free_unpacked(msq, NULL); if (ret < 0) { pr_err("Failed to prepare messages queue\n"); goto err; } } close_image(img); return 0; err: close_image(img); return ret; } static int restore_content(void *data, struct cr_img *img, const IpcShmEntry *shm) { int ifd; ssize_t size, off; ifd = img_raw_fd(img); if (ifd < 0) { pr_err("Failed getting raw image fd\n"); return -1; } size = round_up(shm->size, sizeof(u32)); off = 0; do { ssize_t ret; ret = read(ifd, data + off, size - off); if (ret <= 0) { pr_perror("Failed to write IPC shared memory data"); return (int)ret; } off += ret; } while (off < size); return 0; } static int prepare_ipc_shm_pages(struct cr_img *img, const IpcShmEntry *shm) { int ret; void *data; data = shmat(shm->desc->id, NULL, 0); if (data == (void *)-1) { pr_perror("Failed to attach IPC shared memory"); return -errno; } if (shm->has_in_pagemaps && shm->in_pagemaps) ret = restore_sysv_shmem_content(data, shm->size, shm->desc->id); else ret = restore_content(data, img, shm); if (shmdt(data)) { pr_perror("Failed to detach IPC shared memory"); return -errno; } return ret; } static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm) { int ret, id, hugetlb_flag = 0; struct sysctl_req req[] = { { "kernel/shm_next_id", &shm->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct shmid_ds 
shmid; if (collect_sysv_shmem(shm->desc->id, shm->size)) return -1; ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC); if (ret < 0) { pr_err("Failed to set desired IPC shm ID\n"); return ret; } if (shm->has_hugetlb_flag) hugetlb_flag = shm->hugetlb_flag; id = shmget(shm->desc->key, shm->size, hugetlb_flag | shm->desc->mode | IPC_CREAT | IPC_EXCL); if (id == -1) { pr_perror("Failed to create shm set"); return -errno; } if (id != shm->desc->id) { pr_err("Failed to restore shm id (%d instead of %d)\n", id, shm->desc->id); return -EFAULT; } ret = shmctl(id, IPC_STAT, &shmid); if (ret == -1) { pr_err("Failed to get shm stat structure\n"); return -EFAULT; } shmid.shm_perm.uid = shm->desc->uid; shmid.shm_perm.gid = shm->desc->gid; ret = shmctl(id, IPC_SET, &shmid); if (ret == -1) { pr_err("Failed to set shm uid and gid\n"); return -EFAULT; } ret = prepare_ipc_shm_pages(img, shm); if (ret < 0) { pr_err("Failed to update shm pages\n"); return ret; } return 0; } static int prepare_ipc_shm(int pid) { int ret; struct cr_img *img; pr_info("Restoring IPC shared memory\n"); img = open_image(CR_FD_IPCNS_SHM, O_RSTR, pid); if (!img) return -1; while (1) { IpcShmEntry *shm; ret = pb_read_one_eof(img, &shm, PB_IPC_SHM); if (ret < 0) { pr_err("Failed to read IPC shared memory segment\n"); ret = -EIO; goto err; } if (ret == 0) break; pr_info_ipc_shm(shm); ret = prepare_ipc_shm_seg(img, shm); ipc_shm_entry__free_unpacked(shm, NULL); if (ret < 0) { pr_err("Failed to prepare shm segment\n"); goto err; } } close_image(img); return 0; err: close_image(img); return ret; } static int prepare_ipc_var(int pid) { int ret; struct cr_img *img; IpcVarEntry *var; pr_info("Restoring IPC variables\n"); img = open_image(CR_FD_IPC_VAR, O_RSTR, pid); if (!img) return -1; ret = pb_read_one(img, &var, PB_IPC_VAR); close_image(img); if (ret <= 0) { pr_err("Failed to read IPC namespace variables\n"); return -EFAULT; } ret = ipc_sysctl_req(var, CTL_WRITE); ipc_var_entry__free_unpacked(var, NULL); 
if (ret < 0) { pr_err("Failed to prepare IPC namespace variables\n"); return -EFAULT; } return 0; } int prepare_ipc_ns(int pid) { int ret; pr_info("Restoring IPC namespace\n"); ret = prepare_ipc_var(pid); if (ret < 0) return ret; ret = prepare_ipc_shm(pid); if (ret < 0) return ret; ret = prepare_ipc_msg(pid); if (ret < 0) return ret; ret = prepare_ipc_sem(pid); if (ret < 0) return ret; return 0; } struct ns_desc ipc_ns_desc = NS_DESC_ENTRY(CLONE_NEWIPC, "ipc"); crac-criu-1.5.0/criu/irmap.c000066400000000000000000000230311471504326700156130ustar00rootroot00000000000000/* * IRMAP -- inode reverse mapping. * * Helps us to map inode number (and device) back to path * so that we can restore inotify/fanotify-s. * * Scanning _is_ slow, so we limit it with hints, which are * heuristically known places where notifies are typically put. */ #include #include #include #include #include #include #include #include "xmalloc.h" #include "irmap.h" #include "mount.h" #include "log.h" #include "util.h" #include "image.h" #include "stats.h" #include "pstree.h" #include "cr_options.h" #include "protobuf.h" #include "images/fsnotify.pb-c.h" #include "images/fh.pb-c.h" #undef LOG_PREFIX #define LOG_PREFIX "irmap: " #define IRMAP_CACHE_BITS 5 #define IRMAP_CACHE_SIZE (1 << IRMAP_CACHE_BITS) #define IRMAP_CACHE_MASK (IRMAP_CACHE_SIZE - 1) static inline int irmap_hashfn(unsigned int s_dev, unsigned long i_ino) { return (s_dev + i_ino) & IRMAP_CACHE_MASK; } struct irmap { unsigned int dev; unsigned long ino; char *path; struct irmap *next; bool revalidate; int nr_kids; struct irmap *kids; }; static struct irmap *cache[IRMAP_CACHE_SIZE]; static struct irmap hints[] = { { .path = "/etc", .nr_kids = -1, }, { .path = "/var/spool", .nr_kids = -1, }, { .path = "/var/log", .nr_kids = -1, }, { .path = "/usr/share/dbus-1/system-services", .nr_kids = -1 }, { .path = "/var/lib/polkit-1/localauthority", .nr_kids = -1 }, { .path = "/usr/share/polkit-1/actions", .nr_kids = -1 }, { .path = "/lib/udev", 
.nr_kids = -1, }, { .path = "/.", .nr_kids = 0, }, { .path = "/no-such-path", .nr_kids = -1, }, {}, }; /* * Update inode (and device) number and cache the entry */ static int irmap_update_stat(struct irmap *i) { struct stat st; int mntns_root; unsigned hv; if (i->ino) return 0; mntns_root = get_service_fd(ROOT_FD_OFF); pr_debug("Refresh stat for %s\n", i->path); if (fstatat(mntns_root, i->path + 1, &st, AT_SYMLINK_NOFOLLOW)) { pr_pwarn("Can't stat %s", i->path); return -1; } i->revalidate = false; i->dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); i->ino = st.st_ino; if (!S_ISDIR(st.st_mode)) i->nr_kids = 0; /* don't irmap_update_dir */ hv = irmap_hashfn(i->dev, i->ino); i->next = cache[hv]; cache[hv] = i; return 0; } /* * Update list of children, but don't cache any. Later * we'll scan them one-by-one and cache. */ static int irmap_update_dir(struct irmap *t) { int fd, nr = 0, mntns_root; DIR *dfd; struct dirent *de; if (t->nr_kids >= 0) return 0; mntns_root = get_service_fd(ROOT_FD_OFF); pr_debug("Refilling %s dir\n", t->path); fd = openat(mntns_root, t->path + 1, O_RDONLY); if (fd < 0) { pr_pwarn("Can't open %s", t->path); return -1; } dfd = fdopendir(fd); if (!dfd) { pr_perror("Can't opendir %s", t->path); close(fd); return -1; } errno = 0; while ((de = readdir(dfd)) != NULL) { struct irmap *k; if (dir_dots(de)) continue; nr++; if (xrealloc_safe(&t->kids, nr * sizeof(struct irmap))) goto out_err; k = &t->kids[nr - 1]; k->kids = NULL; /* for xrealloc above */ k->ino = 0; /* for irmap_update_stat */ k->nr_kids = -1; /* for irmap_update_dir */ k->path = xsprintf("%s/%s", t->path, de->d_name); if (!k->path) goto out_err; } if (errno) { pr_perror("Readdir failed"); goto out_err; } closedir(dfd); t->nr_kids = nr; return 0; out_err: xfree(t->kids); closedir(dfd); return -1; } static struct irmap *irmap_scan(struct irmap *t, unsigned int dev, unsigned long ino) { struct irmap *c; int i; if (irmap_update_stat(t)) return NULL; if (t->dev == dev && t->ino == ino) return 
t; if (irmap_update_dir(t)) return NULL; for (i = 0; i < t->nr_kids; i++) { c = irmap_scan(&t->kids[i], dev, ino); if (c) return c; } return NULL; } static int irmap_revalidate(struct irmap *c, struct irmap **p) { struct stat st; int mntns_root; mntns_root = get_service_fd(ROOT_FD_OFF); pr_debug("Revalidate stat for %s\n", c->path); if (fstatat(mntns_root, c->path + 1, &st, AT_SYMLINK_NOFOLLOW)) { /* File can be (re)moved, so just treat it as invalid */ pr_perror("Can't stat %s", c->path); goto invalid; } if (c->dev != MKKDEV(major(st.st_dev), minor(st.st_dev))) goto invalid; if (c->ino != st.st_ino) goto invalid; c->revalidate = false; return 0; invalid: pr_debug("\t%x:%lx is invalid\n", c->dev, c->ino); *p = c->next; xfree(c->path); xfree(c); return 1; } static bool doing_predump = false; char *irmap_lookup(unsigned int s_dev, unsigned long i_ino) { struct irmap *c, *h, **p; char *path = NULL; int hv; struct irmap_path_opt *o; pr_debug("Resolving %x:%lx path\n", s_dev, i_ino); /* * If we're in predump, then processes already run * and the root_item is already freed by that time. * But the root service fd is already set by the * irmap_predump_prep, so we just go ahead and scan. */ if (!doing_predump && __mntns_get_root_fd(root_item->pid->real) < 0) goto out; timing_start(TIME_IRMAP_RESOLVE); hv = irmap_hashfn(s_dev, i_ino); for (p = &cache[hv]; *p;) { c = *p; if (!(c->dev == s_dev && c->ino == i_ino)) { p = &(*p)->next; continue; } if (c->revalidate && irmap_revalidate(c, p)) continue; pr_debug("\tFound %s in cache\n", c->path); path = c->path; goto out; } /* Let's scan any user provided paths first; since the user told us * about them, hopefully they're more interesting than our hints. 
*/ list_for_each_entry(o, &opts.irmap_scan_paths, node) { c = irmap_scan(o->ir, s_dev, i_ino); if (c) { pr_debug("\tScanned %s\n", c->path); path = c->path; goto out; } } for (h = hints; h->path; h++) { pr_debug("Scanning %s hint\n", h->path); c = irmap_scan(h, s_dev, i_ino); if (c) { pr_debug("\tScanned %s\n", c->path); path = c->path; goto out; } } out: timing_stop(TIME_IRMAP_RESOLVE); return path; } /* * IRMAP pre-cache -- do early irmap scan on pre-dump to reduce * the freeze time on dump */ struct irmap_predump { unsigned int dev; unsigned long ino; FhEntry fh; struct irmap_predump *next; }; static struct irmap_predump *predump_queue; int irmap_queue_cache(unsigned int dev, unsigned long ino, FhEntry *fh) { struct irmap_predump *ip; ip = xmalloc(sizeof(*ip)); if (!ip) return -1; ip->dev = dev; ip->ino = ino; ip->fh = *fh; ip->fh.handle = xmemdup(fh->handle, FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); if (!ip->fh.handle) { xfree(ip); return -1; } pr_debug("Queue %x:%lx for pre-dump\n", dev, ino); ip->next = predump_queue; predump_queue = ip; return 0; } int irmap_predump_prep(void) { /* * Tasks are about to get released soon, but * we'll need to do FS scan for irmaps. In this * scan we will need to know the root dir tasks * live in. Need to make sure the respective fd * (service) is set to that root, so that the * scan works and doesn't race with the tasks * dying or changind root. */ doing_predump = true; return __mntns_get_root_fd(root_item->pid->real) < 0 ? 
-1 : 0; } int irmap_predump_run(void) { int ret = 0; struct cr_img *img; struct irmap_predump *ip; img = open_image_at(AT_FDCWD, CR_FD_IRMAP_CACHE, O_DUMP); if (!img) return -1; pr_info("Running irmap pre-dump\n"); for (ip = predump_queue; ip; ip = ip->next) { pr_debug("\tchecking %x:%lx\n", ip->dev, ip->ino); ret = check_open_handle(ip->dev, ip->ino, &ip->fh); if (ret) { pr_err("Failed to resolve %x:%lx\n", ip->dev, ip->ino); break; } if (ip->fh.path) { IrmapCacheEntry ic = IRMAP_CACHE_ENTRY__INIT; pr_info("Irmap cache %x:%lx -> %s\n", ip->dev, ip->ino, ip->fh.path); ic.dev = ip->dev; ic.inode = ip->ino; ic.path = ip->fh.path; ret = pb_write_one(img, &ic, PB_IRMAP_CACHE); if (ret) break; } } close_image(img); return ret; } static int irmap_cache_one(IrmapCacheEntry *ie) { struct irmap *ic; unsigned hv; ic = xmalloc(sizeof(*ic)); if (!ic) return -1; ic->dev = ie->dev; ic->ino = ie->inode; ic->path = xstrdup(ie->path); if (!ie->path) { xfree(ic); return -1; } ic->nr_kids = 0; /* * We've loaded entry from cache, thus we'll need to check * whether it's still valid when find it in cache. */ ic->revalidate = true; pr_debug("Pre-cache %x:%lx -> %s\n", ic->dev, ic->ino, ic->path); hv = irmap_hashfn(ic->dev, ic->ino); ic->next = cache[hv]; cache[hv] = ic; return 0; } static int open_irmap_cache(struct cr_img **img) { int dir = AT_FDCWD; pr_info("Searching irmap cache in work dir\n"); in: *img = open_image_at(dir, CR_FD_IRMAP_CACHE, O_RSTR); if (dir != AT_FDCWD) close(dir); if (empty_image(*img)) { close_image(*img); if (dir == AT_FDCWD) { pr_info("Searching irmap cache in parent\n"); if (open_parent(get_service_fd(IMG_FD_OFF), &dir)) return -1; if (dir >= 0) goto in; } pr_info("No irmap cache\n"); return 0; } if (!*img) return -1; pr_info("... 
done\n"); return 1; } int irmap_load_cache(void) { int ret; struct cr_img *img; ret = open_irmap_cache(&img); if (ret <= 0) return ret; pr_info("Loading irmap cache\n"); while (1) { IrmapCacheEntry *ic; ret = pb_read_one_eof(img, &ic, PB_IRMAP_CACHE); if (ret <= 0) break; ret = irmap_cache_one(ic); if (ret < 0) break; irmap_cache_entry__free_unpacked(ic, NULL); } close_image(img); return ret; } int irmap_scan_path_add(char *path) { struct irmap_path_opt *o; o = xzalloc(sizeof(*o)); if (!o) return -1; o->ir = xzalloc(sizeof(*o->ir)); if (!o->ir) { xfree(o); return -1; } o->ir->path = path; o->ir->nr_kids = -1; list_add_tail(&o->node, &opts.irmap_scan_paths); return 0; } crac-criu-1.5.0/criu/kcmp-ids.c000066400000000000000000000124661471504326700162240ustar00rootroot00000000000000#include #include #include #include "log.h" #include "xmalloc.h" #include "common/compiler.h" #include "common/bug.h" #include "rbtree.h" #include "kcmp-ids.h" /* * We track shared files by global rbtree, where each node might * be a root for subtree. The reason for that is the nature of data * we obtain from operating system. * * Basically OS provides us two ways to distinguish files * * - information obtained from fstat call * - shiny new sys_kcmp system call (which may compare the file descriptor * pointers inside the kernel and provide us order info) * * So, to speedup procedure of searching for shared file descriptors * we use both techniques. From fstat call we get that named general file * IDs (genid) which are carried in the main rbtree. * * In case if two genid are the same -- we need to use a second way and * call for sys_kcmp. Thus, if kernel tells us that files have identical * genid but in real they are different from kernel point of view -- we assign * a second unique key (subid) to such file descriptor and put it into a subtree. 
* * So the tree will look like * * (root) * genid-1 * / \ * genid-2 genid-3 * / \ / \ * * Where each genid node might be a sub-rbtree as well * * (genid-N) * / \ * subid-1 subid-2 * / \ / \ * * Carrying two rbtree at once allow us to minimize the number * of sys_kcmp syscalls, also to collect and dump file descriptors * in one pass. */ struct kid_entry { struct rb_node node; struct rb_root subtree_root; struct rb_node subtree_node; uint32_t subid; /* subid is always unique */ struct kid_elem elem; } __aligned(sizeof(long)); static struct kid_entry *alloc_kid_entry(struct kid_tree *tree, struct kid_elem *elem) { struct kid_entry *e; e = xmalloc(sizeof(*e)); if (!e) goto err; e->subid = tree->subid++; e->elem = *elem; /* Make sure no overflow here */ BUG_ON(!e->subid); rb_init_node(&e->node); rb_init_node(&e->subtree_node); e->subtree_root = RB_ROOT; rb_link_and_balance(&e->subtree_root, &e->subtree_node, NULL, &e->subtree_root.rb_node); err: return e; } static uint32_t kid_generate_sub(struct kid_tree *tree, struct kid_entry *e, struct kid_elem *elem, int *new_id) { struct rb_node *node = e->subtree_root.rb_node; struct kid_entry *sub = NULL; struct rb_node **new = &e->subtree_root.rb_node; struct rb_node *parent = NULL; BUG_ON(!node); while (node) { struct kid_entry *this = rb_entry(node, struct kid_entry, subtree_node); int ret = syscall(SYS_kcmp, this->elem.pid, elem->pid, tree->kcmp_type, this->elem.idx, elem->idx); parent = *new; if (ret == 1) node = node->rb_left, new = &((*new)->rb_left); else if (ret == 2) node = node->rb_right, new = &((*new)->rb_right); else if (ret == 0) return this->subid; else { pr_perror("kcmp failed: pid (%d %d) type %u idx (%u %u)", this->elem.pid, elem->pid, tree->kcmp_type, this->elem.idx, elem->idx); return 0; } } sub = alloc_kid_entry(tree, elem); if (!sub) return 0; rb_link_and_balance(&e->subtree_root, &sub->subtree_node, parent, new); *new_id = 1; return sub->subid; } uint32_t kid_generate_gen(struct kid_tree *tree, struct 
kid_elem *elem, int *new_id) { struct rb_node *node = tree->root.rb_node; struct kid_entry *e = NULL; struct rb_node **new = &tree->root.rb_node; struct rb_node *parent = NULL; while (node) { struct kid_entry *this = rb_entry(node, struct kid_entry, node); parent = *new; if (elem->genid < this->elem.genid) node = node->rb_left, new = &((*new)->rb_left); else if (elem->genid > this->elem.genid) node = node->rb_right, new = &((*new)->rb_right); else return kid_generate_sub(tree, this, elem, new_id); } e = alloc_kid_entry(tree, elem); if (!e) return 0; rb_link_and_balance(&tree->root, &e->node, parent, new); *new_id = 1; return e->subid; } static struct kid_elem *kid_lookup_epoll_tfd_sub(struct kid_tree *tree, struct kid_entry *e, struct kid_elem *elem, kcmp_epoll_slot_t *slot) { struct rb_node *node = e->subtree_root.rb_node; struct rb_node **new = &e->subtree_root.rb_node; BUG_ON(!node); while (node) { struct kid_entry *this = rb_entry(node, struct kid_entry, subtree_node); int ret = syscall(SYS_kcmp, this->elem.pid, elem->pid, KCMP_EPOLL_TFD, this->elem.idx, slot); if (ret == 1) node = node->rb_left, new = &((*new)->rb_left); else if (ret == 2) node = node->rb_right, new = &((*new)->rb_right); else if (ret == 0) return &this->elem; else { pr_perror("kcmp-epoll failed: pid (%d %d) type %u idx (%u %u)", this->elem.pid, elem->pid, KCMP_EPOLL_TFD, this->elem.idx, elem->idx); return NULL; } } return NULL; } struct kid_elem *kid_lookup_epoll_tfd(struct kid_tree *tree, struct kid_elem *elem, kcmp_epoll_slot_t *slot) { struct rb_node *node = tree->root.rb_node; struct rb_node **new = &tree->root.rb_node; while (node) { struct kid_entry *this = rb_entry(node, struct kid_entry, node); if (elem->genid < this->elem.genid) node = node->rb_left, new = &((*new)->rb_left); else if (elem->genid > this->elem.genid) node = node->rb_right, new = &((*new)->rb_right); else return kid_lookup_epoll_tfd_sub(tree, this, elem, slot); } return NULL; } 
crac-criu-1.5.0/criu/kerndat.c000066400000000000000000001246121471504326700161420ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) #include #endif #include #include "common/config.h" #include "int.h" #include "log.h" #include "restorer.h" #include "kerndat.h" #include "fs-magic.h" #include "mem.h" #include "common/compiler.h" #include "sysctl.h" #include "cr_options.h" #include "util.h" #include "lsm.h" #include "proc_parse.h" #include "sk-inet.h" #include "sockets.h" #include "net.h" #include "tun.h" #include #include #include "netfilter.h" #include "fsnotify.h" #include "linux/userfaultfd.h" #include "prctl.h" #include "uffd.h" #include "vdso.h" #include "kcmp.h" #include "sched.h" #include "memfd.h" #include "mount-v2.h" #include "util-caps.h" struct kerndat_s kdat = {}; volatile int dummy_var; static int check_pagemap(void) { int ret, fd, retry; u64 pfn = 0; fd = __open_proc(PROC_SELF, EPERM, O_RDONLY, "pagemap"); if (fd < 0) { if (errno == EPERM) { pr_info("Pagemap disabled\n"); kdat.pmap = PM_DISABLED; return 0; } return -1; } retry = 3; while (retry--) { ++dummy_var; /* Get the PFN of a page likely to be present. */ ret = pread(fd, &pfn, sizeof(pfn), PAGE_PFN((uintptr_t)&dummy_var) * sizeof(pfn)); if (ret != sizeof(pfn)) { pr_perror("Can't read pagemap"); close(fd); return -1; } /* The page can be swapped out by the time the read occurs, * in which case the rest of the bits are a swap type + offset * (which could be zero even if not hidden). * Retry if this happens. */ if (pfn & PME_PRESENT) break; pr_warn("got non-present PFN %#lx for the dummy data page; %s\n", (unsigned long)pfn, retry ? 
"retrying" : "giving up"); pfn = 0; } close(fd); if ((pfn & PME_PFRAME_MASK) == 0) { pr_info("Pagemap provides flags only\n"); kdat.pmap = PM_FLAGS_ONLY; } else { pr_info("Pagemap is fully functional\n"); kdat.pmap = PM_FULL; } return 0; } /* * Anonymous shared mappings are backed by hidden tmpfs * mount. Find out its dev to distinguish such mappings * from real tmpfs files maps. */ static int parse_self_maps(unsigned long vm_start, dev_t *device) { FILE *maps; char buf[1024]; maps = fopen_proc(PROC_SELF, "maps"); if (maps == NULL) return -1; while (fgets(buf, sizeof(buf), maps) != NULL) { char *end, *aux; unsigned long start; int maj, min; start = strtoul(buf, &end, 16); if (vm_start > start) continue; if (vm_start < start) break; /* It's ours */ aux = strchr(end + 1, ' '); /* end prot */ aux = strchr(aux + 1, ' '); /* prot pgoff */ aux = strchr(aux + 1, ' '); /* pgoff dev */ maj = strtoul(aux + 1, &end, 16); min = strtoul(end + 1, NULL, 16); *device = makedev(maj, min); fclose(maps); return 0; } fclose(maps); return -1; } static void kerndat_mmap_min_addr(void) { /* From kernel's default CONFIG_LSM_MMAP_MIN_ADDR */ static const unsigned long default_mmap_min_addr = 65536; uint64_t value; struct sysctl_req req[] = { { .name = "vm/mmap_min_addr", .arg = &value, .type = CTL_U64, }, }; if (sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0)) { pr_warn("Can't fetch %s value, use default %#lx\n", req[0].name, (unsigned long)default_mmap_min_addr); kdat.mmap_min_addr = default_mmap_min_addr; return; } if (value < default_mmap_min_addr) { pr_debug("Adjust mmap_min_addr %#lx -> %#lx\n", (unsigned long)value, (unsigned long)default_mmap_min_addr); kdat.mmap_min_addr = default_mmap_min_addr; } else kdat.mmap_min_addr = value; pr_debug("Found mmap_min_addr %#lx\n", (unsigned long)kdat.mmap_min_addr); } static int kerndat_files_stat(void) { static const uint32_t NR_OPEN_DEFAULT = 1024 * 1024; uint32_t nr_open; struct sysctl_req req[] = { { .name = "fs/nr_open", .arg = &nr_open, 
.type = CTL_U32, }, }; if (sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0)) { pr_warn("Can't fetch file_stat, using kernel defaults\n"); nr_open = NR_OPEN_DEFAULT; } kdat.sysctl_nr_open = nr_open; pr_debug("files stat: %s %u\n", req[0].name, kdat.sysctl_nr_open); return 0; } static int kerndat_get_dev(dev_t *dev, char *map, size_t size) { char maps[128]; struct stat buf; sprintf(maps, "/proc/self/map_files/%lx-%lx", (unsigned long)map, (unsigned long)map + size); if (stat(maps, &buf) < 0) { int e = errno; if (errno == EPERM) { /* * Kernel disables messing with map_files. * OK, let's go the slower route. */ if (parse_self_maps((unsigned long)map, dev) < 0) { pr_err("Can't read self maps\n"); return -1; } } else { pr_perror("Can't stat self map_files %d", e); return -1; } } else { *dev = buf.st_dev; } return 0; } static int kerndat_get_shmemdev(void) { void *map; dev_t dev; map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); if (map == MAP_FAILED) { pr_perror("Can't mmap memory for shmemdev test"); return -1; } if (kerndat_get_dev(&dev, map, PAGE_SIZE)) goto err; munmap(map, PAGE_SIZE); kdat.shmem_dev = dev; pr_info("Found anon-shmem device at %" PRIx64 "\n", kdat.shmem_dev); return 0; err: munmap(map, PAGE_SIZE); return -1; } /* Return -1 -- error * Return 0 -- successful but can't get any new device's numbers * Return 1 -- successful and get new device's numbers * * At first, all kdat.hugetlb_dev elements are initialized to 0. 
* When the function finishes, * kdat.hugetlb_dev[i] == -1 -- this hugetlb page size is not supported * kdat.hugetlb_dev[i] == 0 -- this hugetlb page size is supported but can't collect device's number * Otherwise, kdat.hugetlb_dev[i] contains the corresponding device's number * * Next time the function is called, it only tries to collect the device's number of hugetlb page size * that is supported but can't be collected in the previous call (kdat.hugetlb_dev[i] == 0) */ static int kerndat_get_hugetlb_dev(void) { void *map; int i, flag, ret = 0; unsigned long long size; dev_t dev; for (i = 0; i < HUGETLB_MAX; i++) { /* Skip if this hugetlb size is not supported or the device's number has been collected */ if (kdat.hugetlb_dev[i]) continue; size = hugetlb_info[i].size; flag = hugetlb_info[i].flag; map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | flag, 0, 0); if (map == MAP_FAILED) { if (errno == EINVAL) { kdat.hugetlb_dev[i] = (dev_t)-1; continue; } else if (errno == ENOMEM) { pr_info("Hugetlb size %llu Mb is supported but cannot get dev's number\n", size >> 20); continue; } else { pr_perror("Unexpected result when get hugetlb dev"); return -1; } } if (kerndat_get_dev(&dev, map, size)) { munmap(map, size); return -1; } munmap(map, size); kdat.hugetlb_dev[i] = dev; ret = 1; pr_info("Found hugetlb device at %" PRIx64 "\n", kdat.hugetlb_dev[i]); } return ret; } static dev_t get_host_dev(unsigned int which) { static struct kst { const char *name; const char *path; unsigned int magic; dev_t fs_dev; } kstat[KERNDAT_FS_STAT_MAX] = { [KERNDAT_FS_STAT_DEVPTS] = { .name = "devpts", .path = "/dev/pts", .magic = DEVPTS_SUPER_MAGIC, }, [KERNDAT_FS_STAT_DEVTMPFS] = { .name = "devtmpfs", .path = "/dev", .magic = TMPFS_MAGIC, }, [KERNDAT_FS_STAT_BINFMT_MISC] = { .name = "binfmt_misc", .path = "/proc/sys/fs/binfmt_misc", .magic = BINFMTFS_MAGIC, }, }; if (which >= KERNDAT_FS_STAT_MAX) { pr_err("Wrong fs type %u passed\n", which); return 0; } 
if (kstat[which].fs_dev == 0) { struct statfs fst; struct stat st; if (statfs(kstat[which].path, &fst)) { pr_perror("Unable to statefs %s", kstat[which].path); return 0; } /* * XXX: If the fs we need is not there, it still * may mean that it's virtualized, but just not * mounted on the host. */ if (fst.f_type != kstat[which].magic) { pr_err("%s isn't mount on the host\n", kstat[which].name); return 0; } if (stat(kstat[which].path, &st)) { pr_perror("Unable to stat %s", kstat[which].path); return 0; } BUG_ON(st.st_dev == 0); kstat[which].fs_dev = st.st_dev; } return kstat[which].fs_dev; } int kerndat_fs_virtualized(unsigned int which, u32 kdev) { dev_t host_fs_dev; host_fs_dev = get_host_dev(which); if (host_fs_dev == 0) return -1; return (kdev_to_odev(kdev) == host_fs_dev) ? 0 : 1; } /* * Check whether pagemap reports soft dirty bit. Kernel has * this functionality under CONFIG_MEM_SOFT_DIRTY option. */ static int kerndat_get_dirty_track(void) { char *map; int pm2; u64 pmap = 0; int ret = -1; map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (map == MAP_FAILED) { pr_perror("Can't mmap memory for pagemap test"); return ret; } /* * Kernel shows soft-dirty bits only if this soft-dirty * was at least once re-set. 
(this is to be removed in * a couple of kernel releases) */ ret = do_task_reset_dirty_track(getpid()); if (ret < 0) return ret; if (ret == 1) goto no_dt; ret = -1; pm2 = open_proc(PROC_SELF, "pagemap"); if (pm2 < 0) { munmap(map, PAGE_SIZE); return ret; } map[0] = '\0'; lseek(pm2, (unsigned long)map / PAGE_SIZE * sizeof(u64), SEEK_SET); ret = read(pm2, &pmap, sizeof(pmap)); if (ret < 0) pr_perror("Read pmap err!"); close(pm2); munmap(map, PAGE_SIZE); if (pmap & PME_SOFT_DIRTY) { pr_info("Dirty track supported on kernel\n"); kdat.has_dirty_track = true; } else { no_dt: pr_info("Dirty tracking support is OFF\n"); } return 0; } /* The page frame number (PFN) is constant for the zero page */ static int init_zero_page_pfn(void) { void *addr; int ret = 0; kdat.zero_page_pfn = -1; if (kdat.pmap != PM_FULL) { pr_info("Zero page detection failed, optimization turns off.\n"); return 0; } addr = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (addr == MAP_FAILED) { pr_perror("Unable to map zero page"); return 0; } if (*((int *)addr) != 0) { BUG(); return -1; } ret = vaddr_to_pfn(-1, (unsigned long)addr, &kdat.zero_page_pfn); munmap(addr, PAGE_SIZE); if (kdat.zero_page_pfn == 0) { pr_err("vaddr_to_pfn succeeded but kdat.zero_page_pfn is invalid.\n"); ret = -1; } return ret; } static int get_last_cap(void) { struct sysctl_req req[] = { { "kernel/cap_last_cap", &kdat.last_cap, CTL_U32 }, }; int ret; ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0); if (ret || kdat.last_cap < 32 * CR_CAP_SIZE) return ret; pr_err("Kernel reports more capabilities than this CRIU supports: %u > %u\n", kdat.last_cap, 32 * CR_CAP_SIZE - 1); return -1; } static bool kerndat_has_memfd_create(void) { int ret; ret = memfd_create(NULL, 0); if (ret == -1 && errno == ENOSYS) kdat.has_memfd = false; else if (ret == -1 && errno == EFAULT) kdat.has_memfd = true; else { pr_perror("Unexpected error from memfd_create(NULL, 0)"); return -1; } return 0; } static bool 
kerndat_has_memfd_hugetlb(void) { int ret; if (!kdat.has_memfd) { kdat.has_memfd_hugetlb = false; return 0; } ret = memfd_create("", MFD_HUGETLB); if (ret >= 0) { kdat.has_memfd_hugetlb = true; close(ret); } else if (ret == -1 && (errno == EINVAL || errno == ENOENT || errno == ENOSYS)) { kdat.has_memfd_hugetlb = false; } else { pr_perror("Unexpected error from memfd_create(\"\", MFD_HUGETLB)"); return -1; } return 0; } static int get_task_size(void) { kdat.task_size = compel_task_size(); pr_debug("Found task size of %lx\n", kdat.task_size); return 0; } static int kerndat_fdinfo_has_lock(void) { int fd, pfd = -1, exit_code = -1, len; char buf[PAGE_SIZE]; fd = open_proc(PROC_GEN, "locks"); if (fd < 0) return -1; if (flock(fd, LOCK_SH)) { pr_perror("Can't take a lock"); goto out; } pfd = open_proc(PROC_SELF, "fdinfo/%d", fd); if (pfd < 0) goto out; len = read(pfd, buf, sizeof(buf) - 1); if (len < 0) { pr_perror("Unable to read"); goto out; } buf[len] = 0; kdat.has_fdinfo_lock = (strstr(buf, "lock:") != NULL); exit_code = 0; out: close_safe(&pfd); close(fd); return exit_code; } static int get_ipv6(void) { if (access("/proc/sys/net/ipv6", F_OK) < 0) { if (errno == ENOENT) { pr_debug("ipv6 is disabled\n"); kdat.ipv6 = false; return 0; } pr_perror("Unable to access /proc/sys/net/ipv6"); return -1; } kdat.ipv6 = true; return 0; } static int kerndat_loginuid(void) { unsigned int saved_loginuid; int ret; kdat.luid = LUID_NONE; /* No such file: CONFIG_AUDITSYSCALL disabled */ saved_loginuid = parse_pid_loginuid(PROC_SELF, &ret, true); if (ret < 0) return 0; kdat.luid = LUID_READ; /* * From kernel v3.13-rc2 it's possible to unset loginuid value, * on that rely dump/restore code. 
* See also: marc.info/?l=git-commits-head&m=138509506407067 */ if (prepare_loginuid(INVALID_UID) < 0) return 0; /* Cleaning value back as it was */ if (prepare_loginuid(saved_loginuid) < 0) return 0; kdat.luid = LUID_FULL; return 0; } static int kerndat_iptables_has_xtlocks(void) { int fd; char *argv[4] = { "sh", "-c", "iptables -w -L", NULL }; fd = open("/dev/null", O_RDWR); if (fd < 0) { fd = -1; pr_perror("failed to open /dev/null, using log fd for xtlocks check"); } kdat.has_xtlocks = 1; if (cr_system(fd, fd, fd, "sh", argv, CRS_CAN_FAIL) == -1) kdat.has_xtlocks = 0; close_safe(&fd); return 0; } /* * Unfortunately in C htonl() is not constexpr and cannot be used in a static * initialization below. */ #define constant_htonl(x) \ (__BYTE_ORDER == __BIG_ENDIAN ? (x) : \ (((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \ (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) static int kerndat_tcp_repair(void) { static const struct sockaddr_in loopback_ip4 = { .sin_family = AF_INET, .sin_port = 0, .sin_addr = { constant_htonl(INADDR_LOOPBACK) }, }; static const struct sockaddr_in6 loopback_ip6 = { .sin6_family = AF_INET6, .sin6_port = 0, .sin6_addr = IN6ADDR_LOOPBACK_INIT, }; int sock, clnt = -1, yes = 1, exit_code = -1; const struct sockaddr *addr; struct sockaddr_storage listener_addr; socklen_t addrlen; addr = (const struct sockaddr *)&loopback_ip4; addrlen = sizeof(loopback_ip4); sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if (sock < 0 && errno == EAFNOSUPPORT) { addr = (const struct sockaddr *)&loopback_ip6; addrlen = sizeof(loopback_ip6); sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); } if (sock < 0) { pr_perror("Unable to create a socket"); return -1; } if (bind(sock, addr, addrlen)) { pr_perror("Unable to bind a socket"); goto err; } addrlen = sizeof(listener_addr); if (getsockname(sock, (struct sockaddr *)&listener_addr, &addrlen)) { pr_perror("Unable to get a socket name"); goto err; } if (listen(sock, 1)) { pr_perror("Unable to listen 
a socket"); goto err; } clnt = socket(addr->sa_family, SOCK_STREAM, IPPROTO_TCP); if (clnt < 0) { pr_perror("Unable to create a socket"); goto err; } if (connect(clnt, (const struct sockaddr *)&listener_addr, addrlen)) { pr_perror("Unable to connect a socket"); goto err; } if (shutdown(clnt, SHUT_WR)) { pr_perror("Unable to shutdown a socket"); goto err; } if (setsockopt(clnt, SOL_TCP, TCP_REPAIR, &yes, sizeof(yes))) { if (errno != EPERM) { pr_perror("Unable to set TCP_REPAIR with setsockopt"); goto err; } kdat.has_tcp_half_closed = false; } else kdat.has_tcp_half_closed = true; exit_code = 0; err: close_safe(&clnt); close(sock); return exit_code; } static int kerndat_nsid(void) { int nsid, sk; kdat.has_nsid = false; sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sk < 0) { pr_pwarn("Unable to create a netlink socket: NSID can't be used."); return 0; } if (net_get_nsid(sk, getpid(), &nsid) < 0) { pr_warn("NSID is not supported\n"); close(sk); return 0; } kdat.has_nsid = true; close(sk); return 0; } static int kerndat_compat_restore(void) { int ret; ret = kdat_can_map_vdso(); if (ret < 0) { pr_err("kdat_can_map_vdso failed\n"); return ret; } kdat.can_map_vdso = !!ret; /* depends on kdat.can_map_vdso result */ kdat.compat_cr = kdat_compatible_cr(); return 0; } static int kerndat_detect_stack_guard_gap(void) { int num, ret = -1, detected = 0; unsigned long start, end; char r, w, x, s; char buf[1024]; FILE *maps; void *mem; mem = mmap(NULL, (3ul << 20), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN, -1, 0); if (mem == MAP_FAILED) { pr_perror("Can't mmap stack area"); return -1; } munmap(mem, (3ul << 20)); mem = mmap(mem + (2ul << 20), (1ul << 20), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED | MAP_GROWSDOWN, -1, 0); if (mem == MAP_FAILED) { pr_perror("Can't mmap stack area"); return -1; } maps = fopen("/proc/self/maps", "r"); if (maps == NULL) { pr_perror("Could not open /proc/self/maps"); munmap(mem, 4096); return -1; } 
while (fgets(buf, sizeof(buf), maps)) { num = sscanf(buf, "%lx-%lx %c%c%c%c", &start, &end, &r, &w, &x, &s); if (num < 6) { pr_err("Can't parse: %s\n", buf); goto err; } /* * When reading /proc/$pid/[s]maps the * start/end addresses might be cut off * with PAGE_SIZE on kernels prior 4.12 * (see kernel commit 1be7107fbe18ee). * * Same time there was semi-complete * patch released which hitted a number * of repos (Ubuntu, Fedora) where instead * of PAGE_SIZE the 1M gap is cut off. */ if (start == (unsigned long)mem) { kdat.stack_guard_gap_hidden = false; detected = 1; break; } else if (start == ((unsigned long)mem + (1ul << 20))) { pr_warn("Unsupported stack guard detected, confused but continue\n"); kdat.stack_guard_gap_hidden = true; detected = 1; break; } else if (start == ((unsigned long)mem + PAGE_SIZE)) { kdat.stack_guard_gap_hidden = true; detected = 1; break; } } if (detected) ret = 0; err: munmap(mem, (1ul << 20)); fclose(maps); return ret; } static int kerndat_has_inotify_setnextwd(void) { int ret = 0; int fd; fd = inotify_init(); if (fd < 0) { pr_perror("Can't create inotify"); return -1; } if (ioctl(fd, INOTIFY_IOC_SETNEXTWD, 0x10)) { if (errno != ENOTTY) { pr_perror("Can't call ioctl"); ret = -1; } } else kdat.has_inotify_setnextwd = true; close(fd); return ret; } static int kerndat_has_fsopen(void) { if (syscall(__NR_fsopen, NULL, -1) != -1) { pr_err("fsopen should fail\n"); return -1; } if (errno == ENOSYS) pr_info("The new mount API (fsopen, fsmount) isn't supported\n"); else kdat.has_fsopen = true; return 0; } static int has_kcmp_epoll_tfd(void) { kcmp_epoll_slot_t slot = {}; int ret = -1, efd, tfd; pid_t pid = getpid(); struct epoll_event ev; int pipefd[2]; efd = epoll_create(1); if (efd < 0) { pr_perror("Can't create epoll"); return -1; } memset(&ev, 0xff, sizeof(ev)); ev.events = EPOLLIN | EPOLLOUT; if (pipe(pipefd)) { pr_perror("Can't create pipe"); close(efd); return -1; } tfd = pipefd[0]; if (epoll_ctl(efd, EPOLL_CTL_ADD, tfd, &ev)) { 
pr_perror("Can't add event"); goto out; } slot.efd = efd; slot.tfd = tfd; if (syscall(SYS_kcmp, pid, pid, KCMP_EPOLL_TFD, tfd, &slot) == 0) kdat.has_kcmp_epoll_tfd = true; else kdat.has_kcmp_epoll_tfd = false; ret = 0; out: close(pipefd[0]); close(pipefd[1]); close(efd); return ret; } static int has_time_namespace(void) { #if 0 if (access("/proc/self/timens_offsets", F_OK) < 0) { if (errno == ENOENT) { pr_debug("Time namespaces are not supported.\n"); kdat.has_timens = false; return 0; } pr_perror("Unable to access /proc/self/timens_offsets"); return -1; } kdat.has_timens = true; return 0; #else kdat.has_timens = false; return 0; #endif } int __attribute__((weak)) kdat_x86_has_ptrace_fpu_xsave_bug(void) { return 0; } static int kerndat_x86_has_ptrace_fpu_xsave_bug(void) { int ret = kdat_x86_has_ptrace_fpu_xsave_bug(); if (ret < 0) { pr_err("kdat_x86_has_ptrace_fpu_xsave_bug failed\n"); return ret; } kdat.x86_has_ptrace_fpu_xsave_bug = !!ret; return 0; } static int kerndat_has_rseq(void) { if (syscall(__NR_rseq, NULL, 0, 0, 0) != -1) { pr_err("rseq should fail\n"); return -1; } if (errno == ENOSYS) pr_info("rseq syscall isn't supported\n"); else kdat.has_rseq = true; return 0; } static int kerndat_has_ptrace_get_rseq_conf(void) { pid_t pid; int len; struct __ptrace_rseq_configuration rseq; int ret = 0; pid = fork_and_ptrace_attach(NULL); if (pid < 0) { kdat.has_ptrace_get_rseq_conf = false; pr_warn("Can't detect has_ptrace_get_rseq_conf\n"); return 0; } len = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, pid, sizeof(rseq), &rseq); if (len != sizeof(rseq)) { if (kdat.has_ptrace_get_rseq_conf) ret = 1; /* we should update kdat */ kdat.has_ptrace_get_rseq_conf = false; pr_info("ptrace(PTRACE_GET_RSEQ_CONFIGURATION) is not supported\n"); goto out; } /* * flags is always zero from the kernel side, if it will be changed * we need to pay attention to that and, possibly, make changes on the CRIU side. 
*/ if (rseq.flags != 0) { if (kdat.has_ptrace_get_rseq_conf) ret = 1; /* we should update kdat */ kdat.has_ptrace_get_rseq_conf = false; pr_err("ptrace(PTRACE_GET_RSEQ_CONFIGURATION): rseq.flags != 0\n"); } else { if (!kdat.has_ptrace_get_rseq_conf) ret = 1; /* we should update kdat */ kdat.has_ptrace_get_rseq_conf = true; if (memcmp(&kdat.libc_rseq_conf, &rseq, sizeof(rseq))) ret = 1; /* we should update kdat */ kdat.libc_rseq_conf = rseq; } out: kill(pid, SIGKILL); waitpid(pid, NULL, 0); return ret; } int kerndat_sockopt_buf_lock(void) { int exit_code = -1; socklen_t len; u32 buf_lock; int sock; sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if (sock < 0 && errno == EAFNOSUPPORT) sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); if (sock < 0) { pr_perror("Unable to create a socket"); return -1; } len = sizeof(buf_lock); if (getsockopt(sock, SOL_SOCKET, SO_BUF_LOCK, &buf_lock, &len)) { if (errno != ENOPROTOOPT) { pr_perror("Unable to get SO_BUF_LOCK with getsockopt"); goto err; } kdat.has_sockopt_buf_lock = false; } else kdat.has_sockopt_buf_lock = true; exit_code = 0; err: close(sock); return exit_code; } static int kerndat_has_move_mount_set_group(void) { char tmpdir[] = "/tmp/.criu.move_mount_set_group.XXXXXX"; char subdir[64]; int exit_code = -1; if (mkdtemp(tmpdir) == NULL) { pr_perror("Fail to make dir %s", tmpdir); return -1; } if (mount("criu.move_mount_set_group", tmpdir, "tmpfs", 0, NULL)) { pr_warn("Fail to mount tmfps to %s: %m\n", tmpdir); kdat.has_move_mount_set_group = false; rmdir(tmpdir); return 0; } if (mount(NULL, tmpdir, NULL, MS_PRIVATE, NULL)) { pr_perror("Fail to make %s private", tmpdir); goto out; } if (snprintf(subdir, sizeof(subdir), "%s/subdir", tmpdir) >= sizeof(subdir)) { pr_err("Fail to snprintf subdir\n"); goto out; } if (mkdir(subdir, 0700)) { pr_perror("Fail to make dir %s", subdir); goto out; } if (mount(subdir, subdir, NULL, MS_BIND, NULL)) { pr_perror("Fail to make bind-mount %s", subdir); goto out; } if (mount(NULL, 
tmpdir, NULL, MS_SHARED, NULL)) { pr_perror("Fail to make %s private", tmpdir); goto out; } if (sys_move_mount(AT_FDCWD, tmpdir, AT_FDCWD, subdir, MOVE_MOUNT_SET_GROUP)) { if (errno == EINVAL || errno == ENOSYS) { pr_debug("No MOVE_MOUNT_SET_GROUP kernel feature\n"); kdat.has_move_mount_set_group = false; exit_code = 0; goto out; } pr_perror("Fail to MOVE_MOUNT_SET_GROUP"); goto out; } kdat.has_move_mount_set_group = true; exit_code = 0; out: if (umount2(tmpdir, MNT_DETACH)) pr_warn("Fail to umount2 %s: %s\n", tmpdir, strerror(errno)); if (rmdir(tmpdir)) pr_warn("Fail to rmdir %s: %s\n", tmpdir, strerror(errno)); return exit_code; } static int kerndat_has_openat2(void) { if (sys_openat2(AT_FDCWD, ".", NULL, 0) != -1) { pr_err("openat2 should fail\n"); return -1; } if (errno == ENOSYS) { pr_debug("No openat2 syscall support\n"); kdat.has_openat2 = false; } else { kdat.has_openat2 = true; } return 0; } #define KERNDAT_CACHE_NAME "criu.kdat" #define KERNDAT_CACHE_FILE KDAT_RUNDIR "/" KERNDAT_CACHE_NAME /* * Returns: * -1 if kdat_file was not written due to error * 0 if kdat_file was written * 1 if kdat_file was not written because cache directory undefined in env (non-root mode) */ static int get_kerndat_filename(char **kdat_file) { int ret; /* * Running as non-root, even with CAP_CHECKPOINT_RESTORE, does not * allow to write to KDAT_RUNDIR which usually is only writable by root. * Let's write criu.kdat file to XDG_RUNTIME_DIR for non-root cases. * Note that XDG_RUNTIME_DIR is not always defined (e.g. when executing * via su/sudo). */ if (opts.unprivileged) { const char *cache_dir = getenv("XDG_RUNTIME_DIR"); if (!cache_dir) { pr_warn("$XDG_RUNTIME_DIR not set. 
Cannot find location for kerndat file\n"); return 1; } ret = asprintf(kdat_file, "%s/%s", cache_dir, KERNDAT_CACHE_NAME); } else { ret = asprintf(kdat_file, "%s", KERNDAT_CACHE_FILE); } if (unlikely(ret < 0)) { pr_warn("Cannot allocate memory for kerndat file name\n"); return -1; } return 0; } /* * Returns: * -1 if error * 0 if cache was loaded * 1 if cache does not exist or is stale or cache directory undefined in env (non-root mode) */ static int kerndat_try_load_cache(void) { cleanup_free char *kdat_file = NULL; int fd, ret; ret = get_kerndat_filename(&kdat_file); if (ret) return ret; fd = open(kdat_file, O_RDONLY); if (fd < 0) { if (ENOENT == errno) pr_debug("File %s does not exist\n", kdat_file); else pr_warn("Can't load %s\n", kdat_file); return 1; } ret = read(fd, &kdat, sizeof(kdat)); if (ret < 0) { pr_perror("Can't read kdat cache"); close(fd); return -1; } close(fd); if (ret != sizeof(kdat) || kdat.magic1 != KDAT_MAGIC || kdat.magic2 != KDAT_MAGIC_2) { pr_warn("Stale %s file\n", kdat_file); unlink(kdat_file); return 1; } pr_info("Loaded kdat cache from %s\n", kdat_file); return 0; } static void kerndat_save_cache(void) { int fd, ret; struct statfs s; cleanup_free char *kdat_file = NULL; cleanup_free char *kdat_file_tmp = NULL; pr_debug("skip kerndat_save_cache\n"); return; if (get_kerndat_filename(&kdat_file)) return; ret = asprintf(&kdat_file_tmp, "%s.tmp", kdat_file); if (unlikely(ret < 0)) { pr_warn("Cannot allocate memory for kerndat file name\n"); return; } fd = open(kdat_file_tmp, O_CREAT | O_EXCL | O_WRONLY, 0600); if (fd < 0) /* * It can happen that we race with some other criu * instance. That's OK, just ignore this error and * proceed. */ return; /* * If running as root we store the cache file on a tmpfs (/run), * because the file should be gone after reboot. */ if (fstatfs(fd, &s) < 0 || s.f_type != TMPFS_MAGIC) { pr_warn("Can't keep kdat cache on non-tempfs\n"); close(fd); goto unl; } /* * One magic to make sure we're reading the kdat file. 
* One more magic to make somehow sure we don't read kdat * from some other criu */ kdat.magic1 = KDAT_MAGIC; kdat.magic2 = KDAT_MAGIC_2; ret = write(fd, &kdat, sizeof(kdat)); close(fd); if (ret == sizeof(kdat)) ret = rename(kdat_file_tmp, kdat_file); else { ret = -1; errno = EIO; } if (ret < 0) { pr_perror("Couldn't save %s", kdat_file); unl: unlink(kdat_file); } } static int kerndat_uffd(void) { int uffd, err = 0; if (opts.unprivileged) /* * If running as non-root uffd_open() fails with * 'Operation not permitted'. Just ignore uffd for * non-root for now. */ return 0; kdat.uffd_features = 0; uffd = uffd_open(0, &kdat.uffd_features, &err); /* * err == ENOSYS means userfaultfd is not supported on this system and * we just happily return with kdat.has_uffd = false. * err == EPERM means that userfaultfd is not allowed as we are * non-root user, so we also return with kdat.has_uffd = false. * Errors other than ENOSYS and EPERM would mean "Houston, Houston, we * have a problem!" */ if (uffd < 0) { if (err == ENOSYS || err == EPERM) return 0; if (err == EPERM) { pr_info("Lazy pages are not permitted\n"); return 0; } pr_err("Lazy pages are not available\n"); return -1; } kdat.has_uffd = true; /* * we have to close the uffd and reopen in later in restorer * to enable non-cooperative features */ close(uffd); return 0; } int kerndat_has_thp_disable(void) { struct bfd f; void *addr; char *str; int ret = -1; bool vma_match = false; if (prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0)) { if (errno != EINVAL && errno != EPERM) { pr_perror("prctl PR_SET_THP_DISABLE failed"); return -1; } pr_info("PR_SET_THP_DISABLE is not available\n"); return 0; } addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (addr == MAP_FAILED) { pr_perror("Can't mmap memory for THP disable test"); return -1; } if (prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0)) { pr_perror("prctl PR_SET_THP_DISABLE failed"); goto out_unmap; } f.fd = open("/proc/self/smaps", O_RDONLY); if (f.fd < 0) { 
pr_perror("Can't open /proc/self/smaps"); goto out_unmap; } if (bfdopenr(&f)) goto out_unmap; while ((str = breadline(&f)) != NULL) { if (IS_ERR(str)) goto out_close; if (is_vma_range_fmt(str)) { unsigned long vma_addr; if (sscanf(str, "%lx-", &vma_addr) != 1) { pr_err("Can't parse: %s\n", str); goto out_close; } if (vma_addr == (unsigned long)addr) vma_match = true; } if (vma_match && !strncmp(str, "VmFlags: ", 9)) { u32 flags = 0; u64 madv = 0; int io_pf = 0; parse_vmflags(str, &flags, &madv, &io_pf); kdat.has_thp_disable = !(madv & (1 << MADV_NOHUGEPAGE)); if (!kdat.has_thp_disable) pr_warn("prctl PR_SET_THP_DISABLE sets MADV_NOHUGEPAGE\n"); break; } } ret = 0; out_close: bclose(&f); out_unmap: munmap(addr, PAGE_SIZE); return ret; } static int kerndat_tun_netns(void) { return check_tun_netns_cr(&kdat.tun_ns); } static bool kerndat_has_clone3_set_tid(void) { pid_t pid; struct _clone_args args = {}; #if defined(CONFIG_MIPS) /* * Currently the CRIU PIE assembler clone3() wrapper is * not implemented for MIPS. */ kdat.has_clone3_set_tid = false; return 0; #endif args.set_tid = -1; /* * On a system without clone3() this will return ENOSYS. * On a system with clone3() but without set_tid this * will return E2BIG. * On a system with clone3() and set_tid it will return * EINVAL. 
*/ pid = syscall(__NR_clone3, &args, sizeof(args)); if (pid != -1) { pr_err("Unexpected success: clone3() returned %d\n", pid); return -1; } if (errno == ENOSYS || errno == E2BIG) return 0; if (errno != EINVAL) { pr_pwarn("Unexpected error from clone3"); return 0; } if (errno != EPERM) kdat.has_clone3_set_tid = true; return 0; } static void kerndat_has_pidfd_open(void) { int pidfd; pidfd = syscall(SYS_pidfd_open, getpid(), 0); if (pidfd == -1) kdat.has_pidfd_open = false; else kdat.has_pidfd_open = true; close_safe(&pidfd); } static int kerndat_has_pidfd_getfd(void) { int ret; int fds[2]; int val_a, val_b; int pidfd, stolen_fd; ret = 0; if (socketpair(AF_UNIX, SOCK_DGRAM, 0, fds)) { pr_perror("Can't open unix socket pair"); ret = -1; goto out; } val_a = 1984; if (write(fds[0], &val_a, sizeof(val_a)) != sizeof(val_a)) { pr_perror("Can't write to socket"); ret = -1; goto close_pair; } pidfd = syscall(SYS_pidfd_open, getpid(), 0); if (pidfd == -1) { pr_warn("Can't get pidfd\n"); /* * If pidfd_open is not supported then pidfd_getfd * will not be supported as well. */ kdat.has_pidfd_getfd = false; goto close_pair; } stolen_fd = syscall(SYS_pidfd_getfd, pidfd, fds[1], 0); if (stolen_fd == -1) { kdat.has_pidfd_getfd = false; goto close_all; } if (read(fds[1], &val_b, sizeof(val_b)) != sizeof(val_b)) { pr_perror("Can't read from socket"); ret = -1; goto close_all; } if (val_b == val_a) { kdat.has_pidfd_getfd = true; } else { /* If val_b != val_a, something unexpected happened. 
*/ pr_err("Unexpected value read from socket\n"); ret = -1; } close_all: close_safe(&stolen_fd); close_safe(&pidfd); close_pair: close(fds[0]); close(fds[1]); out: return ret; } int kerndat_has_nspid(void) { struct bfd f; int ret = -1; char *str; f.fd = open("/proc/self/status", O_RDONLY); if (f.fd < 0) { pr_perror("Can't open /proc/self/status"); return -1; } if (bfdopenr(&f)) return -1; while ((str = breadline(&f)) != NULL) { if (IS_ERR(str)) goto close; if (!strncmp(str, "NSpid:", 6)) { kdat.has_nspid = true; break; } } ret = 0; close: bclose(&f); return ret; } #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) static int __has_nftables_concat(void *arg) { bool *has = (bool *)arg; struct nft_ctx *nft; int ret = 1; /* * Create a separate network namespace to avoid * collisions between two CRIU instances. */ if (unshare(CLONE_NEWNET)) { pr_perror("Unable create a network namespace"); return 1; } nft = nft_ctx_new(NFT_CTX_DEFAULT); if (!nft) return 1; if (NFT_RUN_CMD(nft, "create table inet CRIU")) { pr_err("Can't create nftables table\n"); goto nft_ctx_free_out; } if (NFT_RUN_CMD(nft, "add set inet CRIU conn { type ipv4_addr . 
inet_service ;}")) *has = false; /* kdat.has_nftables_concat = false */ else *has = true; /* kdat.has_nftables_concat = true */ /* Clean up */ NFT_RUN_CMD(nft, "delete table inet CRIU"); ret = 0; nft_ctx_free_out: nft_ctx_free(nft); return ret; } #endif static int kerndat_has_nftables_concat(void) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) bool has; if (call_in_child_process(__has_nftables_concat, (void *)&has)) return -1; kdat.has_nftables_concat = has; return 0; #else pr_warn("CRIU was built without libnftables support\n"); kdat.has_nftables_concat = false; return 0; #endif } #ifndef IPV6_FREEBIND #define IPV6_FREEBIND 78 #endif static int __kerndat_has_ipv6_freebind(int sk) { int val = 1; if (setsockopt(sk, SOL_IPV6, IPV6_FREEBIND, &val, sizeof(int)) == -1) { if (errno == ENOPROTOOPT) { kdat.has_ipv6_freebind = false; return 0; } pr_perror("Unable to setsockopt ipv6_freebind"); return -1; } kdat.has_ipv6_freebind = true; return 0; } static int kerndat_has_ipv6_freebind(void) { int sk, ret; if (!kdat.ipv6) { kdat.has_ipv6_freebind = false; return 0; } sk = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); if (sk == -1) { pr_perror("Unable to create a ipv6 dgram socket"); return -1; } ret = __kerndat_has_ipv6_freebind(sk); close(sk); return ret; } #define MEMBARRIER_CMDBIT_GET_REGISTRATIONS 9 static int kerndat_has_membarrier_get_registrations(void) { int ret = syscall(__NR_membarrier, 1 << MEMBARRIER_CMDBIT_GET_REGISTRATIONS, 0); if (ret < 0) { if (errno != EINVAL) { return ret; } kdat.has_membarrier_get_registrations = false; } else { kdat.has_membarrier_get_registrations = true; } return 0; } /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the * availability of those features at the first time we run kerndat * check. So in later kerndat checks, we need to retry to get those * information. This function contains calls to those kerndat checks. 
* * Those kerndat checks must * Return -1 on error * Return 0 when the check is successful but no new information * Return 1 when the check is successful and there is new information */ int kerndat_try_load_new(void) { int ret; ret = kerndat_get_hugetlb_dev(); if (ret < 0) return ret; ret = kerndat_has_ptrace_get_rseq_conf(); if (ret < 0) { pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); return ret; } /* New information is found, we need to save to the cache */ if (ret) kerndat_save_cache(); return 0; } static int root_only_init(void) { int ret = 0; if (opts.unprivileged) return 0; if (!ret && kerndat_loginuid()) { pr_err("kerndat_loginuid failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_tun_netns()) { pr_err("kerndat_tun_netns failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_socket_unix_file()) { pr_err("kerndat_socket_unix_file failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_link_nsid()) { pr_err("kerndat_link_nsid failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_socket_netns()) { pr_err("kerndat_socket_netns failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_has_nftables_concat()) { pr_err("kerndat_has_nftables_concat failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_has_move_mount_set_group()) { pr_err("kerndat_has_move_mount_set_group failed when initializing kerndat.\n"); ret = -1; } return ret; } int kerndat_init(void) { int ret; ret = kerndat_try_load_cache(); if (ret < 0) return ret; if (ret == 0) return kerndat_try_load_new(); ret = 0; /* kerndat_try_load_cache can leave some trash in kdat */ memset(&kdat, 0, sizeof(kdat)); preload_socket_modules(); if (!opts.unprivileged) /* * This uses 'iptables -L' to implicitly load necessary modules. * If the non nft backed iptables is used it does a * openat(AT_FDCWD, "/run/xtables.lock", O_RDONLY|O_CREAT, 0600) = -1 EACCES * which will fail as non-root. 
There are no capabilities to * change this. The iptables nft backend fails with * openat(AT_FDCWD, "/proc/net/ip_tables_names", O_RDONLY) = -1 EACCES */ preload_netfilter_modules(); if (check_pagemap()) { pr_err("check_pagemap failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_get_shmemdev()) { pr_err("kerndat_get_shmemdev failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_get_hugetlb_dev() < 0) { pr_err("kerndat_get_hugetlb_dev failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_get_dirty_track()) { pr_err("kerndat_get_dirty_track failed when initializing kerndat.\n"); ret = -1; } if (!ret && init_zero_page_pfn()) { pr_err("init_zero_page_pfn failed when initializing kerndat.\n"); ret = -1; } if (!ret && get_last_cap()) { pr_err("get_last_cap failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_fdinfo_has_lock()) { pr_err("kerndat_fdinfo_has_lock failed when initializing kerndat.\n"); ret = -1; } if (!ret && get_task_size()) { pr_err("get_task_size failed when initializing kerndat.\n"); ret = -1; } if (!ret && get_ipv6()) { pr_err("get_ipv6 failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_nsid()) { pr_err("kerndat_nsid failed when initializing kerndat.\n"); ret = -1; } if (!ret && root_only_init()) ret = -1; if (!ret && kerndat_iptables_has_xtlocks()) { pr_err("kerndat_iptables_has_xtlocks failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_tcp_repair()) { pr_err("kerndat_tcp_repair failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_compat_restore()) { pr_err("kerndat_compat_restore failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_has_memfd_create()) { pr_err("kerndat_has_memfd_create failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_has_memfd_hugetlb()) { pr_err("kerndat_has_memfd_hugetlb failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_detect_stack_guard_gap()) { 
pr_err("kerndat_detect_stack_guard_gap failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_uffd()) { pr_err("kerndat_uffd failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_has_thp_disable()) { pr_err("kerndat_has_thp_disable failed when initializing kerndat.\n"); ret = -1; } /* Needs kdat.compat_cr filled before */ if (!ret && kerndat_vdso_fill_symtable()) { pr_err("kerndat_vdso_fill_symtable failed when initializing kerndat.\n"); ret = -1; } /* Depends on kerndat_vdso_fill_symtable() */ if (!ret && kerndat_vdso_preserves_hint()) { pr_err("kerndat_vdso_preserves_hint failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_x86_has_ptrace_fpu_xsave_bug()) { pr_err("kerndat_x86_has_ptrace_fpu_xsave_bug failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_has_inotify_setnextwd()) { pr_err("kerndat_has_inotify_setnextwd failed when initializing kerndat.\n"); ret = -1; } if (!ret && has_kcmp_epoll_tfd()) { pr_err("has_kcmp_epoll_tfd failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_has_fsopen()) { pr_err("kerndat_has_fsopen failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_has_clone3_set_tid()) { pr_err("kerndat_has_clone3_set_tid failed when initializing kerndat.\n"); ret = -1; } if (!ret && has_time_namespace()) { pr_err("has_time_namespace failed when initializing kerndat.\n"); ret = -1; } if (!ret && (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) && kerndat_has_newifindex()) { pr_err("kerndat_has_newifindex failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_has_pidfd_getfd()) { pr_err("kerndat_has_pidfd_getfd failed when initializing kerndat.\n"); ret = -1; } if (!ret) kerndat_has_pidfd_open(); if (!ret && kerndat_has_nspid()) { pr_err("kerndat_has_nspid failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_sockopt_buf_lock()) { pr_err("kerndat_sockopt_buf_lock failed when initializing kerndat.\n"); ret = -1; 
} if (!ret && kerndat_has_openat2()) { pr_err("kerndat_has_openat2 failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_has_rseq()) { pr_err("kerndat_has_rseq failed when initializing kerndat.\n"); ret = -1; } if (!ret && (kerndat_has_ptrace_get_rseq_conf() < 0)) { pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); ret = -1; } if (!ret && (kerndat_has_ipv6_freebind() < 0)) { pr_err("kerndat_has_ipv6_freebind failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_has_membarrier_get_registrations()) { pr_err("kerndat_has_membarrier_get_registrations failed when initializing kerndat.\n"); ret = -1; } kerndat_lsm(); kerndat_mmap_min_addr(); kerndat_files_stat(); if (!ret) kerndat_save_cache(); return ret; } crac-criu-1.5.0/criu/libnetlink.c000066400000000000000000000122021471504326700166340ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "libnetlink.h" #include "util.h" static int nlmsg_receive(char *buf, int len, int (*cb)(struct nlmsghdr *, struct ns_id *ns, void *), int (*err_cb)(int, struct ns_id *, void *), struct ns_id *ns, void *arg) { struct nlmsghdr *hdr; for (hdr = (struct nlmsghdr *)buf; NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) { if (hdr->nlmsg_seq != CR_NLMSG_SEQ) continue; if (hdr->nlmsg_type == NLMSG_DONE) { int *length = (int *)NLMSG_DATA(hdr); if (*length < 0) return err_cb(*length, ns, arg); return 0; } if (hdr->nlmsg_type == NLMSG_ERROR) { struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(hdr); if (hdr->nlmsg_len - sizeof(*hdr) < sizeof(struct nlmsgerr)) { pr_err("ERROR truncated\n"); return -1; } if (err->error == 0) return 0; return err_cb(err->error, ns, arg); } if (cb(hdr, ns, arg)) return -1; } return 1; } /* * Default error handler: just point our an error * and pass up to caller. 
*/ static int rtnl_return_err(int err, struct ns_id *ns, void *arg) { errno = -err; pr_perror("%d reported by netlink", err); return err; } int do_rtnl_req(int nl, void *req, int size, int (*receive_callback)(struct nlmsghdr *h, struct ns_id *ns, void *), int (*error_callback)(int err, struct ns_id *ns, void *arg), struct ns_id *ns, void *arg) { struct msghdr msg; struct sockaddr_nl nladdr; struct iovec iov; static char buf[16384]; int err; if (!error_callback) error_callback = rtnl_return_err; memset(&msg, 0, sizeof(msg)); msg.msg_name = &nladdr; msg.msg_namelen = sizeof(nladdr); msg.msg_iov = &iov; msg.msg_iovlen = 1; memset(&nladdr, 0, sizeof(nladdr)); nladdr.nl_family = AF_NETLINK; iov.iov_base = req; iov.iov_len = size; if (sendmsg(nl, &msg, 0) < 0) { err = -errno; pr_perror("Can't send request message"); goto err; } iov.iov_base = buf; iov.iov_len = sizeof(buf); while (1) { memset(&msg, 0, sizeof(msg)); msg.msg_name = &nladdr; msg.msg_namelen = sizeof(nladdr); msg.msg_iov = &iov; msg.msg_iovlen = 1; err = recvmsg(nl, &msg, 0); if (err < 0) { if (errno == EINTR) continue; else { err = -errno; pr_perror("Error receiving nl report"); goto err; } } if (err == 0) break; if (msg.msg_flags & MSG_TRUNC) { pr_err("Message truncated\n"); err = -EMSGSIZE; goto err; } err = nlmsg_receive(buf, err, receive_callback, error_callback, ns, arg); if (err < 0) goto err; if (err == 0) break; } return 0; err: return err; } int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data, int alen) { int len = nla_attr_size(alen); struct rtattr *rta; if (NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len) > maxlen) { pr_err("addattr_l ERROR: message exceeded bound of %d\n", maxlen); return -1; } rta = NLMSG_TAIL(n); rta->rta_type = type; rta->rta_len = len; memcpy(RTA_DATA(rta), data, alen); n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len); return 0; } /* * Here is a workaround for a bug in libnl-3: * 6a8d90f5fec4 "attr: Allow attribute type 0 */ /** * Create attribute 
index based on a stream of attributes. * @arg tb Index array to be filled (maxtype+1 elements). * @arg maxtype Maximum attribute type expected and accepted. * @arg head Head of attribute stream. * @arg len Length of attribute stream. * @arg policy Attribute validation policy. * * Iterates over the stream of attributes and stores a pointer to each * attribute in the index array using the attribute type as index to * the array. Attribute with a type greater than the maximum type * specified will be silently ignored in order to maintain backwards * compatibility. If \a policy is not NULL, the attribute will be * validated using the specified policy. * * @see nla_validate * @return 0 on success or a negative error code. */ int __wrap_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, struct nla_policy *policy) { struct nlattr *nla; int rem; memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); nla_for_each_attr(nla, head, len, rem) { int type = nla_type(nla); if (type > maxtype) continue; if (tb[type]) pr_warn("Attribute of type %#x found multiple times in message, " "previous attribute is being ignored.\n", type); tb[type] = nla; } if (rem > 0) pr_warn("netlink: %d bytes leftover after parsing " "attributes.\n", rem); return 0; } /** * parse attributes of a netlink message * @arg nlh netlink message header * @arg hdrlen length of family specific header * @arg tb destination array with maxtype+1 elements * @arg maxtype maximum attribute type to be expected * @arg policy validation policy * * See nla_parse() */ int __wrap_nlmsg_parse(struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, struct nla_policy *policy) { if (!nlmsg_valid_hdr(nlh, hdrlen)) return -NLE_MSG_TOOSHORT; return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), policy); } crac-criu-1.5.0/criu/log.c000066400000000000000000000227031471504326700152710ustar00rootroot00000000000000#include #include #include #include #include #include 
#include #include #include #include #include #include #include "page.h" #include "common/compiler.h" #include "util.h" #include "cr_options.h" #include "servicefd.h" #include "rst-malloc.h" #include "common/lock.h" #include "string.h" #include "version.h" #include "../soccr/soccr.h" #include "compel/log.h" #define DEFAULT_LOGFD STDERR_FILENO /* Enable timestamps if verbosity is increased from default */ #define LOG_TIMESTAMP (DEFAULT_LOGLEVEL + 1) #define LOG_BUF_LEN (8 * 1024) #define EARLY_LOG_BUF_LEN 1024 static unsigned int current_loglevel = DEFAULT_LOGLEVEL; static void vprint_on_level(unsigned int, const char *, va_list); static char buffer[LOG_BUF_LEN]; static char buf_off = 0; /* * The early_log_buffer is used to store log messages before * logging is set up to make sure no logs are lost. */ static char early_log_buffer[EARLY_LOG_BUF_LEN]; static unsigned int early_log_buf_off = 0; /* If this is 0 the logging has not been set up yet. */ static int init_done = 0; static struct timeval start; /* * Manual buf len as sprintf will _always_ put '\0' at the end, * but we want a "constant" pid to be there on restore */ #define TS_BUF_OFF 12 static void timediff(struct timeval *from, struct timeval *to) { to->tv_sec -= from->tv_sec; if (to->tv_usec >= from->tv_usec) to->tv_usec -= from->tv_usec; else { to->tv_sec--; to->tv_usec += USEC_PER_SEC - from->tv_usec; } } static void print_ts(void) { struct timeval t; gettimeofday(&t, NULL); timediff(&start, &t); snprintf(buffer, TS_BUF_OFF, "(%02u.%06u", (unsigned)t.tv_sec, (unsigned)t.tv_usec); buffer[TS_BUF_OFF - 2] = ')'; /* this will overwrite the last digit if tv_sec>=100 */ buffer[TS_BUF_OFF - 1] = ' '; /* kill the '\0' produced by snprintf */ } int log_get_fd(void) { int fd = get_service_fd(LOG_FD_OFF); return fd < 0 ? 
DEFAULT_LOGFD : fd; } void log_get_logstart(struct timeval *s) { if (current_loglevel >= LOG_TIMESTAMP) *s = start; else { s->tv_sec = 0; s->tv_usec = 0; } } static void reset_buf_off(void) { if (current_loglevel >= LOG_TIMESTAMP) /* reserve space for a timestamp */ buf_off = TS_BUF_OFF; else buf_off = 0; } /* * Keeping the very first error message for RPC to report back. */ struct str_and_lock { mutex_t l; char s[1024]; }; static struct str_and_lock *first_err; int log_keep_err(void) { first_err = shmalloc(sizeof(struct str_and_lock)); if (first_err == NULL) return -1; mutex_init(&first_err->l); first_err->s[0] = '\0'; return 0; } static void log_note_err(char *msg) { if (first_err && first_err->s[0] == '\0') { /* * In any action other than restore this locking is * actually not required, but ... it's error path * anyway, so it doesn't make much sense to try hard * and optimize this out. */ mutex_lock(&first_err->l); if (first_err->s[0] == '\0') __strlcpy(first_err->s, msg, sizeof(first_err->s)); mutex_unlock(&first_err->l); } } char *log_first_err(void) { if (!first_err) return NULL; if (first_err->s[0] == '\0') return NULL; return first_err->s; } static void print_versions(void) { struct utsname buf; pr_info("Version: %s (gitid %s+1)\n", CRIU_VERSION, CRIU_GITID); if (uname(&buf) < 0) { pr_perror("Reading kernel version failed!"); /* This pretty unlikely, just keep on running. */ return; } pr_info("Running on %s %s %s %s %s\n", buf.nodename, buf.sysname, buf.release, buf.version, buf.machine); } struct early_log_hdr { uint16_t level; uint16_t len; }; void flush_early_log_buffer(int fd) { unsigned int pos = 0; int ret; while (pos < early_log_buf_off) { /* * The early_log_buffer contains all messages written * before logging was set up. We only want to print * out messages which correspond to the requested * log_level. Therefore the early_log_buffer also contains * the log_level and the size. 
This writes one messages, * depending on the log_level, to the logging fd. Start * with reading the log_level. */ struct early_log_hdr *hdr = (void *)early_log_buffer + pos; pos += sizeof(hdr); if (hdr->level <= current_loglevel) { size_t size = 0; while (size < hdr->len) { ret = write(fd, early_log_buffer + pos + size, hdr->len - size); if (ret <= 0) break; size += ret; } } pos += hdr->len; } if (early_log_buf_off == EARLY_LOG_BUF_LEN) pr_warn("The early log buffer is full, some messages may have been lost\n"); early_log_buf_off = 0; } int log_init(const char *output) { int new_logfd, fd; gettimeofday(&start, NULL); reset_buf_off(); if (output && !strncmp(output, "-", 2)) { new_logfd = dup(STDOUT_FILENO); if (new_logfd < 0) { pr_perror("Can't dup stdout stream"); return -1; } } else if (output) { new_logfd = open(output, O_CREAT | O_TRUNC | O_WRONLY | O_APPEND, 0600); if (new_logfd < 0) { pr_perror("Can't create log file %s", output); return -1; } } else { new_logfd = dup(DEFAULT_LOGFD); if (new_logfd < 0) { pr_perror("Can't dup log file"); return -1; } } // The file was opened with effective UID, which is usually root (due to suid bit) // To make debugging more convenient we'll try to change that to real UID. if (fchown(new_logfd, getuid(), -1)) { pr_warn("Cannot change ownership of the log file: %s", strerror(errno)); } fd = install_service_fd(LOG_FD_OFF, new_logfd); if (fd < 0) goto err; init_done = 1; /* * Once logging is setup this write out all early log messages. * Only those messages which have to correct log level are printed. 
*/ flush_early_log_buffer(fd); print_versions(); return 0; err: pr_perror("Log engine failure, can't duplicate descriptor"); return -1; } int log_init_by_pid(pid_t pid) { char path[PATH_MAX]; /* * reset buf_off as this fn is called on each fork while * restoring process tree */ reset_buf_off(); if (!opts.log_file_per_pid) { buf_off += snprintf(buffer + buf_off, sizeof buffer - buf_off, "%6d: ", pid); return 0; } if (!opts.output) return 0; snprintf(path, PATH_MAX, "%s.%d", opts.output, pid); return log_init(path); } void log_fini(void) { close_service_fd(LOG_FD_OFF); } static void soccr_print_on_level(unsigned int loglevel, const char *format, ...) { va_list args; int lv; switch (loglevel) { case SOCCR_LOG_DBG: lv = LOG_DEBUG; break; case SOCCR_LOG_ERR: lv = LOG_ERROR; break; default: lv = LOG_INFO; break; } va_start(args, format); vprint_on_level(lv, format, args); va_end(args); } void log_set_loglevel(unsigned int level) { current_loglevel = level; libsoccr_set_log(level, soccr_print_on_level); compel_log_init(vprint_on_level, level); } unsigned int log_get_loglevel(void) { return current_loglevel; } static void early_vprint(const char *format, unsigned int loglevel, va_list params) { unsigned int log_size = 0; struct early_log_hdr *hdr; if ((early_log_buf_off + sizeof(hdr)) >= EARLY_LOG_BUF_LEN) return; /* Save loglevel */ hdr = (void *)early_log_buffer + early_log_buf_off; hdr->level = loglevel; /* Skip the log entry size */ early_log_buf_off += sizeof(hdr); if (loglevel >= LOG_TIMESTAMP) { /* * If logging is not yet setup we just write zeros * instead of a real timestamp. This way we can * keep the same format as the other messages on * log levels with timestamps (>=LOG_TIMESTAMP). 
*/ log_size = snprintf(early_log_buffer + early_log_buf_off, sizeof(early_log_buffer) - early_log_buf_off, "(00.000000) "); } log_size += vsnprintf(early_log_buffer + early_log_buf_off + log_size, sizeof(early_log_buffer) - early_log_buf_off - log_size, format, params); /* Save log entry size */ hdr->len = log_size; early_log_buf_off += log_size; } static void vprint_on_level(unsigned int loglevel, const char *format, va_list params) { int fd, size, ret, off = 0; int _errno = errno; if (unlikely(loglevel == LOG_MSG)) { fd = STDOUT_FILENO; off = buf_off; /* skip dangling timestamp */ } else { /* * If logging has not yet been initialized (init_done == 0) * make sure all messages are written to the early_log_buffer. */ if (!init_done) { early_vprint(format, loglevel, params); return; } if (loglevel > current_loglevel) return; fd = log_get_fd(); if (current_loglevel >= LOG_TIMESTAMP) print_ts(); } size = vsnprintf(buffer + buf_off, sizeof buffer - buf_off, format, params); size += buf_off; while (off < size) { ret = write(fd, buffer + off, size - off); if (ret <= 0) break; off += ret; } /* This is missing for messages in the early_log_buffer. */ if (loglevel == LOG_ERROR) log_note_err(buffer + buf_off); errno = _errno; } void print_on_level(unsigned int loglevel, const char *format, ...) 
{ va_list params; va_start(params, format); vprint_on_level(loglevel, format, params); va_end(params); } int write_pidfile(int pid) { int fd, ret, exit_code = -1; fd = open(opts.pidfile, O_WRONLY | O_EXCL | O_CREAT, 0600); if (fd == -1) { pr_perror("pidfile: Can't open %s", opts.pidfile); return -1; } ret = dprintf(fd, "%d", pid); if (ret < 0) { pr_perror("pidfile: Can't write pid %d to %s", pid, opts.pidfile); goto close; } if (ret == 0) { pr_err("pidfile: Can't write pid %d to %s\n", pid, opts.pidfile); goto close; } pr_debug("pidfile: Wrote pid %d to %s (%d bytes)\n", pid, opts.pidfile, ret); exit_code = 0; close: close(fd); return exit_code; } crac-criu-1.5.0/criu/lsm.c000066400000000000000000000206761471504326700153120ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "common/config.h" #include "kerndat.h" #include "pstree.h" #include "util.h" #include "cr_options.h" #include "lsm.h" #include "fdstore.h" #include "apparmor.h" #include "protobuf.h" #include "images/inventory.pb-c.h" #include "images/creds.pb-c.h" #include "images/fdinfo.pb-c.h" #ifdef CONFIG_HAS_SELINUX #include #endif static int apparmor_get_label(pid_t pid, char **profile_name) { FILE *f; char *space; f = fopen_proc(pid, "attr/current"); if (!f) return -1; if (fscanf(f, "%ms", profile_name) != 1) { pr_perror("err scanfing"); fclose(f); return -1; } fclose(f); /* * A profile name can be followed by an enforcement mode, e.g. * lxc-default-with-nesting (enforced) * but the profile name is just the part before the space. */ space = strstr(*profile_name, " "); if (space) *space = 0; /* * An "unconfined" value means there is no profile, so we don't need to * worry about trying to restore one. 
*/ if (strcmp(*profile_name, "unconfined") == 0) { free(*profile_name); *profile_name = NULL; } if (*profile_name && collect_aa_namespace(*profile_name) < 0) { free(*profile_name); *profile_name = NULL; pr_err("failed to collect AA namespace\n"); return -1; } return 0; } #ifdef CONFIG_HAS_SELINUX static int verify_selinux_label(char *ctx) { char *pos; int i; /* * There are SELinux setups where SELinux seems to be enabled, * but the returned labels are not really valid. See also * https://github.com/torvalds/linux/blob/master/security/selinux/include/initial_sid_to_string.h * * CRIU tells the user that such labels are invalid * and CRIU expects a SELinux label to contain three ':'. * * A label should look like this: * * unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 */ pos = (char *)ctx; for (i = 0; i < 3; i++) { pos = strstr(pos, ":"); if (!pos) return -1; pos++; } return 0; } static int selinux_get_label(pid_t pid, char **output) { char *ctx; int ret = -1; if (getpidcon_raw(pid, &ctx) < 0) { pr_perror("getting selinux profile failed"); return -1; } if (verify_selinux_label(ctx)) { pr_err("Invalid selinux context %s\n", (char *)ctx); goto err; } *output = xstrdup((char *)ctx); if (!*output) goto err; ret = 0; err: freecon(ctx); return ret; } /* * selinux_get_sockcreate_label reads /proc/PID/attr/sockcreate * to see if the PID has a special label specified for sockets. * Most of the time this will be empty and the process will use * the process context also for sockets. */ static int selinux_get_sockcreate_label(pid_t pid, char **output) { FILE *f; int ret; f = fopen_proc(pid, "attr/sockcreate"); if (!f) return -1; ret = fscanf(f, "%ms", output); if (ret == -1 && errno != 0) { pr_perror("Unable to parse /proc/%d/attr/sockcreate", pid); /* * Only if the error indicator is set it is a real error. * -1 could also be EOF, which would mean that sockcreate * was just empty, which is the most common case. 
*/ fclose(f); return -1; } fclose(f); return 0; } int reset_setsockcreatecon(void) { /* Currently this only works for SELinux. */ if (kdat.lsm != LSMTYPE__SELINUX) return 0; if (setsockcreatecon_raw(NULL)) { pr_perror("Unable to reset socket SELinux context"); return -1; } return 0; } int run_setsockcreatecon(FdinfoEntry *e) { char *ctx = NULL; /* Currently this only works for SELinux. */ if (kdat.lsm != LSMTYPE__SELINUX) return 0; ctx = e->xattr_security_selinux; /* Writing to the FD using fsetxattr() did not work for some reason. */ if (setsockcreatecon_raw(ctx)) { pr_perror("Unable to set the %s socket SELinux context", ctx); return -1; } return 0; } int dump_xattr_security_selinux(int fd, FdinfoEntry *e) { char *ctx = NULL; int len; int ret; /* Currently this only works for SELinux. */ if (kdat.lsm != LSMTYPE__SELINUX) return 0; /* Get the size of the xattr. */ len = fgetxattr(fd, "security.selinux", ctx, 0); if (len == -1) { pr_err("Reading xattr security.selinux from FD %d failed\n", fd); return -1; } ctx = xmalloc(len); if (!ctx) { pr_err("xmalloc to read xattr for FD %d failed\n", fd); return -1; } ret = fgetxattr(fd, "security.selinux", ctx, len); if (len != ret) { pr_err("Reading xattr %s to FD %d failed\n", ctx, fd); return -1; } e->xattr_security_selinux = ctx; return 0; } #endif void kerndat_lsm(void) { if (access(AA_SECURITYFS_PATH, F_OK) == 0) { kdat.lsm = LSMTYPE__APPARMOR; kdat.apparmor_ns_dumping_enabled = check_aa_ns_dumping(); return; } #ifdef CONFIG_HAS_SELINUX if (is_selinux_enabled()) { char *ctx; /* * CRIU used to only check if /sys/fs/selinux is mounted, but that does not * seem to be enough for CRIU's use case. CRIU actually needs to look if * a valid label is returned. 
*/ if (getpidcon_raw(getpid(), &ctx) < 0) goto no_lsm; if (verify_selinux_label(ctx)) { freecon(ctx); goto no_lsm; } kdat.lsm = LSMTYPE__SELINUX; freecon(ctx); return; } no_lsm: #endif kdat.lsm = LSMTYPE__NO_LSM; } Lsmtype host_lsm_type(void) { return kdat.lsm; } static int collect_lsm_profile(pid_t pid, struct thread_lsm *lsm) { int ret; switch (kdat.lsm) { case LSMTYPE__NO_LSM: ret = 0; break; case LSMTYPE__APPARMOR: ret = apparmor_get_label(pid, &lsm->profile); break; #ifdef CONFIG_HAS_SELINUX case LSMTYPE__SELINUX: ret = selinux_get_label(pid, &lsm->profile); if (ret) break; ret = selinux_get_sockcreate_label(pid, &lsm->sockcreate); break; #endif default: BUG(); ret = -1; break; } if (lsm->profile) pr_info("%d has lsm profile %s\n", pid, lsm->profile); if (lsm->sockcreate) pr_info("%d has lsm sockcreate label %s\n", pid, lsm->sockcreate); return ret; } int collect_and_suspend_lsm(void) { struct pstree_item *item; for_each_pstree_item(item) { struct thread_lsm **thread_lsms; int i; thread_lsms = xzalloc((item->nr_threads + 1) * sizeof(thread_lsms)); if (!thread_lsms) return -1; dmpi(item)->thread_lsms = thread_lsms; for (i = 0; i < item->nr_threads; i++) { thread_lsms[i] = xzalloc(sizeof(**thread_lsms)); if (!thread_lsms[i]) return -1; if (collect_lsm_profile(item->threads[i].real, thread_lsms[i]) < 0) return -1; } } /* now, suspend the LSM; this is where code that implements something * like PTRACE_O_SUSPEND_LSM should live. 
*/ switch (kdat.lsm) { case LSMTYPE__APPARMOR: if (suspend_aa() < 0) return -1; break; case LSMTYPE__SELINUX: break; case LSMTYPE__NO_LSM: break; default: pr_debug("don't know how to suspend LSM %d\n", kdat.lsm); } return 0; } int unsuspend_lsm(void) { if (kdat.lsm == LSMTYPE__APPARMOR && unsuspend_aa()) return -1; return 0; } // in inventory.c extern Lsmtype image_lsm; int validate_lsm(char *lsm_profile) { if (image_lsm == LSMTYPE__NO_LSM || image_lsm == kdat.lsm) return 0; /* * This is really only a problem if the processes have actually * specified an LSM profile. If not, we won't restore anything anyway, * so it's fine. */ if (lsm_profile) { pr_err("mismatched lsm types and lsm profile specified\n"); return -1; } return 0; } int render_lsm_profile(char *profile, char **val) { *val = NULL; switch (kdat.lsm) { case LSMTYPE__APPARMOR: return render_aa_profile(val, profile); case LSMTYPE__SELINUX: if (asprintf(val, "%s", profile) < 0) { *val = NULL; return -1; } break; default: pr_err("can't render profile %s for lsmtype %d\n", profile, LSMTYPE__NO_LSM); return -1; } return 0; } int lsm_check_opts(void) { char *aux; if (!opts.lsm_supplied) return 0; aux = strchr(opts.lsm_profile, ':'); if (aux == NULL) { pr_err("invalid argument %s for --lsm-profile\n", opts.lsm_profile); return -1; } *aux = '\0'; aux++; if (strcmp(opts.lsm_profile, "apparmor") == 0) { if (kdat.lsm != LSMTYPE__APPARMOR) { pr_err("apparmor LSM specified but apparmor not supported by kernel\n"); return -1; } SET_CHAR_OPTS(lsm_profile, aux); } else if (strcmp(opts.lsm_profile, "selinux") == 0) { if (kdat.lsm != LSMTYPE__SELINUX) { pr_err("selinux LSM specified but selinux not supported by kernel\n"); return -1; } SET_CHAR_OPTS(lsm_profile, aux); } else if (strcmp(opts.lsm_profile, "none") == 0) { xfree(opts.lsm_profile); opts.lsm_profile = NULL; } else { pr_err("unknown lsm %s\n", opts.lsm_profile); return -1; } return 0; } 
crac-criu-1.5.0/criu/mem.c000066400000000000000000001072471471504326700152750ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "types.h" #include "cr_options.h" #include "servicefd.h" #include "mem.h" #include "parasite-syscall.h" #include "parasite.h" #include "page-pipe.h" #include "page-xfer.h" #include "log.h" #include "kerndat.h" #include "stats.h" #include "vma.h" #include "shmem.h" #include "uffd.h" #include "pstree.h" #include "restorer.h" #include "rst-malloc.h" #include "bitmap.h" #include "sk-packet.h" #include "files-reg.h" #include "pagemap-cache.h" #include "fault-injection.h" #include "prctl.h" #include "compel/infect-util.h" #include "pidfd-store.h" #include "protobuf.h" #include "images/pagemap.pb-c.h" static int task_reset_dirty_track(int pid) { int ret; if (!opts.track_mem) return 0; BUG_ON(!kdat.has_dirty_track); ret = do_task_reset_dirty_track(pid); BUG_ON(ret == 1); return ret; } int do_task_reset_dirty_track(int pid) { int fd, ret; char cmd[] = "4"; pr_info("Reset %d's dirty tracking\n", pid); fd = __open_proc(pid, EACCES, O_RDWR, "clear_refs"); if (fd < 0) return errno == EACCES ? 1 : -1; ret = write(fd, cmd, sizeof(cmd)); if (ret < 0) { if (errno == EINVAL) /* No clear-soft-dirty in kernel */ ret = 1; else { pr_perror("Can't reset %d's dirty memory tracker", pid); ret = -1; } } else { pr_info(" ... 
done\n");
		ret = 0;
	}

	close(fd);
	return ret;
}

/* Worst-case size of the parasite dump-pages args: one iovec per private page. */
unsigned long dump_pages_args_size(struct vm_area_list *vmas)
{
	/* In the worst case I need one iovec for each page */
	return sizeof(struct parasite_dump_pages_args) + vmas->nr * sizeof(struct parasite_vma_entry) +
	       (vmas->nr_priv_pages + 1) * sizeof(struct iovec);
}

/* True when the pagemap entry points at the kernel's shared zero page. */
static inline bool __page_is_zero(u64 pme)
{
	return (pme & PME_PFRAME_MASK) == kdat.zero_page_pfn;
}

static inline bool __page_in_parent(bool dirty)
{
	/*
	 * If we do memory tracking, but w/o parent images,
	 * then we have to dump all memory
	 */
	return opts.track_mem && opts.img_parent && !dirty;
}

/*
 * Decide whether the page described by pagemap entry @pme inside VMA
 * @vmae must be written into the image.
 */
bool should_dump_page(VmaEntry *vmae, u64 pme)
{
	/*
	 * vDSO area must be always dumped because on restore
	 * we might need to generate a proxy.
	 */
	if (vma_entry_is(vmae, VMA_AREA_VDSO))
		return true;
	/*
	 * In turn VVAR area is special and referenced from
	 * vDSO area by IP addressing (at least on x86) thus
	 * never ever dump its content but always use one provided
	 * by the kernel on restore, ie runtime VVAR area must
	 * be remapped into proper place..
	 */
	if (vma_entry_is(vmae, VMA_AREA_VVAR))
		return false;
	/*
	 * Optimisation for private mapping pages, that haven't
	 * yet being COW-ed
	 */
	if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE))
		return false;
	if (vma_entry_is(vmae, VMA_AREA_AIORING))
		return true;
	/* Present or swapped pages are dumped unless they are the zero page. */
	if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme))
		return true;

	return false;
}

bool page_is_zero(u64 pme)
{
	return __page_is_zero(pme);
}

bool page_in_parent(bool dirty)
{
	return __page_in_parent(dirty);
}

/* True if @vaddr lands on the same page as any thread's stack pointer. */
static bool is_stack(struct pstree_item *item, unsigned long vaddr)
{
	int i;

	for (i = 0; i < item->nr_threads; i++) {
		uint64_t sp = dmpi(item)->thread_sp[i];

		/* XOR of the page-offset-masked bits is 0 <=> same page. */
		if (!((sp ^ vaddr) & ~PAGE_MASK))
			return true;
	}

	return false;
}

/*
 * This routine finds out what memory regions to grab from the
 * dumpee. The iovs generated are then fed into vmsplice to
 * put the memory into the page-pipe's pipe.
* * "Holes" in page-pipe are regions, that should be dumped, but * the memory contents is present in the parent image set. */ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off, bool has_parent) { u64 *at = &map[PAGE_PFN(*off)]; unsigned long pfn, nr_to_scan; unsigned long pages[3] = {}; int ret = 0; nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE; for (pfn = 0; pfn < nr_to_scan; pfn++) { unsigned long vaddr; unsigned int ppb_flags = 0; int st; if (!should_dump_page(vma->e, at[pfn])) continue; vaddr = vma->e->start + *off + pfn * PAGE_SIZE; if (vma_entry_can_be_lazy(vma->e) && !is_stack(item, vaddr)) ppb_flags |= PPB_LAZY; /* * If we're doing incremental dump (parent images * specified) and page is not soft-dirty -- we dump * hole and expect the parent images to contain this * page. The latter would be checked in page-xfer. */ if (has_parent && page_in_parent(at[pfn] & PME_SOFT_DIRTY)) { ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); st = 0; } else { ret = page_pipe_add_page(pp, vaddr, ppb_flags); if (ppb_flags & PPB_LAZY && opts.lazy_pages) st = 1; else st = 2; } if (ret) { /* Do not do pfn++, just bail out */ pr_debug("Pagemap full\n"); break; } pages[st]++; } *off += pfn * PAGE_SIZE; cnt_add(CNT_PAGES_SCANNED, nr_to_scan); cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]); cnt_add(CNT_PAGES_LAZY, pages[1]); cnt_add(CNT_PAGES_WRITTEN, pages[2]); pr_info("Pagemap generated: %lu pages (%lu lazy) %lu holes\n", pages[2] + pages[1], pages[1], pages[0]); return ret; } static struct parasite_dump_pages_args * prep_dump_pages_args(struct parasite_ctl *ctl, struct vm_area_list *vma_area_list, bool skip_non_trackable) { struct parasite_dump_pages_args *args; struct parasite_vma_entry *p_vma; struct vma_area *vma; args = compel_parasite_args_s(ctl, dump_pages_args_size(vma_area_list)); p_vma = pargs_vmas(args); args->nr_vmas = 0; list_for_each_entry(vma, &vma_area_list->h, list) { if 
(!vma_area_is_private(vma, kdat.task_size)) continue; /* * Kernel write to aio ring is not soft-dirty tracked, * so we ignore them at pre-dump. */ if (vma_entry_is(vma->e, VMA_AREA_AIORING) && skip_non_trackable) continue; /* * We totally ignore MAP_HUGETLB on pre-dump. * See also generate_vma_iovs() comment. */ if ((vma->e->flags & MAP_HUGETLB) && skip_non_trackable) continue; if (vma->e->prot & PROT_READ) continue; p_vma->start = vma->e->start; p_vma->len = vma_area_len(vma); p_vma->prot = vma->e->prot; args->nr_vmas++; p_vma++; } return args; } static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, struct parasite_dump_pages_args *args) { struct page_pipe_buf *ppb; int ret = 0; debug_show_page_pipe(pp); /* Step 2 -- grab pages into page-pipe */ list_for_each_entry(ppb, &pp->bufs, l) { args->nr_segs = ppb->nr_segs; args->nr_pages = ppb->pages_in; pr_debug("PPB: %d pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, args->off); ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl); if (ret < 0) return -1; ret = compel_util_send_fd(ctl, ppb->p[1]); if (ret) return -1; ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl); if (ret < 0) return -1; args->off += args->nr_segs; } return 0; } static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer) { int ret; /* * Step 3 -- write pages into image (or delay writing for * pre-dump action (see pre_dump_one_task) */ timing_start(TIME_MEMWRITE); ret = page_xfer_dump_pages(xfer, pp); timing_stop(TIME_MEMWRITE); return ret; } static int detect_pid_reuse(struct pstree_item *item, struct proc_pid_stat *pps, InventoryEntry *parent_ie) { unsigned long long dump_ticks; struct proc_pid_stat pps_buf; unsigned long long tps; /* ticks per second */ int ret; /* Check pid reuse using pidfds */ if (pidfd_store_ready()) return pidfd_store_check_pid_reuse(item->pid->real); if (!parent_ie) { pr_err("Pid-reuse detection failed: no parent inventory, " "check warnings in get_parent_inventory\n"); 
return -1; } tps = sysconf(_SC_CLK_TCK); if (tps == -1) { pr_perror("Failed to get clock ticks via sysconf"); return -1; } if (!pps) { pps = &pps_buf; ret = parse_pid_stat(item->pid->real, pps); if (ret < 0) return -1; } dump_ticks = parent_ie->dump_uptime / (USEC_PER_SEC / tps); if (pps->start_time >= dump_ticks) { /* Print "*" if unsure */ pr_warn("Pid reuse%s detected for pid %d\n", pps->start_time == dump_ticks ? "*" : "", item->pid->real); return 1; } return 0; } static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, struct page_xfer *xfer, struct parasite_dump_pages_args *args, struct parasite_ctl *ctl, pmc_t *pmc, bool has_parent, bool pre_dump, int parent_predump_mode) { u64 off = 0; u64 *map; int ret; if (!vma_area_is_private(vma, kdat.task_size) && !vma_area_is(vma, VMA_ANON_SHARED)) return 0; /* * To facilitate any combination of pre-dump modes to run after * one another, we need to take extra care as discussed below. * * The SPLICE mode pre-dump, processes all type of memory regions, * whereas READ mode pre-dump skips processing those memory regions * which lacks PROT_READ flag. * * Now on mixing pre-dump modes: * If SPLICE mode follows SPLICE mode : no issue * -> everything dumped both the times * * If READ mode follows READ mode : no issue * -> non-PROT_READ skipped both the time * * If READ mode follows SPLICE mode : no issue * -> everything dumped at first, * the non-PROT_READ skipped later * * If SPLICE mode follows READ mode : Need special care * * If READ pre-dump happens first, then it has skipped processing * non-PROT_READ regions. Following SPLICE pre-dump expects pagemap * entries for all mappings in parent pagemap, but last READ mode * pre-dump cycle has skipped processing & pagemap generation for * non-PROT_READ regions. So SPLICE mode throws error of missing * pagemap entry for encountered non-PROT_READ mapping. 
* * To resolve this, the pre-dump-mode is stored in current pre-dump's * inventoy file. This pre-dump mode is read back from this file * (present in parent pre-dump dir) as parent-pre-dump-mode during * next pre-dump. * * If parent-pre-dump-mode and next-pre-dump-mode are in READ-mode -> * SPLICE-mode order, then SPLICE mode doesn't expect mappings for * non-PROT_READ regions in parent-image and marks "has_parent=false". */ if (!(vma->e->prot & PROT_READ)) { if (opts.pre_dump_mode == PRE_DUMP_READ && pre_dump) return 0; if ((parent_predump_mode == PRE_DUMP_READ && opts.pre_dump_mode == PRE_DUMP_SPLICE) || !pre_dump) has_parent = false; } /* * We want to completely ignore these VMA types on the pre-dump: * 1. VMA_AREA_AIORING because it is not soft-dirty trackable (kernel writes) * 2. MAP_HUGETLB mappings because they are not premapped and we can't use * parent images from pre-dump stages. Instead, the content is restored from * the parasite context using full memory image. */ if (vma_entry_is(vma->e, VMA_AREA_AIORING) || vma->e->flags & MAP_HUGETLB) { if (pre_dump) return 0; has_parent = false; } map = pmc_get_map(pmc, vma); if (!map) return -1; if (vma_area_is(vma, VMA_ANON_SHARED)) return add_shmem_area(item->pid->real, vma->e, map); again: ret = generate_iovs(item, vma, pp, map, &off, has_parent); if (ret == -EAGAIN) { BUG_ON(!(pp->flags & PP_CHUNK_MODE)); ret = drain_pages(pp, ctl, args); if (!ret) ret = xfer_pages(pp, xfer); if (!ret) { page_pipe_reinit(pp); goto again; } } return ret; } static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasite_dump_pages_args *args, struct vm_area_list *vma_area_list, struct mem_dump_ctl *mdc, struct parasite_ctl *ctl) { pmc_t pmc = PMC_INIT; struct page_pipe *pp; struct vma_area *vma_area; struct page_xfer xfer = { .parent = NULL }; int ret, exit_code = -1; unsigned cpp_flags = 0; unsigned long pmc_size; int possible_pid_reuse = 0; bool has_parent; int parent_predump_mode = -1; pr_info("\n"); 
pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real); pr_info("----------------------------------------\n"); timing_start(TIME_MEMDUMP); pr_debug(" Private vmas %lu/%lu pages\n", vma_area_list->nr_priv_pages_longest, vma_area_list->nr_priv_pages); /* * Step 0 -- prepare */ pmc_size = max(vma_area_list->nr_priv_pages_longest, vma_area_list->nr_shared_pages_longest); if (pmc_init(&pmc, item->pid->real, &vma_area_list->h, pmc_size * PAGE_SIZE)) return -1; if (!(mdc->pre_dump || mdc->lazy)) /* * Chunk mode pushes pages portion by portion. This mode * only works when we don't need to keep pp for later * use, i.e. on non-lazy non-predump. */ cpp_flags |= PP_CHUNK_MODE; pp = create_page_pipe(vma_area_list->nr_priv_pages, mdc->lazy ? NULL : pargs_iovs(args), cpp_flags); if (!pp) goto out; if (!mdc->pre_dump) { /* * Regular dump -- create xfer object and send pages to it * right here. For pre-dumps the pp will be taken by the * caller and handled later. */ ret = open_page_xfer(&xfer, CR_FD_PAGEMAP, vpid(item)); if (ret < 0) goto out_pp; xfer.transfer_lazy = !mdc->lazy; } else { ret = check_parent_page_xfer(CR_FD_PAGEMAP, vpid(item)); if (ret < 0) goto out_pp; if (ret) xfer.parent = NULL + 1; } if (xfer.parent) { possible_pid_reuse = detect_pid_reuse(item, mdc->stat, mdc->parent_ie); if (possible_pid_reuse == -1) goto out_xfer; } /* * Step 1 -- generate the pagemap */ args->off = 0; has_parent = !!xfer.parent && !possible_pid_reuse; if (mdc->parent_ie) parent_predump_mode = mdc->parent_ie->pre_dump_mode; list_for_each_entry(vma_area, &vma_area_list->h, list) { ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, &pmc, has_parent, mdc->pre_dump, parent_predump_mode); if (ret < 0) goto out_xfer; } if (mdc->lazy) memcpy(pargs_iovs(args), pp->iovs, sizeof(struct iovec) * pp->nr_iovs); /* * Faking drain_pages for pre-dump here. Actual drain_pages for pre-dump * will happen after task unfreezing in cr_pre_dump_finish(). 
This is * actual optimization which reduces time for which process was frozen * during pre-dump. */ if (mdc->pre_dump && opts.pre_dump_mode == PRE_DUMP_READ) ret = 0; else ret = drain_pages(pp, ctl, args); if (!ret && !mdc->pre_dump) ret = xfer_pages(pp, &xfer); if (ret) goto out_xfer; timing_stop(TIME_MEMDUMP); /* * Step 4 -- clean up */ ret = task_reset_dirty_track(item->pid->real); if (ret) goto out_xfer; exit_code = 0; out_xfer: if (!mdc->pre_dump) xfer.close(&xfer); out_pp: if (ret || !(mdc->pre_dump || mdc->lazy)) destroy_page_pipe(pp); else dmpi(item)->mem_pp = pp; out: pmc_fini(&pmc); pr_info("----------------------------------------\n"); return exit_code; } int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vma_area_list, struct mem_dump_ctl *mdc, struct parasite_ctl *ctl) { int ret; struct parasite_dump_pages_args *pargs; pargs = prep_dump_pages_args(ctl, vma_area_list, mdc->pre_dump); /* * Add PROT_READ protection for all VMAs we're about to * dump if they don't have one. Otherwise we'll not be * able to read the memory contents. * * Afterwards -- reprotect memory back. * * This step is required for "splice" mode pre-dump and dump. * Skip this step for "read" mode pre-dump. * "read" mode pre-dump delegates processing of non-PROT_READ * regions to dump stage. Adding PROT_READ works fine for * static processing (target process frozen during pre-dump) * and fails for dynamic as explained below. * * Consider following sequence of instances to reason, why * not to add PROT_READ in "read" mode pre-dump ? * * CRIU- "read" pre-dump Target Process * * 1. Creates mapping M * without PROT_READ * 2. CRIU freezes target * process * 3. Collect the mappings * 4. Add PROT_READ to M * (non-PROT_READ region) * 5. CRIU unfreezes target * process * 6. Add flag PROT_READ * to mapping M * 7. Revoke flag PROT_READ * from mapping M * 8. process_vm_readv tries * to copy mapping M * (believing M have * PROT_READ flag) * 9. 
syscall fails to copy * data from M */ if (!mdc->pre_dump || opts.pre_dump_mode == PRE_DUMP_SPLICE) { pargs->add_prot = PROT_READ; ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl); if (ret) { pr_err("Can't dump unprotect vmas with parasite\n"); return ret; } } if (fault_injected(FI_DUMP_PAGES)) { pr_err("fault: Dump VMA pages failure!\n"); return -1; } ret = __parasite_dump_pages_seized(item, pargs, vma_area_list, mdc, ctl); if (ret) { pr_err("Can't dump page with parasite\n"); /* Parasite will unprotect VMAs after fail in fini() */ return ret; } if (!mdc->pre_dump || opts.pre_dump_mode == PRE_DUMP_SPLICE) { pargs->add_prot = 0; if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) { pr_err("Can't rollback unprotected vmas with parasite\n"); ret = -1; } } return ret; } int prepare_mm_pid(struct pstree_item *i) { pid_t pid = vpid(i); int ret = -1, vn = 0; struct cr_img *img; struct rst_info *ri = rsti(i); img = open_image(CR_FD_MM, O_RSTR, pid); if (!img) return -1; ret = pb_read_one_eof(img, &ri->mm, PB_MM); close_image(img); if (ret <= 0) return ret; if (collect_special_file(ri->mm->exe_file_id) == NULL) return -1; pr_debug("Found %zd VMAs in image\n", ri->mm->n_vmas); img = NULL; if (ri->mm->n_vmas == 0) { /* * Old image. 
Read VMAs from vma-.img */ img = open_image(CR_FD_VMAS, O_RSTR, pid); if (!img) return -1; } while (vn < ri->mm->n_vmas || img != NULL) { struct vma_area *vma; ret = -1; vma = alloc_vma_area(); if (!vma) break; ri->vmas.nr++; if (!img) vma->e = ri->mm->vmas[vn++]; else { ret = pb_read_one_eof(img, &vma->e, PB_VMA); if (ret <= 0) { xfree(vma); close_image(img); img = NULL; break; } } list_add_tail(&vma->list, &ri->vmas.h); if (vma_area_is_private(vma, kdat.task_size)) { ri->vmas.rst_priv_size += vma_area_len(vma); if (vma_has_guard_gap_hidden(vma)) ri->vmas.rst_priv_size += PAGE_SIZE; } pr_info("vma 0x%" PRIx64 " 0x%" PRIx64 "\n", vma->e->start, vma->e->end); if (vma_area_is(vma, VMA_ANON_SHARED)) ret = collect_shmem(pid, vma); else if (vma_area_is(vma, VMA_FILE_PRIVATE) || vma_area_is(vma, VMA_FILE_SHARED)) ret = collect_filemap(vma); else if (vma_area_is(vma, VMA_AREA_SOCKET)) ret = collect_socket_map(vma); else ret = 0; if (ret) break; } if (img) close_image(img); return ret; } static inline bool check_cow_vmas(struct vma_area *vma, struct vma_area *pvma) { /* * VMAs that _may_[1] have COW-ed pages should ... * * [1] I say "may" because whether or not particular pages are * COW-ed is determined later in restore_priv_vma_content() by * memcmp'aring the contents. */ /* ... coincide by start/stop pair (start is checked by caller) */ if (vma->e->end != pvma->e->end) return false; /* ... both be private (and thus have space in premmaped area) */ if (!vma_area_is_private(vma, kdat.task_size)) return false; if (!vma_area_is_private(pvma, kdat.task_size)) return false; /* ... but not hugetlb mappings */ if (vma->e->flags & MAP_HUGETLB || pvma->e->flags & MAP_HUGETLB) return false; /* ... have growsdown and anon flags coincide */ if ((vma->e->flags ^ pvma->e->flags) & (MAP_GROWSDOWN | MAP_ANONYMOUS)) return false; /* ... 
belong to the same file if being filemap */ if (!(vma->e->flags & MAP_ANONYMOUS) && vma->e->shmid != pvma->e->shmid) return false; pr_debug("Found two COW VMAs @0x%" PRIx64 "-0x%" PRIx64 "\n", vma->e->start, pvma->e->end); return true; } static inline bool vma_inherited(struct vma_area *vma) { return (vma->pvma != NULL && vma->pvma != VMA_COW_ROOT); } static void prepare_cow_vmas_for(struct vm_area_list *vmas, struct vm_area_list *pvmas) { struct vma_area *vma, *pvma; vma = list_first_entry(&vmas->h, struct vma_area, list); pvma = list_first_entry(&pvmas->h, struct vma_area, list); while (1) { if ((vma->e->start == pvma->e->start) && check_cow_vmas(vma, pvma)) { vma->pvma = pvma; if (pvma->pvma == NULL) pvma->pvma = VMA_COW_ROOT; } /* <= here to shift from matching VMAs and ... */ while (vma->e->start <= pvma->e->start) { vma = vma_next(vma); if (&vma->list == &vmas->h) return; } /* ... no == here since we must stop on matching pair */ while (pvma->e->start < vma->e->start) { pvma = vma_next(pvma); if (&pvma->list == &pvmas->h) return; } } } void prepare_cow_vmas(void) { struct pstree_item *pi; for_each_pstree_item(pi) { struct pstree_item *ppi; struct vm_area_list *vmas, *pvmas; ppi = pi->parent; if (!ppi) continue; vmas = &rsti(pi)->vmas; if (vmas->nr == 0) /* Zombie */ continue; pvmas = &rsti(ppi)->vmas; if (pvmas->nr == 0) /* zombies cannot have kids, * but helpers can (and do) */ continue; if (rsti(pi)->mm->exe_file_id != rsti(ppi)->mm->exe_file_id) /* * Tasks running different executables have * close to zero chance of having cow-ed areas * and actually kernel never creates such. 
*/ continue; prepare_cow_vmas_for(vmas, pvmas); } } /* Map a private vma, if it is not mapped by a parent yet */ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void **tgt_addr) { int ret; void *addr; unsigned long nr_pages, size; nr_pages = vma_entry_len(vma->e) / PAGE_SIZE; vma->page_bitmap = xzalloc(BITS_TO_LONGS(nr_pages) * sizeof(long)); if (vma->page_bitmap == NULL) return -1; /* * A grow-down VMA has a guard page, which protect a VMA below it. * So one more page is mapped here to restore content of the first page */ if (vma_has_guard_gap_hidden(vma)) vma->e->start -= PAGE_SIZE; size = vma_entry_len(vma->e); if (!vma_inherited(vma)) { int flag = 0; /* * The respective memory area was NOT found in the parent. * Map a new one. */ /* * Restore AIO ring buffer content to temporary anonymous area. * This will be placed in io_setup'ed AIO in restore_aio_ring(). */ if (vma_entry_is(vma->e, VMA_AREA_AIORING)) flag |= MAP_ANONYMOUS; else if (vma_area_is(vma, VMA_FILE_PRIVATE)) { ret = vma->vm_open(vpid(t), vma); if (ret < 0) { pr_err("Can't fixup VMA's fd\n"); return -1; } } /* * All mappings here get PROT_WRITE regardless of whether we * put any data into it or not, because this area will get * mremap()-ed (branch below) so we MIGHT need to have WRITE * bits there. Ideally we'd check for the whole COW-chain * having any data in. */ addr = mmap(*tgt_addr, size, vma->e->prot | PROT_WRITE, vma->e->flags | MAP_FIXED | flag, vma->e->fd, vma->e->pgoff); if (addr == MAP_FAILED) { pr_perror("Unable to map ANON_VMA"); return -1; } } else { void *paddr; /* * The area in question can be COWed with the parent. Remap the * parent area. Note, that it has already being passed through * the restore_priv_vma_content() call and thus may have some * pages in it. 
*/ paddr = decode_pointer(vma->pvma->premmaped_addr); if (vma_has_guard_gap_hidden(vma)) paddr -= PAGE_SIZE; addr = mremap(paddr, size, size, MREMAP_FIXED | MREMAP_MAYMOVE, *tgt_addr); if (addr != *tgt_addr) { pr_perror("Unable to remap a private vma"); return -1; } } vma->e->status |= VMA_PREMMAPED; vma->premmaped_addr = (unsigned long)addr; pr_debug("\tpremap %#016" PRIx64 "-%#016" PRIx64 " -> %016lx\n", vma->e->start, vma->e->end, (unsigned long)addr); if (vma_has_guard_gap_hidden(vma)) { /* Skip guard page */ vma->e->start += PAGE_SIZE; vma->premmaped_addr += PAGE_SIZE; } if (vma_area_is(vma, VMA_FILE_PRIVATE)) vma->vm_open = NULL; /* prevent from 2nd open in prepare_vmas */ *tgt_addr += size; return 0; } static inline bool vma_force_premap(struct vma_area *vma, struct list_head *head) { /* * On kernels with 4K guard pages, growsdown VMAs * always have one guard page at the * beginning and sometimes this page contains data. * In case the VMA is premmaped, we premmap one page * larger VMA. In case of in place restore we can only * do this if the VMA in question is not "guarded" by * some other VMA. 
*/ if (vma->e->flags & MAP_GROWSDOWN) { if (vma->list.prev != head) { struct vma_area *prev; prev = list_entry(vma->list.prev, struct vma_area, list); if (prev->e->end == vma->e->start) { pr_debug("Force premmap for 0x%" PRIx64 ":0x%" PRIx64 "\n", vma->e->start, vma->e->end); return true; } } } return false; } /* * Ensure for s390x that vma is below task size on restore system */ static int task_size_check(pid_t pid, VmaEntry *entry) { #ifdef __s390x__ if (entry->end <= kdat.task_size) return 0; pr_err("Can't restore high memory region %lx-%lx because kernel does only support vmas up to %lx\n", entry->start, entry->end, kdat.task_size); return -1; #else return 0; #endif } static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, void **at, struct page_read *pr) { struct vma_area *vma; unsigned long pstart = 0; int ret = 0; LIST_HEAD(empty); filemap_ctx_init(true); list_for_each_entry(vma, &vmas->h, list) { if (task_size_check(vpid(t), vma->e)) { ret = -1; break; } if (pstart > vma->e->start) { ret = -1; pr_err("VMA-s are not sorted in the image file\n"); break; } pstart = vma->e->start; if (!vma_area_is_private(vma, kdat.task_size)) continue; if (vma->e->flags & MAP_HUGETLB) continue; /* VMA offset may change due to plugin so we cannot premap */ if (vma->e->status & VMA_EXT_PLUGIN) continue; if (vma->pvma == NULL && pr->pieok && !vma_force_premap(vma, &vmas->h)) { /* * VMA in question is not shared with anyone. We'll * restore it with its contents in restorer. * Now let's check whether we need to map it with * PROT_WRITE or not. 
*/ do { if (pr->pe->vaddr + pr->pe->nr_pages * PAGE_SIZE <= vma->e->start) continue; if (pr->pe->vaddr > vma->e->end) vma->e->status |= VMA_NO_PROT_WRITE; break; } while (pr->advance(pr)); continue; } ret = premap_private_vma(t, vma, at); if (ret < 0) break; } filemap_ctx_fini(); return ret; } static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) { struct vma_area *vma; int ret = 0; struct list_head *vmas = &rsti(t)->vmas.h; struct list_head *vma_io = &rsti(t)->vma_io; unsigned int nr_restored = 0; unsigned int nr_shared = 0; unsigned int nr_dropped = 0; unsigned int nr_compared = 0; unsigned int nr_lazy = 0; unsigned long va; vma = list_first_entry(vmas, struct vma_area, list); rsti(t)->pages_img_id = pr->pages_img_id; /* * Read page contents. */ while (1) { unsigned long off, i, nr_pages; ret = pr->advance(pr); if (ret <= 0) break; va = (unsigned long)decode_pointer(pr->pe->vaddr); nr_pages = pr->pe->nr_pages; /* * This means that userfaultfd is used to load the pages * on demand. */ if (opts.lazy_pages && pagemap_lazy(pr->pe)) { pr_debug("Lazy restore skips %ld pages at %lx\n", nr_pages, va); pr->skip_pages(pr, nr_pages * PAGE_SIZE); nr_lazy += nr_pages; continue; } for (i = 0; i < nr_pages; i++) { unsigned char buf[PAGE_SIZE]; void *p; /* * The lookup is over *all* possible VMAs * read from image file. */ while (va >= vma->e->end) { if (vma->list.next == vmas) goto err_addr; vma = vma_next(vma); } /* * Make sure the page address is inside existing VMA * and the VMA it refers to still private one, since * there is no guarantee that the data from pagemap is * valid. 
*/ if (va < vma->e->start) goto err_addr; else if (unlikely(!vma_area_is_private(vma, kdat.task_size))) { pr_err("Trying to restore page for non-private VMA\n"); goto err_addr; } if (!vma_area_is(vma, VMA_PREMMAPED)) { unsigned long len = min_t(unsigned long, (nr_pages - i) * PAGE_SIZE, vma->e->end - va); if (vma->e->status & VMA_NO_PROT_WRITE) { pr_debug("VMA 0x%" PRIx64 ":0x%" PRIx64 " RO %#lx:%lu IO\n", vma->e->start, vma->e->end, va, nr_pages); BUG(); } if (pagemap_enqueue_iovec(pr, (void *)va, len, vma_io)) return -1; pr->skip_pages(pr, len); va += len; len >>= PAGE_SHIFT; nr_restored += len; i += len - 1; pr_debug("Enqueue page-read\n"); continue; } /* * Otherwise to the COW restore */ off = (va - vma->e->start) / PAGE_SIZE; p = decode_pointer((off)*PAGE_SIZE + vma->premmaped_addr); set_bit(off, vma->page_bitmap); if (vma_inherited(vma)) { clear_bit(off, vma->pvma->page_bitmap); ret = pr->read_pages(pr, va, 1, buf, 0); if (ret < 0) goto err_read; va += PAGE_SIZE; nr_compared++; if (memcmp(p, buf, PAGE_SIZE) == 0) { nr_shared++; /* the page is cowed */ continue; } nr_restored++; memcpy(p, buf, PAGE_SIZE); } else { int nr; /* * Try to read as many pages as possible at once. * * Within the t pagemap we still have * nr_pages - i pages (not all, as we might have * switched VMA above), within the t VMA * we have at most (vma->end - t_addr) bytes. 
*/ nr = min_t(int, nr_pages - i, (vma->e->end - va) / PAGE_SIZE); ret = pr->read_pages(pr, va, nr, p, PR_ASYNC); if (ret < 0) goto err_read; va += nr * PAGE_SIZE; nr_restored += nr; i += nr - 1; bitmap_set(vma->page_bitmap, off + 1, nr - 1); } } } err_read: if (pr->sync(pr)) return -1; pr->close(pr); if (ret < 0) return ret; /* Remove pages, which were not shared with a child */ list_for_each_entry(vma, vmas, list) { unsigned long size, i = 0; void *addr = decode_pointer(vma->premmaped_addr); if (!vma_inherited(vma)) continue; size = vma_entry_len(vma->e) / PAGE_SIZE; while (1) { /* Find all pages, which are not shared with this child */ i = find_next_bit(vma->pvma->page_bitmap, size, i); if (i >= size) break; ret = madvise(addr + PAGE_SIZE * i, PAGE_SIZE, MADV_DONTNEED); if (ret < 0) { pr_perror("madvise failed"); return -1; } i++; nr_dropped++; } } cnt_add(CNT_PAGES_COMPARED, nr_compared); cnt_add(CNT_PAGES_SKIPPED_COW, nr_shared); cnt_add(CNT_PAGES_RESTORED, nr_restored); pr_info("nr_restored_pages: %d\n", nr_restored); pr_info("nr_shared_pages: %d\n", nr_shared); pr_info("nr_dropped_pages: %d\n", nr_dropped); pr_info("nr_lazy: %d\n", nr_lazy); return 0; err_addr: pr_err("Page entry address %lx outside of VMA %lx-%lx\n", va, (long)vma->e->start, (long)vma->e->end); return -1; } static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) { /* * There is no need to disable it if the page read doesn't * have parent. In this case VMA will be empty until * userfaultfd_register, so there would be no pages to * collapse. And, once we register the VMA with uffd, * khugepaged will skip it. */ if (!(opts.lazy_pages && page_read_has_parent(pr))) return 0; if (!kdat.has_thp_disable) pr_warn("Disabling transparent huge pages. 
" "It may affect performance!\n"); /* * temporarily disable THP to avoid collapse of pages * in the areas that will be monitored by uffd */ if (prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0)) { pr_perror("Cannot disable THP"); return -1; } return 0; } int prepare_mappings(struct pstree_item *t) { int ret = 0; void *addr; struct vm_area_list *vmas; struct page_read pr; void *old_premmapped_addr = NULL; unsigned long old_premmapped_len; vmas = &rsti(t)->vmas; if (vmas->nr == 0) /* Zombie */ goto out; /* Reserve a place for mapping private vma-s one by one */ addr = mmap(NULL, vmas->rst_priv_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (addr == MAP_FAILED) { ret = -1; pr_perror("Unable to reserve memory (%lu bytes)", vmas->rst_priv_size); goto out; } old_premmapped_addr = rsti(t)->premmapped_addr; old_premmapped_len = rsti(t)->premmapped_len; rsti(t)->premmapped_addr = addr; rsti(t)->premmapped_len = vmas->rst_priv_size; ret = open_page_read(vpid(t), &pr, PR_TASK); if (ret <= 0) return -1; if (maybe_disable_thp(t, &pr)) return -1; pr.advance(&pr); /* shift to the 1st iovec */ ret = premap_priv_vmas(t, vmas, &addr, &pr); if (ret < 0) goto out; pr.reset(&pr); ret = restore_priv_vma_content(t, &pr); if (ret < 0) goto out; if (old_premmapped_addr) { ret = munmap(old_premmapped_addr, old_premmapped_len); if (ret < 0) pr_perror("Unable to unmap %p(%lx)", old_premmapped_addr, old_premmapped_len); } /* * Not all VMAs were premmaped. Find out the unused tail of the * premapped area and unmap it. 
*/ old_premmapped_len = addr - rsti(t)->premmapped_addr; if (old_premmapped_len < rsti(t)->premmapped_len) { unsigned long tail; tail = rsti(t)->premmapped_len - old_premmapped_len; ret = munmap(addr, tail); if (ret < 0) pr_perror("Unable to unmap %p(%lx)", addr, tail); rsti(t)->premmapped_len = old_premmapped_len; pr_info("Shrunk premap area to %p(%lx)\n", rsti(t)->premmapped_addr, rsti(t)->premmapped_len); } out: return ret; } bool vma_has_guard_gap_hidden(struct vma_area *vma) { return kdat.stack_guard_gap_hidden && (vma->e->flags & MAP_GROWSDOWN); } /* * A guard page must be unmapped after restoring content and * forking children to restore COW memory. */ int unmap_guard_pages(struct pstree_item *t) { struct vma_area *vma; struct list_head *vmas = &rsti(t)->vmas.h; if (!kdat.stack_guard_gap_hidden) return 0; list_for_each_entry(vma, vmas, list) { if (!vma_area_is(vma, VMA_PREMMAPED)) continue; if (vma->e->flags & MAP_GROWSDOWN) { void *addr = decode_pointer(vma->premmaped_addr); if (munmap(addr - PAGE_SIZE, PAGE_SIZE)) { pr_perror("Can't unmap guard page"); return -1; } } } return 0; } int open_vmas(struct pstree_item *t) { int pid = vpid(t); struct vma_area *vma; struct vm_area_list *vmas = &rsti(t)->vmas; filemap_ctx_init(false); list_for_each_entry(vma, &vmas->h, list) { if (!vma_area_is(vma, VMA_AREA_REGULAR) || !vma->vm_open) continue; pr_info("Opening %#016" PRIx64 "-%#016" PRIx64 " %#016" PRIx64 " (%x) vma\n", vma->e->start, vma->e->end, vma->e->pgoff, vma->e->status); if (vma->vm_open(pid, vma)) { pr_err("`- Can't open vma\n"); return -1; } /* * File mappings have vm_open set to open_filemap which, in * turn, puts the VMA_CLOSE bit itself. 
For all the rest we * need to put it by hands, so that the restorer closes the fd */ if (!(vma_area_is(vma, VMA_FILE_PRIVATE) || vma_area_is(vma, VMA_FILE_SHARED))) vma->e->status |= VMA_CLOSE; } filemap_ctx_fini(); return 0; } static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta) { struct cr_img *pages; /* * We optimize the case when rsti(t)->vma_io is empty. * * This is useful when using the image streamer, where all VMAs are * premapped (pr->pieok is false). This avoids re-opening the * CR_FD_PAGES file, which may only be readable only once. */ if (list_empty(&rsti(t)->vma_io)) { ta->vma_ios = NULL; ta->vma_ios_n = 0; ta->vma_ios_fd = -1; return 0; } /* * If auto-dedup is on we need RDWR mode to be able to punch holes in * the input files (in restorer.c) */ if (opts.compress) { pages = open_image(CR_FD_PAGES_COMP, opts.auto_dedup ? O_RDWR : O_RSTR, rsti(t)->pages_img_id); } else { pages = open_image(CR_FD_PAGES, opts.auto_dedup ? O_RDWR : O_RSTR, rsti(t)->pages_img_id); } if (!pages) return -1; ta->vma_ios_fd = img_raw_fd(pages); return pagemap_render_iovec(&rsti(t)->vma_io, ta); } int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta) { struct vma_area *vma; struct vm_area_list *vmas = &rsti(t)->vmas; ta->vmas = (VmaEntry *)rst_mem_align_cpos(RM_PRIVATE); ta->vmas_n = vmas->nr; list_for_each_entry(vma, &vmas->h, list) { VmaEntry *vme; vme = rst_mem_alloc(sizeof(*vme), RM_PRIVATE); if (!vme) return -1; /* * Copy VMAs to private rst memory so that it's able to * walk them and m(un|re)map. 
*/ *vme = *vma->e; if (vma_area_is(vma, VMA_PREMMAPED)) vma_premmaped_start(vme) = vma->premmaped_addr; } return prepare_vma_ios(t, ta); } crac-criu-1.5.0/criu/memfd.c000066400000000000000000000253711471504326700156040ustar00rootroot00000000000000#include #include #include "common/compiler.h" #include "common/lock.h" #include "memfd.h" #include "fdinfo.h" #include "imgset.h" #include "image.h" #include "util.h" #include "log.h" #include "files.h" #include "fs-magic.h" #include "kerndat.h" #include "files-reg.h" #include "rst-malloc.h" #include "fdstore.h" #include "file-ids.h" #include "namespaces.h" #include "shmem.h" #include "hugetlb.h" #include "protobuf.h" #include "images/memfd.pb-c.h" #define MEMFD_PREFIX "/memfd:" #define MEMFD_PREFIX_LEN (sizeof(MEMFD_PREFIX) - 1) #define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ #define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ #define F_SEAL_GROW 0x0004 /* prevent file from growing */ #define F_SEAL_WRITE 0x0008 /* prevent writes */ /* Linux 5.1+ */ #define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ struct memfd_dump_inode { struct list_head list; u32 id; u32 dev; u32 ino; }; struct memfd_restore_inode { struct list_head list; mutex_t lock; int fdstore_id; unsigned int pending_seals; MemfdInodeEntry *mie; bool was_opened_rw; }; static LIST_HEAD(memfd_inodes); /* * Dump only */ static u32 memfd_inode_ids = 1; int is_memfd(dev_t dev) { return dev == kdat.shmem_dev; } static int dump_memfd_inode(int fd, struct memfd_dump_inode *inode, const char *name, const struct stat *st) { MemfdInodeEntry mie = MEMFD_INODE_ENTRY__INIT; int ret = -1, flag; u32 shmid; /* * shmids are chosen as the inode number of the corresponding mmapped * file. See handle_vma() in proc_parse.c. * It works for memfd too, because we share the same device as the * shmem device. 
*/ shmid = inode->ino; pr_info("Dumping memfd:%s contents (id %#x, shmid: %#x, size: %" PRIu64 ")\n", name, inode->id, shmid, st->st_size); if (dump_one_memfd_shmem(fd, shmid, st->st_size) < 0) goto out; mie.inode_id = inode->id; mie.uid = userns_uid(st->st_uid); mie.gid = userns_gid(st->st_gid); mie.name = (char *)name; mie.size = st->st_size; mie.shmid = shmid; if (is_hugetlb_dev(inode->dev, &flag)) { mie.has_hugetlb_flag = true; mie.hugetlb_flag = flag | MFD_HUGETLB; } mie.mode = st->st_mode; mie.has_mode = true; mie.seals = fcntl(fd, F_GET_SEALS); if (mie.seals == -1) { if (errno != EINVAL || ~mie.hugetlb_flag & MFD_HUGETLB) { pr_perror("fcntl(F_GET_SEALS)"); goto out; } /* Kernels before 4.16 don't allow MFD_HUGETLB | * MFD_ALLOW_SEALING and return EINVAL for * fcntl(MFD_HUGETLB-enabled fd). */ mie.seals = F_SEAL_SEAL; } if (pb_write_one(img_from_set(glob_imgset, CR_FD_MEMFD_INODE), &mie, PB_MEMFD_INODE)) goto out; ret = 0; out: return ret; } static struct memfd_dump_inode *dump_unique_memfd_inode(int lfd, const char *name, const struct stat *st) { struct memfd_dump_inode *inode; int fd; list_for_each_entry(inode, &memfd_inodes, list) if ((inode->dev == st->st_dev) && (inode->ino == st->st_ino)) return inode; inode = xmalloc(sizeof(*inode)); if (inode == NULL) return NULL; inode->dev = st->st_dev; inode->ino = st->st_ino; inode->id = memfd_inode_ids++; fd = open_proc(PROC_SELF, "fd/%d", lfd); if (fd < 0) { xfree(inode); return NULL; } if (dump_memfd_inode(fd, inode, name, st)) { close(fd); xfree(inode); return NULL; } close(fd); list_add_tail(&inode->list, &memfd_inodes); return inode; } static int dump_one_memfd(int lfd, u32 id, const struct fd_parms *p) { MemfdFileEntry mfe = MEMFD_FILE_ENTRY__INIT; FileEntry fe = FILE_ENTRY__INIT; struct memfd_dump_inode *inode; struct fd_link _link, *link; const char *name; if (!p->link) { if (fill_fdlink(lfd, p, &_link)) return -1; link = &_link; } else link = p->link; link_strip_deleted(link); /* link->name is always 
started with "." which has to be skipped. */ if (strncmp(link->name + 1, MEMFD_PREFIX, MEMFD_PREFIX_LEN) == 0) name = &link->name[1 + MEMFD_PREFIX_LEN]; else name = link->name + 1; inode = dump_unique_memfd_inode(lfd, name, &p->stat); if (!inode) return -1; mfe.id = id; mfe.flags = p->flags; mfe.pos = p->pos; mfe.fown = (FownEntry *)&p->fown; mfe.inode_id = inode->id; fe.type = FD_TYPES__MEMFD; fe.id = mfe.id; fe.memfd = &mfe; return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); } int dump_one_memfd_cond(int lfd, u32 *id, struct fd_parms *parms) { if (fd_id_generate_special(parms, id)) return dump_one_memfd(lfd, *id, parms); return 0; } const struct fdtype_ops memfd_dump_ops = { .type = FD_TYPES__MEMFD, .dump = dump_one_memfd, }; /* * Restore only */ struct memfd_info { MemfdFileEntry *mfe; struct file_desc d; struct memfd_restore_inode *inode; }; static struct memfd_restore_inode *memfd_alloc_inode(int id) { struct memfd_restore_inode *inode; list_for_each_entry(inode, &memfd_inodes, list) if (inode->mie->inode_id == id) return inode; pr_err("Unable to find the %d memfd inode\n", id); return NULL; } static int collect_one_memfd_inode(void *o, ProtobufCMessage *base, struct cr_img *i) { MemfdInodeEntry *mie = pb_msg(base, MemfdInodeEntry); struct memfd_restore_inode *inode = o; inode->mie = mie; mutex_init(&inode->lock); inode->fdstore_id = -1; inode->pending_seals = 0; inode->was_opened_rw = false; list_add_tail(&inode->list, &memfd_inodes); return 0; } static struct collect_image_info memfd_inode_cinfo = { .fd_type = CR_FD_MEMFD_INODE, .pb_type = PB_MEMFD_INODE, .priv_size = sizeof(struct memfd_restore_inode), .collect = collect_one_memfd_inode, .flags = COLLECT_SHARED | COLLECT_NOFREE, }; int prepare_memfd_inodes(void) { return collect_image(&memfd_inode_cinfo); } static int memfd_open_inode_nocache(struct memfd_restore_inode *inode) { MemfdInodeEntry *mie = NULL; int fd = -1; int ret = -1; int flags; mie = inode->mie; if (mie->seals == 
F_SEAL_SEAL) { inode->pending_seals = 0; flags = 0; } else { /* Seals are applied later due to F_SEAL_FUTURE_WRITE */ inode->pending_seals = mie->seals; flags = MFD_ALLOW_SEALING; } if (mie->has_hugetlb_flag) flags |= mie->hugetlb_flag; fd = memfd_create(mie->name, flags); if (fd < 0) { pr_perror("Can't create memfd:%s", mie->name); goto out; } if (restore_memfd_shmem_content(fd, mie->shmid, mie->size)) goto out; if (mie->has_mode) ret = cr_fchperm(fd, mie->uid, mie->gid, mie->mode); else ret = cr_fchown(fd, mie->uid, mie->gid); if (ret) { pr_perror("Can't set permissions { uid %d gid %d mode %#o } of memfd:%s", (int)mie->uid, (int)mie->gid, mie->has_mode ? (int)mie->mode : -1, mie->name); goto out; } inode->fdstore_id = fdstore_add(fd); if (inode->fdstore_id < 0) goto out; ret = fd; fd = -1; out: if (fd != -1) close(fd); return ret; } static int memfd_open_inode(struct memfd_restore_inode *inode) { int fd; if (inode->fdstore_id != -1) return fdstore_get(inode->fdstore_id); mutex_lock(&inode->lock); if (inode->fdstore_id != -1) fd = fdstore_get(inode->fdstore_id); else fd = memfd_open_inode_nocache(inode); mutex_unlock(&inode->lock); return fd; } int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap) { struct memfd_info *mfi; MemfdFileEntry *mfe; int fd, _fd; u32 flags; mfi = container_of(d, struct memfd_info, d); mfe = mfi->mfe; pr_info("Restoring memfd id=%d\n", mfe->id); fd = memfd_open_inode(mfi->inode); if (fd < 0) return -1; /* Reopen the fd with original permissions */ flags = fdflags ? *fdflags : mfe->flags; if (filemap && (flags & O_ACCMODE) == O_RDWR) return fd; if (!mfi->inode->was_opened_rw && (flags & O_ACCMODE) == O_RDWR) { /* * If there is only a single RW-opened fd for a memfd, it can * be used to pass it to execveat() with AT_EMPTY_PATH to have * its contents executed. 
This currently works only for the * original fd from memfd_create() so return the original fd * once -- in case the caller expects to be the sole opener * and does execveat() from this memfd. */ if (!fcntl(fd, F_SETFL, flags)) { mfi->inode->was_opened_rw = true; return fd; } pr_pwarn("Can't change fd flags to %#o for memfd id=%d", flags, mfe->id); } /* * Ideally we should call compat version open() to not force the * O_LARGEFILE file flag with regular open(). It doesn't seem that * important though. */ _fd = __open_proc(PROC_SELF, 0, flags, "fd/%d", fd); if (_fd < 0) pr_perror("Can't reopen memfd id=%d", mfe->id); else if (!filemap && (flags & O_ACCMODE) == O_RDWR) pr_warn("execveat(fd=%d, ..., AT_EMPTY_PATH) might fail after restore; memfd id=%d\n", _fd, mfe->id); close(fd); return _fd; } static int memfd_open_fe_fd(struct file_desc *d, int *new_fd) { MemfdFileEntry *mfe; int fd; if (inherited_fd(d, new_fd)) return 0; fd = memfd_open(d, NULL, false); if (fd < 0) return -1; mfe = container_of(d, struct memfd_info, d)->mfe; if (restore_fown(fd, mfe->fown) < 0) goto err; if (lseek(fd, mfe->pos, SEEK_SET) < 0) { pr_perror("Can't restore file position of %d for memfd id=%d", fd, mfe->id); goto err; } *new_fd = fd; return 0; err: close(fd); return -1; } static char *memfd_d_name(struct file_desc *d, char *buf, size_t s) { MemfdInodeEntry *mie = NULL; struct memfd_info *mfi; mfi = container_of(d, struct memfd_info, d); mie = mfi->inode->mie; if (snprintf(buf, s, "%s%s", MEMFD_PREFIX, mie->name) >= s) { pr_err("Buffer too small for memfd name %s\n", mie->name); return NULL; } return buf; } static struct file_desc_ops memfd_desc_ops = { .type = FD_TYPES__MEMFD, .open = memfd_open_fe_fd, .name = memfd_d_name, }; static int collect_one_memfd(void *o, ProtobufCMessage *msg, struct cr_img *i) { struct memfd_info *info = o; info->mfe = pb_msg(msg, MemfdFileEntry); info->inode = memfd_alloc_inode(info->mfe->inode_id); if (!info->inode) return -1; return file_desc_add(&info->d, 
info->mfe->id, &memfd_desc_ops); } struct collect_image_info memfd_cinfo = { .fd_type = CR_FD_MEMFD_FILE, .pb_type = PB_MEMFD_FILE, .priv_size = sizeof(struct memfd_info), .collect = collect_one_memfd, }; struct file_desc *collect_memfd(u32 id) { struct file_desc *fdesc; fdesc = find_file_desc_raw(FD_TYPES__MEMFD, id); if (fdesc == NULL) pr_err("No entry for memfd %#x\n", id); return fdesc; } int apply_memfd_seals(void) { /* * We apply the seals after all the mappings are done because the seal * F_SEAL_FUTURE_WRITE prevents future write access (added in * Linux 5.1). Thus we must make sure all writable mappings are opened * before applying this seal. */ int ret, fd; struct memfd_restore_inode *inode; list_for_each_entry(inode, &memfd_inodes, list) { if (!inode->pending_seals) continue; fd = memfd_open_inode(inode); if (fd < 0) return -1; ret = fcntl(fd, F_ADD_SEALS, inode->pending_seals); close(fd); if (ret < 0) { pr_perror("Cannot apply seals on memfd"); return -1; } } return 0; } crac-criu-1.5.0/criu/mount-v2.c000066400000000000000000000775731471504326700162160ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "kerndat.h" #include "log.h" #include "cr_options.h" #include "xmalloc.h" #include "util.h" #include "filesystems.h" #include "mount.h" #include "mount-v2.h" #include "namespaces.h" #include "fs-magic.h" #include "path.h" #include "files-reg.h" #include "fdstore.h" #include "common/list.h" #include "common/bug.h" #include "common/compiler.h" #include "images/mnt.pb-c.h" #undef LOG_PREFIX #define LOG_PREFIX "mnt-v2: " LIST_HEAD(sharing_groups); int check_mount_v2(void) { if (!kdat.has_move_mount_set_group) { pr_debug("Mounts-v2 requires MOVE_MOUNT_SET_GROUP support\n"); return -1; } if (!kdat.has_openat2) { pr_debug("Mounts-v2 requires openat2 support\n"); return -1; } return 0; } static struct sharing_group *get_sharing_group(int shared_id, int master_id) { struct sharing_group *sg; list_for_each_entry(sg, 
&sharing_groups, list) { if (sg->shared_id == shared_id && sg->master_id == master_id) return sg; } return NULL; } static struct sharing_group *alloc_sharing_group(int shared_id, int master_id) { struct sharing_group *sg; sg = xzalloc(sizeof(struct sharing_group)); if (!sg) return NULL; sg->shared_id = shared_id; sg->master_id = master_id; INIT_LIST_HEAD(&sg->list); INIT_LIST_HEAD(&sg->mnt_list); INIT_LIST_HEAD(&sg->children); INIT_LIST_HEAD(&sg->siblings); list_add(&sg->list, &sharing_groups); return sg; } int resolve_shared_mounts_v2(void) { struct sharing_group *sg; struct mount_info *mi; /* * Create sharing groups for each unique shared_id+master_id pair and * link each mount to the corresponding sharing group. */ for (mi = mntinfo; mi; mi = mi->next) { if (!mi->shared_id && !mi->master_id) continue; pr_debug("Inspecting sharing on %2d shared_id %d master_id %d (@%s)\n", mi->mnt_id, mi->shared_id, mi->master_id, mi->ns_mountpoint); sg = get_sharing_group(mi->shared_id, mi->master_id); if (!sg) { sg = alloc_sharing_group(mi->shared_id, mi->master_id); if (!sg) return -1; } list_add(&mi->mnt_sharing, &sg->mnt_list); mi->sg = sg; } /* * Collect sharing groups tree. Mount propagation between sharing * groups only goes down this tree, meaning that only mounts of same or * descendant sharing groups receive mount propagation. */ list_for_each_entry(sg, &sharing_groups, list) { if (sg->master_id) { struct sharing_group *p; /* * Lookup parent sharing group. If one sharing group * has master_id equal to shared_id of another sharing * group than the former is a child (slave) of the * latter. Also sharing groups should not have two * parents so we check this here too. 
*/ list_for_each_entry(p, &sharing_groups, list) { if (p->shared_id != sg->master_id) continue; if (sg->parent) { pr_err("Sharing group (%d, %d) parent collision (%d, %d) (%d, %d)\n", sg->shared_id, sg->master_id, p->shared_id, p->master_id, sg->parent->shared_id, sg->parent->master_id); return -1; } sg->parent = p; if (!list_empty(&sg->siblings)) { pr_err("External slavery sharing group (%d, %d) has parent (%d, %d)\n", sg->shared_id, sg->master_id, p->shared_id, p->master_id); return -1; } list_add(&sg->siblings, &p->children); /* Don't break to check for parent collision */ } /* * If sharing group has master_id but we did't find * parent for it inside the dumped container yet, this * means that the master_id is external and a mount on * host should exist with corresponding shared_id. */ if (!sg->parent && list_empty(&sg->siblings)) { struct mount_info *ext; struct sharing_group *s; char *source = NULL; /* * Though we don't have parent sharing group * (inaccessible sharing), we can still have * siblings, sharing groups with same master_id * but different shared_id, let's collect them * to the list. */ list_for_each_entry(s, &sharing_groups, list) { if (s->master_id != sg->master_id) continue; if (s->parent) { pr_err("External slavery sharing group (%d, %d) has parent (%d, %d)\n", sg->shared_id, sg->master_id, s->parent->shared_id, s->parent->master_id); return -1; } if (!list_empty(&s->siblings)) { pr_err("External slavery sharing group collision (%d, %d) (%d, %d)\n", sg->shared_id, sg->master_id, s->shared_id, s->master_id); return -1; } list_add(&s->siblings, &sg->siblings); } BUG_ON(list_empty(&sg->mnt_list)); mi = list_entry(sg->mnt_list.next, struct mount_info, mnt_sharing); /* * We need to know from which mount on host we * can get this external master_id. There are * two options: mountpoint external mount or * root mount of container. 
*/ if ((ext = mnt_get_external_bind_nodev(mi))) source = ext->external; else if (mnt_is_root_bind(mi)) source = opts.root; if (!source) { pr_err("Sharing group (%d, %d) " "has unreachable sharing. Try --enable-external-masters.\n", sg->shared_id, sg->master_id); return -1; } sg->source = source; list_for_each_entry(s, &sg->siblings, siblings) s->source = sg->source; pr_debug("Detected external slavery for shared group (%d, %d) with source %s\n", sg->shared_id, sg->master_id, source); } } } return 0; } /* * When first mount from superblock is mounted, give other mounts * a hint that they can now just bindmount from the first one. */ static int propagate_mount_v2(struct mount_info *mi) { struct mount_info *t; list_for_each_entry(t, &mi->mnt_bind, mnt_bind) { if (t->mounted) continue; if (t->bind) continue; if (!issubpath(t->root, mi->root)) continue; pr_debug("\t\tPropagate %d to %d\n", mi->mnt_id, t->mnt_id); t->bind = mi; t->s_dev_rt = mi->s_dev_rt; } return 0; } /* * Mounts first mount of superblock */ static int do_new_mount_v2(struct mount_info *mi) { unsigned long sflags = mi->sb_flags; unsigned long mflags = mi->flags & (~MS_PROPAGATE); char *src; struct fstype *tp = mi->fstype; bool remount_ro = (tp->restore && mi->sb_flags & MS_RDONLY); mount_fn_t do_mount = (tp->mount) ? tp->mount : do_simple_mount; src = resolve_source(mi); if (!src) return -1; /* Merge superblock and mount flags if it's possible */ if (!(mflags & ~MS_MNT_KNOWN_FLAGS) && !((sflags ^ mflags) & MS_RDONLY)) { sflags |= mflags; mflags = 0; } if (remount_ro) sflags &= ~MS_RDONLY; if (do_mount(mi, src, mnt_fsname(mi), sflags) < 0) { pr_perror("Can't mount at %s", mi->plain_mountpoint); return -1; } /* * Mount-v2 relies that before mount tree is constructed all mounts * should remain private. Newly created mounts can become non-private * initially depending on parent/source sharing, let's be as explicit * as possible here and make it obvious that mount becomes private. 
*/ if (mount(NULL, mi->plain_mountpoint, NULL, MS_PRIVATE, NULL)) { pr_perror("Can't remount %s with MS_PRIVATE", mi->plain_mountpoint); return -1; } if (tp->restore && tp->restore(mi)) return -1; if (remount_ro) { int fd; fd = open(mi->plain_mountpoint, O_PATH); if (fd < 0) { pr_perror("Unable to open %s", mi->plain_mountpoint); return -1; } sflags |= MS_RDONLY | MS_REMOUNT; if (userns_call(apply_sb_flags, 0, &sflags, sizeof(sflags), fd)) { pr_perror("Unable to apply mount flags %d for %s", mi->sb_flags, mi->plain_mountpoint); close(fd); return -1; } close(fd); } if (mflags && mount(NULL, mi->plain_mountpoint, NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { pr_perror("Unable to apply bind-mount options"); return -1; } mi->mounted = true; return 0; } /* * Does simple bindmount, but via new kernel mount api, * which also handles autofs and symlink without resolving. */ static int __do_bind_mount_v2(char *from, char *to) { int detached_fd; detached_fd = sys_open_tree(AT_FDCWD, from, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE); if (detached_fd == -1) { pr_perror("Failed to open_tree %s", from); return -1; } if (sys_move_mount(detached_fd, "", AT_FDCWD, to, MOVE_MOUNT_F_EMPTY_PATH)) { pr_perror("Failed to move_mount from %s to %s", from, to); close(detached_fd); return -1; } close(detached_fd); return 0; } LIST_HEAD(deleted_mounts); /* * Bind-mounts all later mounts of superblock from first one, * also handles first mounts of mountpoint external mounts. 
*/ static int do_bind_mount_v2(struct mount_info *mi) { char *root = NULL, *cut_root, rpath[PATH_MAX]; unsigned long mflags; int exit_code = -1; char *mnt_path = NULL; int level = 0; if (mi->need_plugin) { if (restore_ext_mount(mi)) return -1; goto out; } if (mnt_is_nodev_external(mi)) { root = mi->external; goto do_bind; } cut_root = get_relative_path(mi->root, mi->bind->root); if (!cut_root) { pr_err("Failed to find root for %d in our supposed bind %d\n", mi->mnt_id, mi->bind->mnt_id); return -1; } /* * Mount ->private can be initialized on fstype->mount() callback, * which is called for first mount of superblock in do_new_mount(). * Also ->private have to be copied to all other mounts of superblock * to provide users of it with actual data. */ mi->private = mi->bind->private; mnt_path = mi->bind->plain_mountpoint; if (cut_root[0]) { snprintf(rpath, sizeof(rpath), "%s/%s", mnt_path, cut_root); root = rpath; } else { root = mnt_path; } do_bind: pr_info("\tBind %s to %s\n", root, mi->plain_mountpoint); if (unlikely(mi->deleted)) { level = make_parent_dirs_if_need(-1, root); if (level < 0) goto err; if (mi->is_dir) { if (mkdir(root, 0600)) { pr_perror("Can't re-create deleted directory %s", root); goto err; } } else { int fd = open(root, O_WRONLY | O_CREAT | O_EXCL, 0600); if (fd < 0) { pr_perror("Can't re-create deleted file %s", root); goto err; } close(fd); } } if (__do_bind_mount_v2(root, mi->plain_mountpoint)) goto err; /* * Mount-v2 relies that before mount tree is constructed all mounts * should remain private. Newly created mounts can become non-private * initially depending on parent/source sharing, let's be as explicit * as possible here and make it obvious that mount becomes private. 
*/ if (mount(NULL, mi->plain_mountpoint, NULL, MS_PRIVATE, NULL)) { pr_perror("Can't remount %s with MS_PRIVATE", mi->plain_mountpoint); goto err; } mflags = mi->flags & (~MS_PROPAGATE); if (!mi->bind || mflags != (mi->bind->flags & (~MS_PROPAGATE))) if (mount(NULL, mi->plain_mountpoint, NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) { pr_perror("Can't bind remount 0x%lx at %s", mflags, mi->plain_mountpoint); goto err; } if (mi->deleted) { /* * Deleted mounts can't be moved, will delete source after * moving to proper position in the mount tree FIXME. */ mi->deleted_level = level; level = 0; list_add(&mi->deleted_list, &deleted_mounts); } out: mi->mounted = true; exit_code = 0; err: if (level) rm_parent_dirs(-1, root, level); return exit_code; } /* Mounts root container mount. */ static int do_mount_root_v2(struct mount_info *mi) { unsigned long flags = MS_BIND; int fd; if (root_ns_mask & CLONE_NEWUSER) { fd = open(mi->plain_mountpoint, O_PATH); if (fd < 0) { pr_perror("Unable to open %s", mi->plain_mountpoint); return -1; } if (userns_call(mount_root, 0, &flags, sizeof(flags), fd)) { pr_err("Unable to mount %s\n", mi->plain_mountpoint); close(fd); return -1; } close(fd); } else { if (mount(opts.root, mi->plain_mountpoint, NULL, flags, NULL)) { pr_perror("Unable to mount %s %s (id=%d)", opts.root, mi->plain_mountpoint, mi->mnt_id); return -1; } } /* * Mount-v2 relies that before mount tree is constructed all mounts * should remain private. Newly created mounts can become non-private * initially depending on parent/source sharing, let's be as explicit * as possible here and make it obvious that mount becomes private. */ if (mount(NULL, mi->plain_mountpoint, NULL, MS_PRIVATE, NULL)) { pr_perror("Can't remount %s with MS_PRIVATE", mi->plain_mountpoint); return -1; } mi->mounted = true; return 0; } /* Check if mount is ready to be mounted. 
*/ static bool can_mount_now_v2(struct mount_info *mi) { struct mount_info *root, *ext; /* Parent should be mounted already, that's how mnt_tree_for_each works */ BUG_ON(mi->parent && !mi->parent->mounted); /* Root mounts can be mounted at any moment */ if (rst_mnt_is_root(mi)) { pr_debug("%s: true as %d is global root\n", __func__, mi->mnt_id); return true; } /* External mounts can be mounted at any moment */ if (mi->external) { pr_debug("%s: true as %d is external\n", __func__, mi->mnt_id); return true; } /* * Container root and external mounts should go before * anything which should be bindmounted from them. */ if (!mi->bind) { root = mnt_get_root_bind(mi); if (root) { pr_debug("%s: false as %d is bind of not mounted global root %d\n", __func__, mi->mnt_id, root->mnt_id); return false; } ext = mnt_get_external_bind(mi); if (ext) { pr_debug("%s: false as %d is a bind of not mounted external %d\n", __func__, mi->mnt_id, ext->mnt_id); return false; } } /* Non fsroot mounts can not be mounted without bind-mount */ if (!fsroot_mounted(mi) && !mi->bind && !mi->need_plugin) { pr_debug("%s: false as %d is non-root without bind or plugin\n", __func__, mi->mnt_id); return false; } return true; } static int __set_unbindable_v2(struct mount_info *mi) { if (mi->flags & MS_UNBINDABLE) { if (mount(NULL, service_mountpoint(mi), NULL, MS_UNBINDABLE, NULL)) { pr_perror("Failed to set mount %d unbindable", mi->mnt_id); return -1; } } return 0; } /* * Setting MS_UNBINDABLE flag is slightly delayed, * obviousely until we finish bind-mounting everything. 
*/ static int set_unbindable_v2(void) { int orig_nsfd = -1, nsfd = -1, exit_code = -1; struct mount_info *mi; struct ns_id *nsid; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { bool ns_has_unbindable = false; if (nsid->nd != &mnt_ns_desc) continue; for (mi = mntinfo; mi != NULL; mi = mi->next) if (mi->nsid == nsid && mi->flags & MS_UNBINDABLE) ns_has_unbindable = true; if (!ns_has_unbindable) continue; nsfd = fdstore_get(nsid->mnt.nsfd_id); if (nsfd < 0) goto err; if (switch_ns_by_fd(nsfd, &mnt_ns_desc, orig_nsfd == -1 ? &orig_nsfd : NULL)) goto err; close_safe(&nsfd); if (mnt_tree_for_each(nsid->mnt.mntinfo_tree, __set_unbindable_v2)) goto err; } exit_code = 0; err: if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) exit_code = -1; close_safe(&nsfd); return exit_code; } /* * Detects if mount is a directory mount or file mount based on stat on * its mountpoint inside already mounted parent mount. This is deeply * integrated in plain mount creation process because before mounting * something plain we need to create right type of mountpoint for it. */ static int detect_is_dir(struct mount_info *mi) { static char mountpoint[PATH_MAX]; char *rel_path; struct stat st; if (mi->is_dir != -1) return 0; if (mi->mnt_id == HELPER_MNT_ID) { pr_err("Helper %s should have is_dir pre-set\n", mi->ns_mountpoint); return -1; } if (!mi->parent || mi->parent == root_yard_mp) { pr_err("Mount namespace root mount %d should have is_dir pre-set\n", mi->mnt_id); return -1; } if (!mi->parent->mounted) { pr_err("Parent mount %d of %d should be mounted\n", mi->parent->mnt_id, mi->mnt_id); return -1; } rel_path = get_relative_path(mi->ns_mountpoint, mi->parent->ns_mountpoint); if (!rel_path) { pr_err("Child-parent mountpoint mismatch %d:%s %d:%s\n", mi->mnt_id, mi->ns_mountpoint, mi->parent->mnt_id, mi->parent->ns_mountpoint); return -1; } snprintf(mountpoint, sizeof(mountpoint), "%s%s%s", mi->parent->plain_mountpoint, rel_path[0] ? 
"/" : "", rel_path); if (stat(mountpoint, &st)) { pr_perror("Can't stat mountpoint %s", mountpoint); return -1; } if (S_ISDIR(st.st_mode)) mi->is_dir = true; else mi->is_dir = false; pr_debug("Mount %d is detected as %s-mount\n", mi->mnt_id, mi->is_dir ? "dir" : "file"); return 0; } static int create_plain_mountpoint(struct mount_info *mi) { BUG_ON(mi->is_dir == -1); pr_debug("Create plain mountpoint %s for %d\n", mi->plain_mountpoint, mi->mnt_id); if (mi->is_dir) { if (mkdir(mi->plain_mountpoint, 0600)) { pr_perror("Unable to mkdir mountpoint %s", mi->plain_mountpoint); return -1; } } else { int fd; fd = creat(mi->plain_mountpoint, 0600); if (fd < 0) { pr_perror("Unable to create mountpoint %s", mi->plain_mountpoint); return -1; } close(fd); } return 0; } /* * At this point we already have a mount in service mount namespace now we * bind-mount it to the final restored mount namespace via new kernel mount * API. */ static int do_mount_in_right_mntns(struct mount_info *mi) { int nsfd = -1, orig_nsfd = -1, detached_fd = -1, exit_code = -1; if (!mi->nsid) return 0; detached_fd = sys_open_tree(AT_FDCWD, mi->plain_mountpoint, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE); if (detached_fd == -1) { pr_perror("Failed to open_tree %s", mi->plain_mountpoint); goto err; } nsfd = fdstore_get(mi->nsid->mnt.nsfd_id); if (nsfd < 0) goto err; if (switch_ns_by_fd(nsfd, &mnt_ns_desc, &orig_nsfd)) goto err; if (create_plain_mountpoint(mi)) goto err; if (sys_move_mount(detached_fd, "", AT_FDCWD, mi->plain_mountpoint, MOVE_MOUNT_F_EMPTY_PATH)) { pr_perror("Failed to cross-mntns move_mount plain mount %d", mi->mnt_id); goto err; } exit_code = 0; err: if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) exit_code = -1; close_safe(&nsfd); close_safe(&detached_fd); return exit_code; } static int do_mount_one_v2(struct mount_info *mi) { int ret; if (mi->mounted) return 0; if (!can_mount_now_v2(mi)) { pr_debug("Postpone mount %d\n", mi->mnt_id); return 1; } if 
(detect_is_dir(mi)) return -1; if (create_plain_mountpoint(mi)) return -1; pr_debug("\tMounting %s @%d (%d)\n", mi->fstype->name, mi->mnt_id, mi->need_plugin); if (rst_mnt_is_root(mi)) { if (opts.root == NULL) { pr_err("The --root option is required to restore a mount namespace\n"); return -1; } ret = do_mount_root_v2(mi); } else if (!mi->bind && !mi->need_plugin && (!mi->external || !strcmp(mi->external, EXTERNAL_DEV_MOUNT))) { ret = do_new_mount_v2(mi); } else { ret = do_bind_mount_v2(mi); } if (ret == 0 && fetch_rt_stat(mi, mi->plain_mountpoint)) return -1; if (ret == 0 && propagate_mount_v2(mi)) return -1; if (mi->fstype->code == FSTYPE__UNSUPPORTED) { struct statfs st; if (statfs(mi->plain_mountpoint, &st)) { pr_perror("Unable to statfs %s", mi->plain_mountpoint); return -1; } if (st.f_type == BTRFS_SUPER_MAGIC) mi->fstype = find_fstype_by_name("btrfs"); } if (ret == 0 && do_mount_in_right_mntns(mi)) return -1; return ret; } static int populate_mnt_ns_v2(void) { if (make_yard(mnt_roots)) return -1; if (mnt_tree_for_each(root_yard_mp, do_mount_one_v2)) return -1; return set_unbindable_v2(); } /* * This function moves plain mounts into actual mount tree. * * Mounts in children list are sorted the way that sibling overmount goes after * all siblings which it overmounts (see __mnt_resort_children). The function * mnt_tree_for_each is effectively DFS (in case we don't postpone), thus all * descendants of all mounts which we sibling-overmount are mounted before us. * Be careful, we can't postpone (return >0) from this function because of it. 
*/ static int move_mount_to_tree(struct mount_info *mi) { int fd; fd = open(mi->mountpoint, O_PATH); if (fd < 0) { pr_perror("Failed to open real mountpoint of %d", mi->mnt_id); return -1; } mi->mp_fd_id = fdstore_add(fd); close(fd); if (mi->mp_fd_id < 0) { pr_err("Can't add mountpoint of mount %d to fdstore\n", mi->mnt_id); return -1; } pr_info("Move mount %d from %s to %s\n", mi->mnt_id, mi->plain_mountpoint, mi->mountpoint); if (sys_move_mount(AT_FDCWD, mi->plain_mountpoint, AT_FDCWD, mi->mountpoint, 0)) { pr_perror("Failed to move mount %d from %s to %s", mi->mnt_id, mi->plain_mountpoint, mi->mountpoint); return -1; } fd = open(mi->mountpoint, O_PATH); if (fd < 0) { pr_perror("Failed to open real mountpoint of %d", mi->mnt_id); return -1; } mi->mnt_fd_id = fdstore_add(fd); close(fd); if (mi->mnt_fd_id < 0) { pr_err("Can't add mount %d fd to fdstore\n", mi->mnt_id); return -1; } return 0; } static int assemble_tree_from_plain_mounts(struct ns_id *nsid) { return mnt_tree_for_each(nsid->mnt.mntinfo_tree, move_mount_to_tree); } /* * With MOVE_MOUNT_SET_GROUP source mount should have wider root than * destination, thus let's choose widest mount from group as first. */ static struct mount_info *get_first_mount(struct sharing_group *sg) { struct mount_info *first = NULL, *tmp; int min_len = 0; list_for_each_entry(tmp, &sg->mnt_list, mnt_sharing) { int len = strlen(tmp->root); if (!first || len < min_len) { first = tmp; min_len = len; } } return first; } struct set_group_arg { int src_id; char source[PATH_MAX]; int dst_id; }; static int __move_mount_set_group(void *arg, int dfd, int pid) { struct set_group_arg *sga = (struct set_group_arg *)arg; int src_fd, dst_fd, exit_code = -1; if (sga->src_id != -1) { src_fd = fdstore_get(sga->src_id); BUG_ON(src_fd < 0); } else { char *source_mp; BUG_ON(sga->source[0] == '\0'); /* * Source path should not always be a mountpoint as we * automatically resolve it to mountpoint below. 
*/ source_mp = resolve_mountpoint(sga->source); if (!source_mp) { pr_err("Failed to find %s mountpoint\n", sga->source); return -1; } src_fd = open(source_mp, O_PATH); if (src_fd < 0) { pr_perror("Failed to open %s mountpoint", source_mp); xfree(source_mp); return -1; } xfree(source_mp); } dst_fd = fdstore_get(sga->dst_id); BUG_ON(dst_fd < 0); /* Copy shared_id of the source */ if (sys_move_mount(src_fd, "", dst_fd, "", MOVE_MOUNT_SET_GROUP | MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH)) { pr_perror("Failed to copy sharing from %d:%s to %d", sga->src_id, sga->source ?: "", sga->dst_id); goto err; } exit_code = 0; err: close(src_fd); close(dst_fd); return exit_code; } /* * Copy sharing between mounts passing mountpoint fds via fdstore ids. Also it * is possible (for external mounts) to pass path on mountpoint via source path, * it would resolve to mountpoint automatically. */ static int move_mount_set_group(int src_id, char *source, int dst_id) { struct set_group_arg sga = { .src_id = src_id, .dst_id = dst_id, }; sga.source[0] = '\0'; if (source) { if (snprintf(sga.source, sizeof(sga.source), "%s", source) >= sizeof(sga.source)) { pr_err("Source %s is too long\n", source); return -1; } } if (userns_call(__move_mount_set_group, 0, &sga, sizeof(sga), -1)) return -1; return 0; } static int restore_one_sharing(struct sharing_group *sg, struct mount_info *target) { char target_path[PATH_MAX]; int target_fd; target_fd = fdstore_get(target->mnt_fd_id); BUG_ON(target_fd < 0); snprintf(target_path, sizeof(target_path), "/proc/self/fd/%d", target_fd); /* Restore target's master_id from shared_id of the source */ if (sg->master_id) { if (sg->parent) { struct mount_info *first; /* Get shared_id from parent sharing group */ first = get_first_mount(sg->parent); if (move_mount_set_group(first->mnt_fd_id, NULL, target->mnt_fd_id)) { pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, target->mnt_id); close(target_fd); return -1; } } else { /* * External slavery. 
We rely on the user to give us the * right source for external mount with all proper * sharing options setup (it should be either shared * or non-shared slave). If source is a private mount * we would fail. */ if (move_mount_set_group(-1, sg->source, target->mnt_fd_id)) { pr_err("Failed to copy sharing from source %s to %d\n", sg->source, target->mnt_id); close(target_fd); return -1; } } /* Convert shared_id to master_id */ if (mount(NULL, target_path, NULL, MS_SLAVE, NULL)) { pr_perror("Failed to make mount %d slave", target->mnt_id); close(target_fd); return -1; } } /* Restore target's shared_id */ if (sg->shared_id) { if (mount(NULL, target_path, NULL, MS_SHARED, NULL)) { pr_perror("Failed to make mount %d shared", target->mnt_id); close(target_fd); return -1; } } close(target_fd); return 0; } static int restore_one_sharing_group(struct sharing_group *sg) { struct mount_info *first, *other; first = get_first_mount(sg); if (restore_one_sharing(sg, first)) return -1; /* Restore sharing for other mounts from the sharing group */ list_for_each_entry(other, &sg->mnt_list, mnt_sharing) { if (other == first) continue; if (is_sub_path(other->root, first->root)) { if (move_mount_set_group(first->mnt_fd_id, NULL, other->mnt_fd_id)) { pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, other->mnt_id); return -1; } } else { /* * Case where mounts of this sharing group don't have common root. * For instance we can create two sub-directories .a and .b in some * shared mount, bindmount them separately somethere and umount the * original mount. Now we have both bindmounts shared between each * other. Kernel only allows to copy sharing between mounts when * source root contains destination root, which is not true for * these two, so we can't just copy from first to other. * * For external sharing (!sg->parent) with only master_id (shared_id * == 0) we can workaround this by copying from their external source * instead (same as we did for a first mount). 
* * This is a w/a runc usecase, see https://github.com/opencontainers/runc/pull/3442 */ if (!sg->parent && !sg->shared_id) { if (restore_one_sharing(sg, other)) return -1; } else { pr_err("Can't copy sharing from %d[%s] to %d[%s]\n", first->mnt_id, first->root, other->mnt_id, other->root); return -1; } } } return 0; } static struct sharing_group *sharing_group_next(struct sharing_group *sg) { if (!list_empty(&sg->children)) return list_entry(sg->children.next, struct sharing_group, siblings); while (sg->parent) { if (sg->siblings.next == &sg->parent->children) sg = sg->parent; else return list_entry(sg->siblings.next, struct sharing_group, siblings); } return NULL; } static int restore_mount_sharing_options(void) { struct sharing_group *sg; list_for_each_entry(sg, &sharing_groups, list) { struct sharing_group *t; if (sg->parent) continue; /* Handle dependent sharing groups in tree order */ for (t = sg; t != NULL; t = sharing_group_next(t)) { if (restore_one_sharing_group(t)) return -1; } } return 0; } static int remove_source_of_deleted_mount(struct mount_info *mi) { char *cut_root, path[PATH_MAX], *root; BUG_ON(!mi->deleted || !mi->bind); cut_root = get_relative_path(mi->root, mi->bind->root); if (!cut_root) { pr_err("Failed to find root for %d in our supposed bind %d\n", mi->mnt_id, mi->bind->mnt_id); return -1; } if (cut_root[0]) { snprintf(path, sizeof(path), "%s/%s", mi->bind->plain_mountpoint, cut_root); root = path; } else { root = mi->bind->plain_mountpoint; } if (mi->is_dir) { if (rmdir(root)) { pr_perror("Can't remove deleted directory %s", root); return -1; } } else { if (unlink(root)) { pr_perror("Can't unlink deleted file %s", root); return -1; } } if (mi->deleted_level) rm_parent_dirs(-1, root, mi->deleted_level); return 0; } /* Delay making mounts deleted until we've restored sharing groups */ static int remove_sources_of_deleted_mounts(void) { struct mount_info *mi; int ret = 0; list_for_each_entry(mi, &deleted_mounts, deleted_list) { if 
(remove_source_of_deleted_mount(mi)) ret = -1; } return ret; } static int get_empty_mntns(void) { int orig_nsfd, nsfd = -1; orig_nsfd = open_proc(PROC_SELF, "ns/mnt"); if (orig_nsfd < 0) return -1; /* Create the new mount namespace */ if (unshare(CLONE_NEWNS)) { pr_perror("Unable to create a new mntns"); close(orig_nsfd); return -1; } if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)) { pr_perror("Can't remount \"/\" with MS_PRIVATE"); goto err; } if (make_yard(mnt_roots)) goto err; if (cr_pivot_root(mnt_roots)) goto err; if (mkdirpat(AT_FDCWD, mnt_roots, 0777)) { pr_err("Failed to setup root yard in empty mntns\n"); goto err; } nsfd = open_proc(PROC_SELF, "ns/mnt"); err: if (restore_ns(orig_nsfd, &mnt_ns_desc)) close_safe(&nsfd); return nsfd; } /* Create almost empty mount namespaces only with root yard precreated */ static int pre_create_mount_namespaces(void) { int orig_nsfd = -1, nsfd = -1, empty_mntns, exit_code = -1; char path[PATH_MAX]; struct ns_id *nsid; empty_mntns = get_empty_mntns(); if (empty_mntns == -1) { pr_err("Failed to create empty mntns\n"); goto err; } /* restore mount namespaces */ for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { if (nsid->nd != &mnt_ns_desc) continue; if (switch_ns_by_fd(empty_mntns, &mnt_ns_desc, orig_nsfd == -1 ? 
&orig_nsfd : NULL)) goto err; /* Create the new mount namespace */ if (unshare(CLONE_NEWNS)) { pr_perror("Unable to create a new mntns"); goto err; } nsfd = open_proc(PROC_SELF, "ns/mnt"); if (nsfd < 0) goto err; /* Pin new mntns with a file descriptor */ nsid->mnt.nsfd_id = fdstore_add(nsfd); close(nsfd); if (nsid->mnt.nsfd_id < 0) { pr_err("Can't add mntns fd to fdstore\n"); goto err; } if (make_yard(mnt_roots)) goto err; print_ns_root(nsid, 0, path, sizeof(path)); if (mkdir(path, 0600)) { pr_perror("Unable to create %s", path); goto err; } } exit_code = 0; err: if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) exit_code = -1; close_safe(&empty_mntns); return exit_code; } /* * Assemble the mount tree for each restored mount namespace * from pre-created plain mounts. */ static int assemble_mount_namespaces(void) { int orig_nsfd = -1, nsfd = -1, rootfd = -1, exit_code = -1; char path[PATH_MAX]; struct ns_id *nsid; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { if (nsid->nd != &mnt_ns_desc) continue; nsfd = fdstore_get(nsid->mnt.nsfd_id); if (nsfd < 0) goto err; if (switch_ns_by_fd(nsfd, &mnt_ns_desc, orig_nsfd == -1 ? 
&orig_nsfd : NULL)) { close(nsfd); goto err; } close(nsfd); if (assemble_tree_from_plain_mounts(nsid)) goto err; /* Set its root */ print_ns_root(nsid, 0, path, sizeof(path) - 1); if (cr_pivot_root(path)) goto err; /* root fd is used to restore file mappings */ rootfd = open_proc(PROC_SELF, "root"); if (rootfd < 0) goto err; nsid->mnt.root_fd_id = fdstore_add(rootfd); if (nsid->mnt.root_fd_id < 0) { pr_err("Can't add root fd to fdstore\n"); close(rootfd); goto err; } close(rootfd); } exit_code = 0; err: if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) exit_code = -1; return exit_code; } /* The main entry point of mount-v2 for creating mounts */ int prepare_mnt_ns_v2(void) { if (!(root_ns_mask & CLONE_NEWNS)) return 0; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED if (!opts.has_binfmt_misc && !list_empty(&binfmt_misc_list)) { /* * Add to root yard along with other plain mounts and mntns * directories. This mount would be created and restored by * generic mount creation code, but it would never be moved to * any restored mount namespaces. 
*/ if (!add_cr_time_mount(root_yard_mp, "binfmt_misc", "binfmt_misc", 0, true)) return -1; } #endif if (validate_mounts(mntinfo, false)) return -1; if (pre_create_mount_namespaces()) return -1; if (populate_mnt_ns_v2()) return -1; if (assemble_mount_namespaces()) return -1; if (restore_mount_sharing_options()) return -1; return remove_sources_of_deleted_mounts(); } crac-criu-1.5.0/criu/mount.c000066400000000000000000003144131471504326700156540ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "cr_options.h" #include "util.h" #include "util-pie.h" #include "log.h" #include "plugin.h" #include "filesystems.h" #include "mount.h" #include "mount-v2.h" #include "pstree.h" #include "image.h" #include "namespaces.h" #include "protobuf.h" #include "fs-magic.h" #include "path.h" #include "files-reg.h" #include "external.h" #include "clone-noasan.h" #include "fdstore.h" #include "rst-malloc.h" #include "images/mnt.pb-c.h" #undef LOG_PREFIX #define LOG_PREFIX "mnt: " #define CONTEXT_OPT "context=" /* A helper mount_info entry for the roots yard */ struct mount_info *root_yard_mp = NULL; static LIST_HEAD(delayed_unbindable); char *service_mountpoint(const struct mount_info *mi) { if (!opts.mntns_compat_mode && opts.mode == CR_RESTORE) { BUG_ON(!mi->plain_mountpoint); return mi->plain_mountpoint; } return mi->mountpoint; } int ext_mount_add(char *key, char *val) { cleanup_free char *e_str = NULL; e_str = xmalloc(strlen(key) + strlen(val) + 8); if (!e_str) return -1; /* * On dump the key is the mountpoint as seen from the mount * namespace, the val is some name that will be put into image * instead of the mount point's root path. * * On restore the key is the name from the image (the one * mentioned above) and the val is the path in criu's mount * namespace that will become the mount point's root, i.e. -- * be bind mounted to the respective mountpoint. 
*/
	sprintf(e_str, "mnt[%s]:%s", key, val);

	return add_external(e_str);
}

/* Parse the value of the auto external-mount option, with optional :m/:s flags */
int ext_mount_parse_auto(char *key)
{
	opts.autodetect_ext_mounts = true;

	if (*key == ':') {
		key++;
		if (*key == 'm')
			opts.enable_external_masters = true;
		else if (*key == 's')
			opts.enable_external_sharing = true;
		else if (*key != '\0')
			return -1;
	}

	return 0;
}

/* Lookup ext_mount by key field */
static char *ext_mount_lookup(char *key)
{
	char *v;
	int len = strlen(key);
	/* + 6 covers "mnt[", "]" and the trailing NUL */
	char mkey[len + 6];

	sprintf(mkey, "mnt[%s]", key);
	v = external_lookup_by_key(mkey);
	if (IS_ERR(v))
		v = NULL;

	return v;
}

/*
 * Single linked list of mount points get from proc/images
 */
struct mount_info *mntinfo;

static void mntinfo_add_list(struct mount_info *new)
{
	if (!mntinfo)
		mntinfo = new;
	else {
		struct mount_info *pm;

		/* Add to the tail. (FIXME -- make O(1) ) */
		for (pm = mntinfo; pm->next != NULL; pm = pm->next)
			;
		pm->next = new;
	}
}

/* Prepend @new to the list headed by @head */
void mntinfo_add_list_before(struct mount_info **head, struct mount_info *new)
{
	new->next = *head;
	*head = new;
}

static struct mount_info *__lookup_overlayfs(struct mount_info *list, char *rpath, unsigned int st_dev,
					     unsigned int st_ino, unsigned int mnt_id)
{
	/*
	 * Goes through all entries in the mountinfo table
	 * looking for a mount point that contains the file specified
	 * in rpath. Uses the device number st_dev and the inode number st_ino
	 * to make sure the file is correct.
	 */
	struct mount_info *mi_ret = NULL;
	struct mount_info *m;
	int mntns_root = -1;

	for (m = list; m != NULL; m = m->next) {
		struct stat f_stat;
		int ret_stat;

		if (m->fstype->code != FSTYPE__OVERLAYFS)
			continue;

		/*
		 * We need the mntns root fd of the process to be dumped,
		 * to make sure we stat the correct file
		 */
		if (mntns_root == -1) {
			mntns_root = __mntns_get_root_fd(root_item->pid->real);
			if (mntns_root < 0) {
				pr_err("Unable to get the root file descriptor of pid %d\n", root_item->pid->real);
				return ERR_PTR(-ENOENT);
			}
		}

		/*
		 * Concatenates m->ns_mountpoint with rpath and attempts
		 * to stat the resulting path at mntns_root
		 */
		if (is_root_mount(m)) {
			ret_stat = fstatat(mntns_root, rpath, &f_stat, 0);
		} else {
			char _full_path[PATH_MAX];
			int n = snprintf(_full_path, PATH_MAX, "%s/%s", m->ns_mountpoint, rpath);
			if (n >= PATH_MAX) {
				pr_err("Not enough space to concatenate %s and %s\n", m->ns_mountpoint, rpath);
				return ERR_PTR(-ENOSPC);
			}
			ret_stat = fstatat(mntns_root, _full_path, &f_stat, 0);
		}

		if (ret_stat == 0 && st_dev == f_stat.st_dev && st_ino == f_stat.st_ino)
			mi_ret = m;
	}

	return mi_ret;
}

/*
 * Looks up the mnt_id and path of a file in an overlayFS directory.
 *
 * This is useful in order to fix the OverlayFS bug present in the
 * Linux Kernel before version 4.2. See fixup_overlayfs for details.
 *
 * We first check to see if the mnt_id and st_dev numbers currently match
 * some entry in the mountinfo table. If so, we already have the correct mnt_id
 * and no fixup is needed.
 *
 * Then we proceed to see if there are any overlayFS mounted directories
 * in the mountinfo table. If so, we concatenate the mountpoint with the
 * name of the file, and stat the resulting path to check if we found the
 * correct device id and node number. If that is the case, we update the
 * mount id and link variables with the correct values.
 */
struct mount_info *lookup_overlayfs(char *rpath, unsigned int st_dev, unsigned int st_ino, unsigned int mnt_id)
{
	struct mount_info *m;

	/* If the mnt_id and device number match for some entry, no fixup is needed */
	for (m = mntinfo; m != NULL; m = m->next)
		if (st_dev == kdev_to_odev(m->s_dev) && mnt_id == m->mnt_id)
			return NULL;

	return __lookup_overlayfs(mntinfo, rpath, st_dev, st_ino, mnt_id);
}

static struct mount_info *__lookup_mnt_id(struct mount_info *list, int id)
{
	struct mount_info *m;

	for (m = list; m != NULL; m = m->next)
		if (m->mnt_id == id)
			return m;

	return NULL;
}

struct mount_info *lookup_mnt_id(unsigned int id)
{
	return __lookup_mnt_id(mntinfo, id);
}

struct mount_info *lookup_mnt_sdev(unsigned int s_dev)
{
	struct mount_info *m;

	for (m = mntinfo; m != NULL; m = m->next)
		/*
		 * We should not provide notdir bindmounts to open_mount as
		 * opening them can fail/hang for binds of unix sockets/fifos
		 */
		if (m->s_dev == s_dev && mnt_is_dir(m))
			return m;

	pr_err("Unable to find suitable mount point for s_dev %x\n", s_dev);
	return NULL;
}

/* Find the deepest mount in the tree whose ns_mountpoint is a prefix of @path */
static struct mount_info *mount_resolve_path(struct mount_info *mntinfo_tree, const char *path)
{
	size_t pathlen = strlen(path);
	struct mount_info *m = mntinfo_tree, *c;

	while (1) {
		list_for_each_entry(c, &m->children, siblings) {
			size_t n;

			n = strlen(c->ns_mountpoint + 1);
			if (n > pathlen)
				continue;

			if (strncmp(c->ns_mountpoint + 1, path, min(n, pathlen)))
				continue;
			if (n < pathlen && path[n] != '/')
				continue;

			m = c;
			break;
		}
		if (&c->siblings == &m->children)
			break;
	}

	pr_debug("Path `%s' resolved to `%s' mountpoint\n", path, m->ns_mountpoint);
	return m;
}

dev_t phys_stat_resolve_dev(struct ns_id *ns, dev_t st_dev, const char *path)
{
	struct mount_info *m;

	m = mount_resolve_path(ns->mnt.mntinfo_tree, path);
	/*
	 * BTRFS returns subvolume dev-id instead of
	 * superblock dev-id, in such case return device
	 * obtained from mountinfo (ie subvolume0).
	 */
	return strcmp(m->fstype->name, "btrfs") ?
MKKDEV(major(st_dev), minor(st_dev)) : m->s_dev;
}

bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev, struct ns_id *ns, const char *path)
{
	if (st_dev == kdev_to_odev(phys_dev))
		return true;

	return phys_dev == phys_stat_resolve_dev(ns, st_dev, path);
}

/*
 * Compare super-blocks mounted at two places
 */
static bool mounts_sb_equal(struct mount_info *a, struct mount_info *b)
{
	if (a->s_dev != b->s_dev)
		return false;

	/*
	 * If one of compared mounts is external its mount info can have fstype
	 * and source fields changed by resolve_external_mounts() or
	 * try_resolve_ext_mount(), but we still want to detect bindmounts of
	 * this external mount, so let's skip source and fstype checks for it.
	 */
	if (!a->external && !b->external) {
		if (strcmp(a->source, b->source) != 0)
			return false;

		if (a->fstype != b->fstype)
			return false;

		if (a->fstype->sb_equal)
			return a->fstype->sb_equal(a, b);
	} else {
		if (a->fstype->sb_equal)
			return a->fstype->sb_equal(a, b);
		else if (b->fstype->sb_equal)
			return b->fstype->sb_equal(a, b);
	}

	if (strcmp(a->options, b->options))
		return false;

	return true;
}

/*
 * Compare superblocks AND the way they are mounted
 */
static bool mounts_equal(struct mount_info *a, struct mount_info *b)
{
	if (!mounts_sb_equal(a, b))
		return false;
	if (strcmp(a->root, b->root))
		return false;

	return true;
}

/*
 * mnt_roots is a temporary directory for restoring sub-trees of
 * non-root namespaces.
 */
char *mnt_roots;

static struct mount_info *mnt_build_ids_tree(struct mount_info *list)
{
	struct mount_info *m, *root = NULL;

	/*
	 * Just resolve the mnt_id:parent_mnt_id relations
	 */

	pr_debug("\tBuilding plain mount tree\n");
	for (m = list; m != NULL; m = m->next) {
		struct mount_info *parent;

		pr_debug("\t\tWorking on %d->%d\n", m->mnt_id, m->parent_mnt_id);

		if (m->mnt_id != m->parent_mnt_id)
			parent = __lookup_mnt_id(list, m->parent_mnt_id);
		else /* a circular mount reference. It's rootfs or smth like it. */
			parent = NULL;

		if (!parent) {
			/* Only a root mount can be without parent */
			if (!root && m->is_ns_root) {
				root = m;
				continue;
			}

			pr_err("No parent found for mountpoint %d (@%s)\n", m->mnt_id, m->ns_mountpoint);
			return NULL;
		}

		m->parent = parent;
		list_add_tail(&m->siblings, &parent->children);
	}

	if (!root) {
		pr_err("No root found for tree\n");
		return NULL;
	}

	return root;
}

/* Count of '/'-s in the mountpoint path, used as the (u)mount ordering key */
static unsigned int mnt_depth(struct mount_info *m)
{
	unsigned int depth = 0;
	char *c;

	for (c = m->ns_mountpoint; *c != '\0'; c++)
		if (*c == '/')
			depth++;

	return depth;
}

static void __mnt_resort_children(struct mount_info *parent)
{
	LIST_HEAD(list);

	/*
	 * Put children mounts in an order they can be (u)mounted
	 * I.e. if we have mounts on foo/bar/, foo/bar/foobar/ and foo/
	 * we should put them in the foo/bar/foobar/, foo/bar/, foo/ order.
	 * Otherwise we will not be able to (u)mount them in a sequence.
	 *
	 * Funny, but all we need for this is to sort them in the descending
	 * order of the amount of /-s in a path =)
	 *
	 * Use stupid insertion sort here, we're not expecting mount trees
	 * to contain hundreds (or more) elements.
	 */
	pr_info("\tResorting children of %d in mount order\n", parent->mnt_id);

	while (!list_empty(&parent->children)) {
		struct mount_info *m, *p;
		unsigned int depth;

		m = list_first_entry(&parent->children, struct mount_info, siblings);
		list_del(&m->siblings);

		depth = mnt_depth(m);
		list_for_each_entry(p, &list, siblings)
			if (mnt_depth(p) < depth)
				break;

		list_add_tail(&m->siblings, &p->siblings);
	}

	list_splice(&list, &parent->children);
}

static struct mount_info *mnt_subtree_next(struct mount_info *mi, struct mount_info *root);

static void resort_siblings(struct mount_info *root, void (*resort_children)(struct mount_info *))
{
	struct mount_info *mi = root;

	while (1) {
		/*
		 * Explanation: sorting the children of the tree like these is
		 * safe and does not break the tree search in mnt_subtree_next
		 * (DFS-next search), as we sort children before calling next
		 * on parent and thus before DFS-next ever touches them, so
		 * from the perspective of DFS-next all children look like they
		 * are already sorted.
		 */
		resort_children(mi);
		mi = mnt_subtree_next(mi, root);
		if (!mi)
			break;
	}
}

/* Recursively print the mount tree for debugging */
static void mnt_tree_show(struct mount_info *tree, int off)
{
	struct mount_info *m;

	pr_info("%*s[%s](%d->%d)\n", off, "", tree->ns_mountpoint, tree->mnt_id, tree->parent_mnt_id);

	list_for_each_entry(m, &tree->children, siblings)
		mnt_tree_show(m, off + 1);

	pr_info("%*s<--\n", off, "");
}

/* Returns -1 on error, 1 if external mount resolved, 0 otherwise */
static int try_resolve_ext_mount(struct mount_info *info)
{
	char devstr[64];

	/*
	 * Only allow mountpoint-external mounts in root mntns. Their lookup is
	 * based on mountpoint path, but in nested mntns we can have completely
	 * different mount tree and at same mountpoint we can have completely
	 * different mount.
	 */
	if (info->nsid->type == NS_ROOT) {
		char *ext;

		ext = ext_mount_lookup(info->ns_mountpoint + 1 /* trim the .
*/);
		if (ext) {
			pr_info("Found %s mapping for %s mountpoint\n", ext, info->ns_mountpoint);
			info->external = ext;
			return 1;
		}
	}

	snprintf(devstr, sizeof(devstr), "dev[%d/%d]", kdev_major(info->s_dev), kdev_minor(info->s_dev));

	if (info->fstype->code == FSTYPE__UNSUPPORTED && fsroot_mounted(info)) {
		char *val;

		val = external_lookup_by_key(devstr);
		if (!IS_ERR_OR_NULL(val)) {
			char *source;
			int len;

			pr_info("Found %s dev-mapping for %s(%d) mountpoint\n", val, info->ns_mountpoint,
				info->mnt_id);
			info->external = EXTERNAL_DEV_MOUNT;

			len = strlen(val) + sizeof("dev[]");
			source = xrealloc(info->source, len);
			if (source == NULL)
				return -1;

			snprintf(source, len, "dev[%s]", val);
			info->fstype = fstype_auto();
			BUG_ON(info->fstype->code != FSTYPE__AUTO);
			info->source = source;

			return 1;
		}
	}

	return 0;
}

/*
 * Find the mount_info from which the respective bind-mount
 * can be created. It can be either an FS-root mount, or the
 * root of the tree (the latter only if its root path is the
 * sub-path of the bind mount's root).
 */
static struct mount_info *find_fsroot_mount_for(struct mount_info *bm)
{
	struct mount_info *sm;

	list_for_each_entry(sm, &bm->mnt_bind, mnt_bind)
		if (fsroot_mounted(sm) || (sm->parent == root_yard_mp && strstartswith(bm->root, sm->root)))
			return sm;

	return NULL;
}

static bool mnt_needs_remap(struct mount_info *m)
{
	struct mount_info *t;

	if (!m->parent || m->parent == root_yard_mp)
		return false;

	list_for_each_entry(t, &m->parent->children, siblings) {
		if (m == t)
			continue;

		if (issubpath(t->ns_mountpoint, m->ns_mountpoint))
			return true;
	}

	/*
	 * If we are children-overmount and parent is remapped, we should be
	 * remapped too, else fixup_remap_mounts() won't be able to move parent
	 * to it's real place, it will move child instead.
	 */
	if (!strcmp(m->parent->ns_mountpoint, m->ns_mountpoint))
		return mnt_needs_remap(m->parent);

	return false;
}

static bool __mnt_is_external_bind(struct mount_info *mi, struct mount_info *bind)
{
	if (bind->external && is_sub_path(mi->root, bind->root))
		return true;

	return false;
}

/*
 * Say mount is external if it was explicitly specified as an external or it
 * can be bind-mounted from such an explicit external mount.
 */
struct mount_info *mnt_get_external_bind(struct mount_info *mi)
{
	return mnt_bind_pick(mi, __mnt_is_external_bind);
}

bool mnt_is_external_bind(struct mount_info *mi)
{
	return mnt_get_external_bind(mi);
}

static bool __can_receive_master_from_external(struct mount_info *mi, struct mount_info *bind)
{
	if (mnt_is_nodev_external(bind) && bind->master_id == mi->master_id && is_sub_path(mi->root, bind->root))
		return true;

	return false;
}

static struct mount_info *can_receive_master_from_external(struct mount_info *mi)
{
	return mnt_bind_pick(mi, __can_receive_master_from_external);
}

static bool __has_mounted_external_bind(struct mount_info *mi, struct mount_info *bind)
{
	if (bind->external && bind->mounted && is_sub_path(mi->root, bind->root))
		return true;

	return false;
}

bool has_mounted_external_bind(struct mount_info *mi)
{
	return mnt_bind_pick(mi, __has_mounted_external_bind);
}

bool rst_mnt_is_root(struct mount_info *mi)
{
	return (mi->is_ns_root && mi->nsid->id == root_item->ids->mnt_ns_id);
}

static bool __mnt_is_root_bind(struct mount_info *mi, struct mount_info *bind)
{
	if (rst_mnt_is_root(bind) && is_sub_path(mi->root, bind->root))
		return true;

	return false;
}

struct mount_info *mnt_get_root_bind(struct mount_info *mi)
{
	return mnt_bind_pick(mi, __mnt_is_root_bind);
}

bool mnt_is_root_bind(struct mount_info *mi)
{
	return mnt_get_root_bind(mi);
}

static bool __can_receive_master_from_root(struct mount_info *mi, struct mount_info *bind)
{
	if (rst_mnt_is_root(bind) && bind->master_id == mi->master_id && is_sub_path(mi->root, bind->root))
		return true;

	return false;
}

static struct mount_info *can_receive_master_from_root(struct mount_info *mi)
{
	return mnt_bind_pick(mi, __can_receive_master_from_root);
}

static bool __mnt_is_external_bind_nodev(struct mount_info *mi, struct mount_info *bind)
{
	if (bind->external && !mnt_is_dev_external(bind) && is_sub_path(mi->root, bind->root))
		return true;

	return false;
}

struct mount_info *mnt_get_external_bind_nodev(struct mount_info *mi)
{
	return mnt_bind_pick(mi, __mnt_is_external_bind_nodev);
}

/*
 * Having two children with same mountpoint is unsupported. That can happen in
 * case of mount propagation inside of shared mounts, in that case it is hard
 * to find out mount propagation siblings and which of these mounts is above
 * (visible) and which is beneath (hidden). It would've broken mount restore
 * order in can_mount_now and also visibility assumptions in open_mountpoint.
 *
 * Anyway after kernel v4.11 such mounts will be impossible.
 */
static int validate_children_collision(struct mount_info *mnt)
{
	struct mount_info *chi, *chj;

	list_for_each_entry(chi, &mnt->children, siblings) {
		list_for_each_entry(chj, &mnt->children, siblings) {
			if (chj == chi)
				break;
			if (!strcmp(chj->ns_mountpoint, chi->ns_mountpoint)) {
				pr_err("Mount %d has two children with same "
				       "mountpoint: %d %d\n",
				       mnt->mnt_id, chj->mnt_id, chi->mnt_id);
				return -1;
			}
		}
	}

	return 0;
}

int validate_mounts(struct mount_info *info, bool for_dump)
{
	struct mount_info *m, *t;

	for (m = info; m; m = m->next) {
		if (validate_children_collision(m))
			return -1;

		if (mnt_is_external_bind(m))
			continue;

		if (mnt_is_root_bind(m))
			continue;

		/*
		 * Mountpoint can point to / of an FS. In that case this FS
		 * should be of some known type so that we can just mount one.
		 *
		 * Otherwise it's a bindmount mountpoint and we try to find
		 * what fsroot mountpoint it's bound to. If this point is the
		 * root mount, the path to bindmount root should be accessible
		 * form the rootmount path (the strstartswith check in the
		 * else branch below).
		 */
		if (fsroot_mounted(m)) {
			if (m->fstype->code == FSTYPE__UNSUPPORTED) {
				pr_err("FS mnt %s dev %#x root %s unsupported id %d\n", m->ns_mountpoint, m->s_dev,
				       m->root, m->mnt_id);
				return -1;
			}
		} else {
			t = find_fsroot_mount_for(m);
			if (!t) {
				int ret;

				/*
				 * No root-mount found for this bind and it's neither
				 * marked nor auto-resolved as external one. So last
				 * chance not to fail is to talk to plugins.
				 */
				if (for_dump) {
					ret = run_plugins(DUMP_EXT_MOUNT, m->ns_mountpoint, m->mnt_id);
					if (ret == 0)
						m->need_plugin = true;
				} else
					/*
					 * Plugin should take care of this one
					 * in restore_ext_mount, or do_bind_mount
					 * will mount it as external
					 */
					ret = m->need_plugin ? 0 : -ENOTSUP;

				if (ret < 0) {
					if (ret == -ENOTSUP)
						pr_err("%d:%s doesn't have a proper root mount\n", m->mnt_id,
						       m->ns_mountpoint);
					return -1;
				}
			}
		}
	}

	return 0;
}

static struct mount_info *find_best_external_match(struct mount_info *list, struct mount_info *info)
{
	struct mount_info *it, *candidate = NULL;

	for (it = list; it; it = it->next) {
		if (!mounts_sb_equal(info, it))
			continue;

		/*
		 * This means we have a situation like:
		 *
		 * root@criu:~# mount --bind bind1/subdir/ bind2
		 * root@criu:~# mount --bind bind1/ bind3
		 *
		 * outside the container, and bind1 is directly bind mounted
		 * inside the container. mounts_equal() considers these mounts
		 * equal for bind purposes, but their roots are different, and
		 * we want to match the one with the right root.
		 */
		if (!issubpath(info->root, it->root))
			continue;

		candidate = it;

		/*
		 * Consider the case of:
		 *
		 * mount /xxx
		 * mount --bind /xxx /yyy
		 * mount --make-shared /yyy
		 * mount --bind /xxx /zzz
		 * mount --make-shared /zzz
		 * bind mount a shared mount into the namespace
		 *
		 * Here, we want to return the /right/ mount, not just a mount
		 * that's equal. However, in the case:
		 *
		 * bind mount a shared mount into the namespace
		 * inside the namespace, remount MS_PRIVATE
		 * inside the namespace, remount MS_SHARED
		 *
		 * there will be no external mount with matching sharing
		 * because the sharing is only internal; we still want to bind
		 * mount from this mountinfo so we should return it, but we
		 * should make the sharing namespace private after that bind
		 * mount.
		 *
		 * Below are the cases where we found an exact match.
		 */
		if (info->flags & MS_SHARED && info->shared_id == it->shared_id)
			return candidate;

		if (info->flags & MS_SLAVE && info->master_id == it->shared_id)
			return candidate;
	}

	return candidate;
}

/* Find the NS_CRIU mount namespace (criu's own), collecting its mountinfo lazily */
static struct ns_id *find_ext_ns_id(void)
{
	struct ns_id *ns;

	/*
	 * NOTE(review): the loop condition tests ns->next, so the last entry
	 * of the ns_ids list is never inspected — confirm this is intentional.
	 */
	for (ns = ns_ids; ns->next; ns = ns->next)
		if (ns->type == NS_CRIU && ns->nd == &mnt_ns_desc) {
			if (!ns->mnt.mntinfo_list && !collect_mntinfo(ns, false))
				break;
			return ns;
		}

	pr_err("Failed to find criu pid's mount ns\n");
	return NULL;
}

static int resolve_external_mounts(struct mount_info *info)
{
	struct ns_id *ext_ns = NULL;
	struct mount_info *m;

	if (opts.autodetect_ext_mounts) {
		ext_ns = find_ext_ns_id();
		if (!ext_ns)
			return -1;
	}

	for (m = info; m; m = m->next) {
		int ret;
		char *p, *cut_root;
		struct mount_info *match;

		if (m->parent == NULL || m->is_ns_root)
			continue;

		ret = try_resolve_ext_mount(m);
		if (ret < 0)
			return ret;
		if (ret == 1 || !ext_ns)
			continue;

		match = find_best_external_match(ext_ns->mnt.mntinfo_list, m);
		if (!match)
			continue;

		if (m->flags & MS_SHARED) {
			if (!opts.enable_external_sharing)
				continue;

			if (m->shared_id != match->shared_id)
				m->internal_sharing = true;
		}

		if (m->flags & MS_SLAVE) {
			if (!opts.enable_external_masters)
				continue;

			/*
			 * In order to support something like internal slavery,
			 * we need to teach can_mount_now and do_mount_one
			 * about slavery relationships in external mounts. This
			 * seems like an uncommon case, so we punt for not.
			 */
			if (m->master_id != match->shared_id && m->master_id != match->master_id)
				continue;
		}

		cut_root = cut_root_for_bind(m->root, match->root);

		p = xsprintf("%s/%s", match->ns_mountpoint + 1, cut_root);
		if (!p)
			return -1;

		m->external = AUTODETECTED_MOUNT;
		/*
		 * Put the guessed name in source. It will be picked up
		 * as auto-root in get_mp_root() on restore.
		 */
		xfree(m->source);
		m->source = p;

		pr_info("autodetected external mount %s for %s(%d)\n", p, m->ns_mountpoint, m->mnt_id);
	}

	return 0;
}

/* Build @m's mountpoint path relative to the root of its parent's superblock */
static int root_path_from_parent(struct mount_info *m, char *buf, int size)
{
	bool head_slash = false, tail_slash = false;
	int p_len, m_len, len;

	if (!m->parent || m->parent == root_yard_mp)
		return -1;

	p_len = strlen(m->parent->ns_mountpoint);
	m_len = strlen(m->ns_mountpoint);

	len = snprintf(buf, size, "%s", m->parent->root);
	if (len >= size)
		return -1;

	BUG_ON(len <= 0);
	if (buf[len - 1] == '/')
		tail_slash = true;

	size -= len;
	buf += len;

	len = m_len - p_len;
	BUG_ON(len < 0);
	if (len) {
		if (m->ns_mountpoint[p_len] == '/')
			head_slash = true;

		len = snprintf(buf, size, "%s%s", (!tail_slash && !head_slash) ? "/" : "",
			       m->ns_mountpoint + p_len + (tail_slash && head_slash));
		if (len >= size)
			return -1;
	}

	return 0;
}

static int same_propagation_group(struct mount_info *a, struct mount_info *b)
{
	char root_path_a[PATH_MAX], root_path_b[PATH_MAX];

	/*
	 * If mounts are in same propagation group:
	 * 1) Their parents should be different
	 * 2) Their parents should be together in same shared group
	 */
	if (!a->parent || !b->parent || a->parent == b->parent || a->parent->shared_id != b->parent->shared_id)
		return 0;

	if (root_path_from_parent(a, root_path_a, PATH_MAX)) {
		pr_err("Failed to get root path for mount %d\n", a->mnt_id);
		return -1;
	}

	if (root_path_from_parent(b, root_path_b, PATH_MAX)) {
		pr_err("Failed to get root path for mount %d\n", b->mnt_id);
		return -1;
	}

	/*
	 * 3) Their mountpoints relative to the root of the superblock of their
	 * parent's share should be equal
	 */
	if (!strcmp(root_path_a, root_path_b))
		return 1;

	return 0;
}

/*
 * Note: Only valid if called consequently on all mounts in mntinfo list.
 *
 * Note: We may want to iterate over all bindmounts of some mount, and we would
 * use ->mnt_bind list for this, but iterating over ->mnt_bind list is
 * obviously meaningless before search_bindmounts had actually put bindmounts
 * in it. That's why we have ->mnt_bind_is_populated to protect from misuse of
 * ->mnt_bind. (As ->mnt_bind list can validly be empty when mount has no
 * bindmounts we need separate field to indicate population.)
 */
static void __search_bindmounts(struct mount_info *mi)
{
	struct mount_info *t;

	if (mi->mnt_bind_is_populated)
		return;

	for (t = mi->next; t; t = t->next) {
		if (mounts_sb_equal(mi, t)) {
			list_add(&t->mnt_bind, &mi->mnt_bind);
			t->mnt_bind_is_populated = true;
			pr_debug("\t"
				 "The mount %3d is bind for %3d (@%s -> @%s)\n",
				 t->mnt_id, mi->mnt_id, t->ns_mountpoint, mi->ns_mountpoint);
		}
	}

	mi->mnt_bind_is_populated = true;
}

/* Populate ->mnt_bind lists for all mounts (see note above __search_bindmounts) */
static void search_bindmounts(void)
{
	struct mount_info *mi;

	for (mi = mntinfo; mi; mi = mi->next)
		__search_bindmounts(mi);
}

struct mount_info *mnt_bind_pick(struct mount_info *mi, bool (*pick)(struct mount_info *mi, struct mount_info *bind))
{
	struct mount_info *bind;

	BUG_ON(!mi);
	if (pick(mi, mi))
		return mi;

	/*
	 * Shouldn't use mnt_bind list before it was populated in search_bindmounts
	 */
	BUG_ON(!mi->mnt_bind_is_populated);
	list_for_each_entry(bind, &mi->mnt_bind, mnt_bind)
		if (pick(mi, bind))
			return bind;

	return NULL;
}

static int resolve_shared_mounts(struct mount_info *info)
{
	struct mount_info *m, *t;

	/*
	 * If we have a shared mounts, both master
	 * slave targets are to be present in mount
	 * list, otherwise we can't be sure if we can
	 * recreate the scheme later on restore.
	 */
	for (m = info; m; m = m->next) {
		bool need_share, need_master;

		need_share = m->shared_id && list_empty(&m->mnt_share);
		need_master = m->master_id;

		pr_debug("Inspecting sharing on %2d shared_id %d master_id %d (@%s)\n", m->mnt_id, m->shared_id,
			 m->master_id, m->ns_mountpoint);

		for (t = info; t && (need_share || need_master); t = t->next) {
			if (t == m)
				continue;

			if (need_master && t->shared_id == m->master_id) {
				pr_debug("\t"
					 "The mount %3d is slave for %3d (@%s -> @%s)\n",
					 m->mnt_id, t->mnt_id, m->ns_mountpoint, t->ns_mountpoint);
				list_add(&m->mnt_slave, &t->mnt_slave_list);
				m->mnt_master = t;
				need_master = false;
			}

			/* Collect all mounts from this group */
			if (need_share && t->shared_id == m->shared_id) {
				pr_debug("\t"
					 "Mount %3d is shared with %3d group %3d (@%s -> @%s)\n",
					 m->mnt_id, t->mnt_id, m->shared_id, t->ns_mountpoint, m->ns_mountpoint);
				list_add(&t->mnt_share, &m->mnt_share);
			}
		}

		/*
		 * External master detected
		 */
		if (need_master) {
			if ((t = can_receive_master_from_external(m)) || (t = can_receive_master_from_root(m))) {
				pr_debug("Detected external slavery for %d via %d\n", m->mnt_id, t->mnt_id);
				if (m != t)
					list_add(&m->mnt_ext_slave, &t->mnt_ext_slave);
				continue;
			}

			pr_err("Mount %d %s (master_id: %d shared_id: %d) "
			       "has unreachable sharing. Try --enable-external-masters.\n",
			       m->mnt_id, m->ns_mountpoint, m->master_id, m->shared_id);
			return -1;
		}
	}

	/* Search propagation groups */
	for (m = info; m; m = m->next) {
		struct mount_info *sparent;

		if (!list_empty(&m->mnt_propagate))
			continue;

		if (!m->parent || !m->parent->shared_id)
			continue;

		list_for_each_entry(sparent, &m->parent->mnt_share, mnt_share) {
			struct mount_info *schild;

			list_for_each_entry(schild, &sparent->children, siblings) {
				int ret;

				ret = same_propagation_group(m, schild);
				if (ret < 0)
					return -1;
				else if (ret) {
					BUG_ON(!mounts_equal(m, schild));
					pr_debug("\tMount %3d is in same propagation group with %3d (@%s ~ @%s)\n",
						 m->mnt_id, schild->mnt_id, m->ns_mountpoint, schild->ns_mountpoint);
					list_add(&schild->mnt_propagate, &m->mnt_propagate);
				}
			}
		}
	}

	return 0;
}

static struct mount_info *mnt_build_tree(struct mount_info *list)
{
	struct mount_info *tree;

	/*
	 * Organize them in a sequence in which they can be mounted/umounted.
	 */

	pr_info("Building mountpoints tree\n");
	tree = mnt_build_ids_tree(list);
	if (!tree)
		return NULL;

	resort_siblings(tree, __mnt_resort_children);
	pr_info("Done:\n");
	mnt_tree_show(tree, 0);
	return tree;
}

int mnt_is_dir(struct mount_info *pm)
{
	int mntns_root;
	struct stat st;

	mntns_root = mntns_get_root_fd(pm->nsid);
	if (mntns_root < 0) {
		pr_warn("Can't get root fd of mntns for %d: %s\n", pm->mnt_id, strerror(errno));
		return 0;
	}

	if (fstatat(mntns_root, pm->ns_mountpoint, &st, 0)) {
		pr_warn("Can't fstatat on %s: %s\n", pm->ns_mountpoint, strerror(errno));
		return 0;
	}

	if (S_ISDIR(st.st_mode))
		return 1;
	return 0;
}

int __check_mountpoint_fd(struct mount_info *pm, int mnt_fd, bool parse_mountinfo)
{
	struct stat st;
	unsigned int dev;
	int ret;

	ret = fstat(mnt_fd, &st);
	if (ret < 0) {
		pr_perror("fstat(%s) failed", pm->ns_mountpoint);
		return -1;
	}

	if (pm->s_dev_rt == MOUNT_INVALID_DEV) {
		pr_err("Resolving over invalid device for %#x %s %s\n", pm->s_dev, pm->fstype->name,
		       pm->ns_mountpoint);
		return -1;
	}

	dev = MKKDEV(major(st.st_dev),
minor(st.st_dev));
	/*
	 * Always check for @s_dev_rt here, because the @s_dev
	 * from the image (in case of restore) has all rights
	 * to not match the device (say it's migrated and kernel
	 * allocates new device ID).
	 */
	if (dev != pm->s_dev_rt) {
		/*
		 * For btrfs device numbers in stat and mountinfo can be
		 * different, fallback to get_sdev_from_fd to get right dev.
		 */
		if (!strcmp(pm->fstype->name, "btrfs") && !get_sdev_from_fd(mnt_fd, &dev, parse_mountinfo) &&
		    dev == pm->s_dev_rt)
			return 0;
		pr_warn("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev,
			pm->fstype->name, pm->ns_mountpoint);
		return -1;
	}

	return 0;
}

int check_mountpoint_fd(struct mount_info *pm, int mnt_fd)
{
	return __check_mountpoint_fd(pm, mnt_fd, false);
}

/*
 * mnt_fd is a file descriptor on the mountpoint, which is closed in an error case.
 * If mnt_fd is -1, the mountpoint will be opened by this function.
 */
int __open_mountpoint(struct mount_info *pm)
{
	int mntns_root, mnt_fd;

	mntns_root = mntns_get_root_fd(pm->nsid);
	if (mntns_root < 0)
		return -1;

	mnt_fd = openat(mntns_root, pm->ns_mountpoint, O_RDONLY);
	if (mnt_fd < 0) {
		pr_perror("Can't open %s", pm->ns_mountpoint);
		return -1;
	}

	if (check_mountpoint_fd(pm, mnt_fd)) {
		close(mnt_fd);
		return -1;
	}

	return mnt_fd;
}

int open_mount(unsigned int s_dev)
{
	struct mount_info *m;
	int mnt_fd;

	m = lookup_mnt_sdev(s_dev);
	if (!m)
		return -ENOENT;

	mnt_fd = __open_mountpoint(m);
	if (mnt_fd < 0)
		pr_err("Can't open mount %#x\n", s_dev);
	return mnt_fd;
}

/* Bind-mount a mount point in a temporary place without children */
static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_path_root)
{
	char *mnt_path;

	mnt_path = mkdtemp(mnt_path_tmp);
	if (mnt_path == NULL && errno == ENOENT)
		mnt_path = mkdtemp(mnt_path_root);
	if (mnt_path == NULL) {
		pr_warn("Can't create a temporary directory: %s\n", strerror(errno));
		return NULL;
	}

	if (mount(mi->ns_mountpoint, mnt_path, NULL, MS_BIND, NULL)) {
		pr_perror("Can't bind-mount %d:%s to %s", mi->mnt_id, mi->ns_mountpoint, mnt_path);
		rmdir(mnt_path);
		return NULL;
	}

	return mnt_path;
}

static int get_clean_fd(struct mount_info *mi)
{
	char *mnt_path = NULL;
	char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
	char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
	int fd;

	mnt_path = get_clean_mnt(mi, mnt_path_tmp, mnt_path_root);
	if (!mnt_path)
		return -1;

	fd = open(mnt_path, O_RDONLY | O_DIRECTORY, 0);
	if (fd < 0) {
		pr_perror("Can't open directory %s", mnt_path);
	} else {
		if (__check_mountpoint_fd(mi, fd, true))
			goto err_close;
	}

	if (umount2(mnt_path, MNT_DETACH)) {
		pr_perror("Can't detach mount %s", mnt_path);
		goto err_close;
	}

	if (rmdir(mnt_path)) {
		pr_perror("Can't remove tmp dir %s", mnt_path);
		goto err_close;
	}

	return fd;

err_close:
	close_safe(&fd);
	return -1;
}

/*
 * Our children mount can have same mountpoint as it's parent,
 * call these - children-overmount.
 * Sibling mount's mountpoint can be a subpath of our mountpoint
 * call these - sibling-overmount.
 * In both above cases our mountpoint is not visible from the
 * root of our mount namespace as it is covered by other mount.
 * mnt_is_overmounted() checks if mount is not visible.
 */
bool mnt_is_overmounted(struct mount_info *mi)
{
	struct mount_info *t, *c, *m = mi;

	if (mi->is_overmounted != -1)
		goto exit;

	mi->is_overmounted = 0;

	while (m->parent) {
		/*
		 * NOTE(review): this tests mi->parent (not m->parent) on every
		 * iteration of the walk up the tree — confirm it is intentional.
		 */
		if (mi->parent->is_overmounted == 1) {
			mi->is_overmounted = 1;
			goto exit;
		}

		/* Check there is no sibling-overmount */
		list_for_each_entry(t, &m->parent->children, siblings) {
			if (m == t)
				continue;

			if (issubpath(m->ns_mountpoint, t->ns_mountpoint)) {
				mi->is_overmounted = 1;
				goto exit;
			}
		}

		/*
		 * If parent has sibling-overmount we are not visible too,
		 * note that children-overmounts for parent are already
		 * checked as our sibling overmounts.
		 */
		m = m->parent;
	}

	/* Check there is no children-overmount */
	list_for_each_entry(c, &mi->children, siblings)
		if (!strcmp(c->ns_mountpoint, mi->ns_mountpoint)) {
			mi->is_overmounted = 1;
			goto exit;
		}

exit:
	return mi->is_overmounted;
}

static int __set_is_overmounted(struct mount_info *mi)
{
	/* coverity[check_return] */
	mnt_is_overmounted(mi);
	return 0;
}

/*
 * mnt_is_overmounted is intended to detect overmounts in original dumped mount
 * tree, so we pre-save it just after loading mount tree from images, so that
 * it does not mess up with any helper mounts or tree changes we can do.
 */
static void prepare_is_overmounted(void)
{
	struct ns_id *nsid;

	for (nsid = ns_ids; nsid; nsid = nsid->next) {
		struct mount_info *root;

		if (nsid->nd != &mnt_ns_desc)
			continue;

		root = nsid->mnt.mntinfo_tree;
		BUG_ON(root->parent);
		mnt_tree_for_each(root, __set_is_overmounted);
	}
}

/*
 * __umount_children_overmounts() assumes that the mountpoint and
 * it's ancestors have no sibling-overmounts, so we can see children
 * of these mount. Unmount our children-overmounts now.
 */
static int __umount_children_overmounts(struct mount_info *mi)
{
	struct mount_info *c, *m = mi;

	/*
	 * Our children-overmount can itself have children-overmount
	 * which covers it, so find deepest children-overmount which
	 * is visible for us now.
	 */
again:
	list_for_each_entry(c, &m->children, siblings) {
		if (!strcmp(c->ns_mountpoint, m->ns_mountpoint)) {
			m = c;
			goto again;
		}
	}

	/* Unmout children-overmounts in the order of visibility */
	while (m != mi) {
		if (umount2(m->ns_mountpoint, MNT_DETACH)) {
			pr_perror("Unable to umount child-overmount %s", m->ns_mountpoint);
			return -1;
		}

		BUG_ON(!m->parent);
		m = m->parent;
	}

	return 0;
}

/* Makes the mountpoint visible except for children-overmounts.
 */
static int __umount_overmounts(struct mount_info *m)
{
	struct mount_info *t, *ovm;
	int ovm_len, ovm_len_min = 0;

	/* Root mount has no sibling-overmounts */
	if (!m->parent)
		return 0;

	/*
	 * If parent is sibling-overmounted we are not visible
	 * too, so first try to unmount overmounts for parent.
	 */
	if (__umount_overmounts(m->parent))
		return -1;

	/* Unmount sibling-overmounts in visibility order */
next:
	ovm = NULL;
	ovm_len = strlen(m->ns_mountpoint) + 1;
	list_for_each_entry(t, &m->parent->children, siblings) {
		if (m == t)
			continue;

		if (issubpath(m->ns_mountpoint, t->ns_mountpoint)) {
			int t_len = strlen(t->ns_mountpoint);

			if (t_len < ovm_len && t_len > ovm_len_min) {
				ovm = t;
				ovm_len = t_len;
			}
		}
	}

	if (ovm) {
		ovm_len_min = ovm_len;

		/* Our sibling-overmount can have children-overmount covering it */
		if (__umount_children_overmounts(ovm))
			return -1;

		if (umount2(ovm->ns_mountpoint, MNT_DETACH)) {
			pr_perror("Unable to umount %s", ovm->ns_mountpoint + 1);
			return -1;
		}

		goto next;
	}

	return 0;
}

/* Make our mountpoint fully visible */
static int umount_overmounts(struct mount_info *m)
{
	if (__umount_overmounts(m))
		return -1;

	if (__umount_children_overmounts(m))
		return -1;

	return 0;
}

struct clone_arg {
	struct mount_info *mi;
	int *fd;
};

/*
 * Get access to the mountpoint covered by overmounts
 * and open it's cleaned copy (without children mounts).
 */
int ns_open_mountpoint(void *arg)
{
	struct clone_arg *ca = arg;
	struct mount_info *mi = ca->mi;
	int *fd = ca->fd;

	/*
	 * We should enter user namespace owning mount namespace of our mount
	 * before creating helper mount namespace. Else all mounts in helper
	 * mount namespace will be locked (MNT_LOCKED) and we won't be able to
	 * unmount them (see CL_UNPRIVILEGED in sys_umount(), clone_mnt() and
	 * copy_mnt_ns() in linux kernel code).
*/ if ((root_ns_mask & CLONE_NEWUSER) && switch_ns(root_item->pid->real, &user_ns_desc, NULL) < 0) goto err; /* * Create a helper mount namespace in which we can safely do unmounts * without breaking dumping process' environment. */ if (unshare(CLONE_NEWNS)) { pr_perror("Unable to unshare a mount namespace"); goto err; } /* Remount all mounts as private to disable propagation */ if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)) { pr_perror("Unable to remount"); goto err; } if (umount_overmounts(mi)) goto err; /* * Save fd which we opened for parent due to CLONE_FILES flag * * Mount can still have children in it, but we don't need to clean it * explicitly as when last process exits mntns all mounts in it are * cleaned from their children, and we are exactly the last process. */ *fd = open(mi->ns_mountpoint, O_DIRECTORY | O_RDONLY); if (*fd < 0) { pr_perror("Unable to open %s(%d)", mi->ns_mountpoint, mi->mnt_id); goto err; } if (__check_mountpoint_fd(mi, *fd, true)) { close(*fd); goto err; } return 0; err: return 1; } int open_mountpoint(struct mount_info *pm) { int fd = -1, cwd_fd, ns_old = -1; /* No overmounts and children - the entire mount is visible */ if (list_empty(&pm->children) && !mnt_is_overmounted(pm)) return __open_mountpoint(pm); pr_info("Mount is not fully visible %s(%d)\n", pm->ns_mountpoint, pm->mnt_id); /* * We do two things below: * a) If mount has children mounts in it which partially cover it's * content, to get access to the content we create a "private" copy of * such a mount, bind-mounting mount w/o MS_REC in a temporary place. * b) If mount is overmounted we create a private copy of it's mount * namespace so that we can safely get rid of overmounts and get an * access to the mount. * In both cases we can't do the thing from criu's mount namespace, so * we need to switch to mount's mount namespace, and later switch back. 
	 */
	if (switch_mnt_ns(pm->nsid->ns_pid, &ns_old, &cwd_fd) < 0)
		goto err;

	if (!mnt_is_overmounted(pm)) {
		pr_info("\tmount has children %s(%d)\n", pm->ns_mountpoint, pm->mnt_id);
		fd = get_clean_fd(pm);
	}

	/*
	 * Mount is overmounted or probably we can't create a temporary
	 * directory for a cleaned mount
	 */
	if (fd < 0) {
		int pid, status;
		struct clone_arg ca = { .mi = pm, .fd = &fd };

		pr_info("\tmount is overmounted or has children %s(%d)\n", pm->ns_mountpoint, pm->mnt_id);

		/*
		 * We are overmounted - not accessible in a regular way. We
		 * need to clone "private" copy of mount's mount namespace and
		 * unmount all covering overmounts in it. We also need to enter
		 * user namespace owning this mount namespace just before that
		 * (see explanation in ns_open_mountpoint). Thus we also have
		 * to create helper process here as entering user namespace is
		 * irreversible operation.
		 */
		pid = clone_noasan(ns_open_mountpoint,
				   CLONE_VFORK | CLONE_VM | CLONE_FILES | CLONE_IO | CLONE_SIGHAND | CLONE_SYSVSEM,
				   &ca);
		if (pid == -1) {
			pr_perror("Can't clone helper process");
			goto err;
		}

		errno = 0;
		if (waitpid(pid, &status, __WALL) != pid || !WIFEXITED(status) || WEXITSTATUS(status)) {
			pr_err("Can't wait or bad status: errno=%d, status=%d\n", errno, status);
			goto err;
		}
	}

	if (restore_mnt_ns(ns_old, &cwd_fd)) {
		ns_old = -1;
		goto err;
	}

	return fd < 0 ? __open_mountpoint(pm) : fd;
err:
	if (ns_old >= 0)
		/* coverity[check_return] */
		restore_mnt_ns(ns_old, &cwd_fd);
	close_safe(&fd);
	return -1;
}

/*
 * Helper for getting a path to mount's plain mountpoint.
 * Returns a malloc'ed path under mnt_roots, or NULL if mnt_roots is unset,
 * the path would be truncated, or allocation fails.
 */
char *get_plain_mountpoint(int mnt_id, char *name)
{
	static char tmp[PATH_MAX];
	int ret;

	if (!mnt_roots)
		return NULL;

	if (name)
		ret = snprintf(tmp, sizeof(tmp), "%s/mnt-%s", mnt_roots, name);
	else
		ret = snprintf(tmp, sizeof(tmp), "%s/mnt-%010d", mnt_roots, mnt_id);
	if (ret >= sizeof(tmp))
		return NULL;

	return xstrdup(tmp);
}

/*
 * Create a synthetic (checkpoint/restore-time) mount entry with id
 * HELPER_MNT_ID at @path under @root, and hook it into the mount tree
 * below the deepest existing mount whose mountpoint prefixes it.
 */
struct mount_info __maybe_unused *add_cr_time_mount(struct mount_info *root, char *fsname, const char *path,
						    unsigned int s_dev, bool rst)
{
	struct mount_info *mi, *t, *parent;
	bool add_slash = false;
	int len;

	mi = mnt_entry_alloc(rst);
	if (!mi)
		return NULL;

	len = strlen(root->mountpoint);
	/* It may be "./" or "./path/to/dir" */
	if (root->mountpoint[len - 1] != '/') {
		add_slash = true;
		len++;
	}
	mi->mountpoint = xmalloc(len + strlen(path) + 1);
	if (!mi->mountpoint)
		goto err;
	if (!rst)
		mi->ns_mountpoint = mi->mountpoint;
	if (!add_slash)
		sprintf(mi->mountpoint, "%s%s", root->mountpoint, path);
	else
		sprintf(mi->mountpoint, "%s/%s", root->mountpoint, path);
	if (rst) {
		mi->plain_mountpoint = get_plain_mountpoint(-1, "crtime");
		if (!mi->plain_mountpoint)
			goto err;
	}
	mi->mnt_id = HELPER_MNT_ID;
	mi->is_dir = true;
	mi->flags = mi->sb_flags = 0;
	mi->root = xstrdup("/");
	mi->fsname = xstrdup(fsname);
	mi->source = xstrdup(fsname);
	mi->options = xstrdup("");
	if (!mi->root || !mi->fsname || !mi->source || !mi->options)
		goto err;
	mi->fstype = find_fstype_by_name(fsname);
	mi->s_dev = mi->s_dev_rt = s_dev;

	/* Descend to the deepest mount whose mountpoint is a prefix of ours */
	parent = root;
	while (1) {
		list_for_each_entry(t, &parent->children, siblings) {
			if (strstartswith(service_mountpoint(mi), service_mountpoint(t))) {
				parent = t;
				break;
			}
		}
		if (&t->siblings == &parent->children)
			break;
	}
	mi->mnt_bind_is_populated = true;
	mi->is_overmounted = false;
	mi->nsid = parent->nsid;
	mi->parent = parent;
	mi->parent_mnt_id = parent->mnt_id;
	list_add(&mi->siblings, &parent->children);
	pr_info("Add cr-time mountpoint %s with parent %s(%u)\n", service_mountpoint(mi), service_mountpoint(parent),
		parent->mnt_id);
	return mi;
err:
	mnt_entry_free(mi);
	return NULL;
}

/*
 * Mount @source of @type at @target inside @ns's mount namespace and report
 * the resulting device number through @s_dev.
 *
 * Returns:
 *	 0 - success
 *	-1 - error
 *	 1 - skip
 */
static __maybe_unused int mount_cr_time_mount(struct ns_id *ns, unsigned int *s_dev, const char *source,
					      const char *target, const char *type)
{
	int mnt_fd, cwd_fd, exit_code = -1;
	struct stat st;

	if (switch_mnt_ns(ns->ns_pid, &mnt_fd, &cwd_fd)) {
		pr_err("Can't switch mnt_ns\n");
		return -1;
	}

	if (mount(source, target, type, 0, NULL)) {
		switch (errno) {
		/* These errnos are treated as "can't mount here" rather than fatal */
		case EPERM:
		case EBUSY:
		case ENODEV:
		case ENOENT:
			pr_debug("Skipping %s as was unable to mount it: %s\n", type, strerror(errno));
			exit_code = 1;
			break;
		default:
			pr_perror("Unable to mount %s %s %s", type, source, target);
		}
		goto restore_ns;
	}

	if (stat(target, &st)) {
		pr_perror("Can't stat %s", target);
		goto restore_ns;
	}

	*s_dev = MKKDEV(major(st.st_dev), minor(st.st_dev));

	exit_code = 0;
restore_ns:
	if (restore_mnt_ns(mnt_fd, &cwd_fd))
		exit_code = -1;
	return exit_code;
}

/*
 * Dump the file system content for @mi via its fstype->dump callback,
 * using any fsroot-mounted member of the mnt_bind group as the access path.
 * Marks the whole bind group as dumped on success.
 */
static int dump_one_fs(struct mount_info *mi)
{
	struct mount_info *pm = mi;
	struct mount_info *t;
	bool first = true;

	if (mnt_is_root_bind(mi) || mi->need_plugin || mnt_is_external_bind(mi) || !mi->fstype->dump)
		return 0;

	/* mnt_bind is a cycled list, so list_for_each can't be used here.
		 */
		me.ext_key = pm->external;
	me.root = pm->root;

	if (pb_write_one(img, &me, PB_MNT))
		return -1;

	return 0;
}

/* Free a NULL-terminated ->next chain of mount_info entries. */
static void free_mntinfo(struct mount_info *pms)
{
	while (pms) {
		struct mount_info *pm;

		pm = pms->next;
		mnt_entry_free(pms);
		pms = pm;
	}
}

/*
 * Parse /proc mountinfo of @ns and build its mount tree.
 * On success the list/tree are stored in ns->mnt and the list head returned.
 */
struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump)
{
	struct mount_info *pm;

	pm = parse_mountinfo(ns->ns_pid, ns, for_dump);
	if (!pm) {
		pr_err("Can't parse %d's mountinfo\n", ns->ns_pid);
		return NULL;
	}

	ns->mnt.mntinfo_tree = mnt_build_tree(pm);
	if (ns->mnt.mntinfo_tree == NULL)
		goto err;

	ns->mnt.mntinfo_list = pm;
	return pm;
err:
	free_mntinfo(pm);
	return NULL;
}

/* Write all mounts belonging to @ns (a prefix of the @pms chain) to the image. */
static int dump_mnt_ns(struct ns_id *ns, struct mount_info *pms)
{
	struct mount_info *pm;
	int ret = -1;
	struct cr_img *img;
	unsigned int ns_id = ns->id;

	pr_info("Dumping mountpoints\n");
	img = open_image(CR_FD_MNTS, O_DUMP, ns_id);
	if (!img)
		goto err;

	for (pm = pms; pm && pm->nsid == ns; pm = pm->next)
		if (dump_one_mountpoint(pm, img))
			goto err_i;

	ret = 0;
err_i:
	close_image(img);
err:
	return ret;
}

/*
 * _fn_f - pre-order traversal function
 * _fn_r - post-order traversal function
 * _plist - a postpone list. _el is added to this list, if _fn_f returns
 *	    a positive value, and all lower elements are not enumerated.
 */
#define MNT_TREE_WALK(_r, _el, _fn_f, _fn_r, _plist, _prgs)                                       \
	do {                                                                                      \
		struct mount_info *_mi = _r;                                                      \
                                                                                                  \
		while (1) {                                                                       \
			int ret;                                                                  \
                                                                                                  \
			list_del_init(&_mi->postpone);                                            \
                                                                                                  \
			ret = _fn_f(_mi);                                                         \
			if (ret < 0)                                                              \
				return -1;                                                        \
			else if (ret > 0) {                                                       \
				list_add_tail(&_mi->postpone, _plist);                            \
				goto up;                                                          \
			}                                                                         \
                                                                                                  \
			_prgs++;                                                                  \
                                                                                                  \
			if (!list_empty(&_mi->children)) {                                        \
				_mi = list_entry(_mi->children._el, struct mount_info, siblings); \
				continue;                                                         \
			}                                                                         \
	up:                                                                                       \
			if (_fn_r(_mi))                                                           \
				return -1;                                                        \
			if (_mi == _r)                                                            \
				break;                                                            \
			if (_mi->siblings._el == &_mi->parent->children) {                        \
				_mi = _mi->parent;                                                \
				goto up;                                                          \
			}                                                                         \
			_mi = list_entry(_mi->siblings._el, struct mount_info, siblings);         \
		}                                                                                 \
	} while (0)

/* Placeholder callback: "0 &&" short-circuits the call away at compile time */
#define MNT_WALK_NONE 0 &&

/*
 * Repeatedly walk the tree under @start applying @fn pre-order; mounts for
 * which @fn returns >0 are postponed and retried until a full pass makes
 * no progress (then it's a fatal ordering error).
 */
int mnt_tree_for_each(struct mount_info *start, int (*fn)(struct mount_info *))
{
	struct mount_info *tmp;
	LIST_HEAD(postpone);
	LIST_HEAD(postpone2);
	int progress;

	pr_debug("Start with %d:%s\n", start->mnt_id, start->ns_mountpoint);
	list_add(&start->postpone, &postpone);

again:
	progress = 0;

	list_for_each_entry_safe(start, tmp, &postpone, postpone)
		MNT_TREE_WALK(start, next, fn, MNT_WALK_NONE, &postpone2, progress);

	if (!progress) {
		struct mount_info *m;

		pr_err("A few mount points can't be mounted\n");
		list_for_each_entry(m, &postpone2, postpone) {
			pr_err("%d:%d %s %s %s\n", m->mnt_id, m->parent_mnt_id, m->root, m->ns_mountpoint, m->source);
		}
		return -1;
	}

	list_splice_init(&postpone2, &postpone);

	if (!list_empty(&postpone))
		goto again;

	return 0;
}

/* Post-order (children before parents) traversal applying @fn. */
static int mnt_tree_for_each_reverse(struct mount_info *m, int (*fn)(struct mount_info *))
{
	int progress = 0;

	MNT_TREE_WALK(m, prev, MNT_WALK_NONE, fn, (struct list_head *)NULL, progress);
	(void)progress; // Suppress -Wused-but-unset-variable for clang>=15

	return 0;
}

/*
 * Resolve the mount source for restore: the saved source string, an external
 * mapping, or NULL if no matching device can be found.
 */
char *resolve_source(struct mount_info *mi)
{
	if (kdev_major(mi->s_dev) == 0)
		/*
		 * Anonymous block device. Kernel creates them for
		 * diskless mounts.
		 */
		return mi->source;

	/*
	 * FSTYPE__AUTO check is a fallback for old images which do not have
	 * explicit EXTERNAL_DEV_MOUNT mark, but still have "dev[key]" in source.
	 */
	if (mnt_is_dev_external(mi) || mi->fstype->code == FSTYPE__AUTO) {
		struct stat st;
		char *val;

		val = external_lookup_by_key(mi->source);
		if (!IS_ERR_OR_NULL(val))
			return val;

		/* Accept the saved source if it's a block device with matching major:minor */
		if (!stat(mi->source, &st) && S_ISBLK(st.st_mode) && major(st.st_rdev) == kdev_major(mi->s_dev) &&
		    minor(st.st_rdev) == kdev_minor(mi->s_dev))
			return mi->source;
	}

	pr_err("No device for %s(%d) mount\n", mi->ns_mountpoint, mi->mnt_id);
	return NULL;
}

/*
 * Apply sharing options (MS_PRIVATE / MS_SLAVE / MS_SHARED / MS_UNBINDABLE)
 * to @mi's mountpoint. Unbindable mounts which others may still bind from
 * are left private for now and queued on delayed_unbindable.
 */
static int restore_shared_options(struct mount_info *mi, bool private, bool shared, bool slave)
{
	pr_debug("%d:%s private %d shared %d slave %d\n", mi->mnt_id, service_mountpoint(mi), private, shared, slave);

	if (mi->flags & MS_UNBINDABLE) {
		if (shared || slave) {
			pr_warn("%s has both unbindable and sharing, ignoring unbindable\n", service_mountpoint(mi));
		} else {
			if (!mnt_is_overmounted(mi)) {
				/* Someone may still want to bind from us, let them do it. */
				pr_debug("Temporary leave unbindable mount %s as private\n", service_mountpoint(mi));
				if (mount(NULL, service_mountpoint(mi), NULL, MS_PRIVATE, NULL)) {
					pr_perror("Unable to make %d private", mi->mnt_id);
					return -1;
				}
				list_add(&mi->mnt_unbindable, &delayed_unbindable);
				return 0;
			}
			if (mount(NULL, service_mountpoint(mi), NULL, MS_UNBINDABLE, NULL)) {
				pr_perror("Unable to make %d unbindable", mi->mnt_id);
				return -1;
			}
			return 0;
		}
	}

	if (private && mount(NULL, service_mountpoint(mi), NULL, MS_PRIVATE, NULL)) {
		pr_perror("Unable to make %d private", mi->mnt_id);
		return -1;
	}
	if (slave && mount(NULL, service_mountpoint(mi), NULL, MS_SLAVE, NULL)) {
		pr_perror("Unable to make %d slave", mi->mnt_id);
		return -1;
	}
	if (shared && mount(NULL, service_mountpoint(mi), NULL, MS_SHARED, NULL)) {
		pr_perror("Unable to make %d shared", mi->mnt_id);
		return -1;
	}
	return 0;
}

/*
 * Umount points, which are propagated in slave parents, because
 * we can't be sure, that they were inherited in a real life.
 */
static int umount_from_slaves(struct mount_info *mi)
{
	struct mount_info *t;
	char *mpath, buf[PATH_MAX];

	BUG_ON(mi->parent == root_yard_mp);

	list_for_each_entry(t, &mi->parent->mnt_slave_list, mnt_slave) {
		if (!t->mounted)
			continue;

		mpath = mnt_get_sibling_path(mi, t, buf, sizeof(buf));
		if (mpath == NULL)
			continue;

		pr_debug("\t\tUmount slave %s\n", mpath);
		if (umount(mpath) == -1) {
			pr_perror("Can't umount slave %s", mpath);
			return -1;
		}
	}

	return 0;
}

/*
 * If something is mounted in one shared point, it will be spread in
 * all other points from this shared group.
 *
 * Look at Documentation/filesystems/sharedsubtree.txt for more details
 */
static int propagate_siblings(struct mount_info *mi)
{
	struct mount_info *t;

	/*
	 * Find all mounts, which must be bind-mounted from this one
	 * to inherit shared group or master id
	 */
	list_for_each_entry(t, &mi->mnt_share, mnt_share) {
		if (t->mounted)
			continue;
		if (t->bind && t->bind->shared_id == t->shared_id)
			continue;
		pr_debug("\t\tBind share %s(%d)\n", t->ns_mountpoint, t->mnt_id);
		t->bind = mi;
		t->s_dev_rt = mi->s_dev_rt;
	}

	list_for_each_entry(t, &mi->mnt_slave_list, mnt_slave) {
		if (t->mounted || t->bind)
			continue;
		pr_debug("\t\tBind slave %s(%d)\n", t->ns_mountpoint, t->mnt_id);
		t->bind = mi;
		t->s_dev_rt = mi->s_dev_rt;
	}

	list_for_each_entry(t, &mi->mnt_ext_slave, mnt_ext_slave) {
		if (t->mounted || t->bind)
			continue;
		pr_debug("\t\tBind ext-slave %s(%d)\n", t->ns_mountpoint, t->mnt_id);
		t->bind = mi;
		t->s_dev_rt = mi->s_dev_rt;
	}

	return 0;
}

/*
 * Account for kernel-side mount propagation after @mi has been mounted:
 * mark mounts in its propagation group as mounted and pick bind sources
 * for peers that still lack one.
 */
static int propagate_mount(struct mount_info *mi)
{
	struct mount_info *p;

	propagate_siblings(mi);

	if (!mi->parent || mi->parent == root_yard_mp)
		goto skip_parent;

	umount_from_slaves(mi);

	/* Mark mounts in propagation group mounted */
	list_for_each_entry(p, &mi->mnt_propagate, mnt_propagate) {
		/* Should not propagate the same mount twice */
		BUG_ON(p->mounted);
		pr_debug("\t\tPropagate %s(%d)\n", p->ns_mountpoint, p->mnt_id);

		/*
		 * When a mount is propagated, the result mount
		 * is always shared. If we want to get a private
		 * mount, we need to convert it.
		 */
		restore_shared_options(p, !p->shared_id, 0, 0);
		p->mounted = true;
		propagate_siblings(p);
		umount_from_slaves(p);
	}

skip_parent:
	/*
	 * FIXME Currently non-root mounts can be restored
	 * only if a proper root mount exists
	 */
	if (fsroot_mounted(mi) || mi->parent == root_yard_mp || mi->external) {
		struct mount_info *t;

		list_for_each_entry(t, &mi->mnt_bind, mnt_bind) {
			if (t->mounted)
				continue;
			if (t->bind)
				continue;
			if (t->master_id)
				continue;
			if (!issubpath(t->root, mi->root))
				continue;
			pr_debug("\t\tBind private %s(%d)\n", t->ns_mountpoint, t->mnt_id);
			t->bind = mi;
			t->s_dev_rt = mi->s_dev_rt;
		}
	}

	return 0;
}

/* Record the runtime device number of the filesystem mounted at @where into @m. */
int fetch_rt_stat(struct mount_info *m, const char *where)
{
	struct stat st;

	if (stat(where, &st)) {
		pr_perror("Can't stat on %s", where);
		return -1;
	}

	m->s_dev_rt = MKKDEV(major(st.st_dev), minor(st.st_dev));
	return 0;
}

/* Default mount callback: plain mount(2) with the saved options. */
int do_simple_mount(struct mount_info *mi, const char *src, const char *fstype, unsigned long mountflags)
{
	int ret = mount(src, service_mountpoint(mi), fstype, mountflags, mi->options);
	if (ret)
		pr_perror("Unable to mount %s %s (id=%d)", src, service_mountpoint(mi), mi->mnt_id);
	return ret;
}

/* Filesystem name to pass to mount(2): real name for FSTYPE__AUTO, else the type name. */
char *mnt_fsname(struct mount_info *mi)
{
	if (mi->fstype->code == FSTYPE__AUTO)
		return mi->fsname;
	return mi->fstype->name;
}

/*
 * Perform a mount(2) against /proc/self/fd/@fd from inside @pid's mount
 * namespace (flags are passed through @args). Used via userns_call().
 */
static int userns_mount(char *src, void *args, int fd, pid_t pid)
{
	unsigned long flags = *(unsigned long *)args;
	int rst = -1, err = -1;
	char target[PSFDS];

	snprintf(target, sizeof(target), "/proc/self/fd/%d", fd);

	if (pid != getpid() && switch_ns(pid, &mnt_ns_desc, &rst))
		return -1;

	err = mount(src, target, NULL, flags, NULL);
	if (err)
		pr_perror("Unable to mount %s", target);

	if (rst >= 0 && restore_ns(rst, &mnt_ns_desc))
		return -1;

	return err;
}

/* userns_call() helper: remount (no source) with the given sb flags. */
int apply_sb_flags(void *args, int fd, pid_t pid)
{
	return userns_mount(NULL, args, fd, pid);
}

/* userns_call() helper: bind-mount opts.root onto the passed fd. */
int mount_root(void *args, int fd, pid_t pid)
{
	return userns_mount(opts.root, args, fd, pid);
}

/* Create a brand-new (non-bind) mount for @mi and apply its flags and sharing. */
static int do_new_mount(struct mount_info *mi)
{
	unsigned long sflags = mi->sb_flags;
	unsigned
	long mflags = mi->flags & (~MS_PROPAGATE);
	char *src;
	struct fstype *tp = mi->fstype;
	/* RDONLY superblocks with a restore callback are mounted rw first, then remounted ro */
	bool remount_ro = (tp->restore && mi->sb_flags & MS_RDONLY);
	mount_fn_t do_mount = (tp->mount) ? tp->mount : do_simple_mount;

	src = resolve_source(mi);
	if (!src)
		return -1;

	/* Merge superblock and mount flags if it's possible */
	if (!(mflags & ~MS_MNT_KNOWN_FLAGS) && !((sflags ^ mflags) & MS_RDONLY)) {
		sflags |= mflags;
		mflags = 0;
	}

	if (remount_ro)
		sflags &= ~MS_RDONLY;

	if (do_mount(mi, src, mnt_fsname(mi), sflags) < 0) {
		pr_perror("Can't mount at %s", service_mountpoint(mi));
		return -1;
	}

	if (tp->restore && tp->restore(mi))
		return -1;

	if (remount_ro) {
		int fd;

		fd = open(service_mountpoint(mi), O_PATH);
		if (fd < 0) {
			pr_perror("Unable to open %s", service_mountpoint(mi));
			return -1;
		}
		sflags |= MS_RDONLY | MS_REMOUNT;
		if (userns_call(apply_sb_flags, 0, &sflags, sizeof(sflags), fd)) {
			pr_err("Unable to apply mount flags %d for %s\n", mi->sb_flags, service_mountpoint(mi));
			close(fd);
			return -1;
		}
		close(fd);
	}

	if (mflags && mount(NULL, service_mountpoint(mi), NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) {
		pr_perror("Unable to apply bind-mount options");
		return -1;
	}

	/*
	 * A slave should be mounted from do_bind_mount().
	 * Look at can_mount_now() for details.
	 */
	BUG_ON(mi->master_id);
	if (restore_shared_options(mi, !mi->shared_id, mi->shared_id, 0))
		return -1;

	mi->mounted = true;

	return 0;
}

/* Restore an external bind mount through the RESTORE_EXT_MOUNT plugin hook. */
int restore_ext_mount(struct mount_info *mi)
{
	int ret;

	pr_debug("Restoring external bind mount %s\n", service_mountpoint(mi));
	ret = run_plugins(RESTORE_EXT_MOUNT, mi->mnt_id, service_mountpoint(mi), "/", NULL);
	if (ret)
		pr_err("Can't restore ext mount (%d)\n", ret);
	return ret;
}

static char mnt_clean_path[] = "/tmp/cr-tmpfs.XXXXXX";

static int mount_clean_path(void)
{
	/*
	 * To make a bind mount, we need to have access to a source directory,
	 * which can be over-mounted. The idea is to mount a source mount in
	 * an intermediate place without MS_REC and then create a target mounts.
	 * This intermediate place should be a private mount to not affect
	 * properties of the source mount.
	 */
	if (mkdtemp(mnt_clean_path) == NULL) {
		pr_perror("Unable to create a temporary directory");
		return -1;
	}

	if (mount(mnt_clean_path, mnt_clean_path, NULL, MS_BIND, NULL)) {
		pr_perror("Unable to mount tmpfs into %s", mnt_clean_path);
		return -1;
	}

	if (mount(NULL, mnt_clean_path, NULL, MS_PRIVATE, NULL)) {
		pr_perror("Unable to mark %s as private", mnt_clean_path);
		return -1;
	}

	return 0;
}

/* Tear down and remove the intermediate clean-path mount. */
static int umount_clean_path(void)
{
	if (umount2(mnt_clean_path, MNT_DETACH)) {
		pr_perror("Unable to umount %s", mnt_clean_path);
		return -1;
	}

	if (rmdir(mnt_clean_path)) {
		pr_perror("Unable to remove %s", mnt_clean_path);
	}

	return 0;
}

/*
 * Restore @mi as a bind mount from its ->bind source (or via plugin /
 * external mapping), handling overmounted sources and deleted roots.
 */
static int do_bind_mount(struct mount_info *mi)
{
	char mnt_fd_path[PSFDS];
	char *root, *cut_root, rpath[PATH_MAX];
	unsigned long mflags;
	int exit_code = -1, mp_len;
	bool shared = false;
	bool master = false;
	bool priv = false;
	char *mnt_path = NULL;
	struct stat st;
	bool umount_mnt_path = false;
	struct mount_info *c;

	if (mi->need_plugin) {
		if (restore_ext_mount(mi))
			return -1;
		goto out;
	}

	if (mnt_is_nodev_external(mi)) {
		/*
		 * We have / pointing to criu's ns root still,
		 * so just use the mapping's path. The mountpoint
		 * is tuned in collect_mnt_from_image to refer
		 * to proper location in the namespace we restore.
		 */
		root = mi->external;
		priv = !mi->master_id && (mi->internal_sharing || !mi->shared_id);
		goto do_bind;
	}

	shared = mi->shared_id && mi->shared_id == mi->bind->shared_id;
	master = mi->master_id && mi->master_id == mi->bind->master_id;
	priv = !mi->master_id && !shared;

	cut_root = cut_root_for_bind(mi->root, mi->bind->root);

	/* Mount private can be initialized on mount() callback, which is
	 * called only once.
	 * It has to be copied to all its sibling structures to provide users
	 * of it with actual data.
	 */
	mi->private = mi->bind->private;

	mnt_path = service_mountpoint(mi->bind);

	/* Access a mount by fd if service_mountpoint(mi->bind) is overmounted */
	if (mi->bind->fd >= 0) {
		snprintf(mnt_fd_path, sizeof(mnt_fd_path), "/proc/self/fd/%d", mi->bind->fd);
		mnt_path = mnt_fd_path;
	}

	if (cut_root[0] == 0) /* This case is handled by mi->bind->fd */
		goto skip_overmount_check;

	/*
	 * The target path may be over-mounted by one of child mounts
	 * and we need to create a new bind-mount to get access to the path.
	 */
	mp_len = strlen(service_mountpoint(mi->bind));
	if (mp_len > 1) /* skip a joining / if service_mountpoint(mi->bind) isn't "/" */
		mp_len++;

	list_for_each_entry(c, &mi->bind->children, siblings) {
		if (!c->mounted)
			continue;
		if (issubpath(cut_root, service_mountpoint(c) + mp_len))
			break; /* a source path is overmounted */
	}

	if (&c->siblings != &mi->bind->children) {
		/* Get a copy of mi->bind without child mounts */
		if (mount(mnt_path, mnt_clean_path, NULL, MS_BIND, NULL)) {
			pr_perror("Unable to bind-mount %s to %s", mnt_path, mnt_clean_path);
			return -1;
		}
		mnt_path = mnt_clean_path;
		umount_mnt_path = true;
	}

	if (mnt_path == NULL)
		return -1;

skip_overmount_check:
	snprintf(rpath, sizeof(rpath), "%s/%s", mnt_path, cut_root);
	root = rpath;
do_bind:
	pr_info("\tBind %s to %s\n", root, service_mountpoint(mi));

	if (unlikely(mi->deleted)) {
		/*
		 * The bind source was deleted on dump; re-create a stub
		 * of the matching type, bind from it, then remove it again below.
		 */
		if (stat(service_mountpoint(mi), &st)) {
			pr_perror("Can't fetch stat on %s", service_mountpoint(mi));
			goto err;
		}

		if (S_ISDIR(st.st_mode)) {
			if (mkdir(root, (st.st_mode & ~S_IFMT))) {
				pr_perror("Can't re-create deleted directory %s", root);
				goto err;
			}
		} else if (S_ISREG(st.st_mode)) {
			int fd = open(root, O_WRONLY | O_CREAT | O_EXCL, st.st_mode & ~S_IFMT);
			if (fd < 0) {
				pr_perror("Can't re-create deleted file %s", root);
				goto err;
			}
			close(fd);
		} else {
			pr_err("Unsupported st_mode 0%o deleted root %s\n", (int)st.st_mode, root);
			goto err;
		}
	}

	if (mount(root, service_mountpoint(mi), NULL, MS_BIND | (mi->flags & MS_REC), NULL) < 0) {
		pr_perror("Can't bind-mount at %s", service_mountpoint(mi));
		goto err;
	}

	mflags = mi->flags & (~MS_PROPAGATE);
	if (!mi->bind || mflags != (mi->bind->flags & (~MS_PROPAGATE)))
		if (mount(NULL, service_mountpoint(mi), NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) {
			pr_perror("Can't re-mount at %s", service_mountpoint(mi));
			goto err;
		}

	if (unlikely(mi->deleted)) {
		if (S_ISDIR(st.st_mode)) {
			if (rmdir(root)) {
				pr_perror("Can't remove deleted directory %s", root);
				goto err;
			}
		} else if (S_ISREG(st.st_mode)) {
			if (unlink(root)) {
				pr_perror("Can't unlink deleted file %s", root);
				goto err;
			}
		}
	}
out:
	/*
	 * shared - the mount is in the same shared group with mi->bind
	 * mi->shared_id && !shared - create a new shared group
	 */
	if (restore_shared_options(mi, priv, mi->shared_id && !shared, mi->master_id && !master))
		goto err;

	mi->mounted = true;
	exit_code = 0;
err:
	if (umount_mnt_path) {
		/*
		 * If mnt_path was shared, a new mount may be propagated
		 * into it.
		 */
		if (mount(NULL, mnt_path, NULL, MS_PRIVATE, NULL)) {
			pr_perror("Unable to make %s private", mnt_path);
			return -1;
		}

		/* Detach the temporary clean-path copy */
		if (umount2(mnt_path, MNT_DETACH)) {
			pr_perror("Unable to umount %s", mnt_path);
			return -1;
		}
	}
	return exit_code;
}

/*
 * Decide whether @mi can be mounted at this point of the restore order,
 * honouring external binds, slave/master relations and propagation groups.
 */
static bool can_mount_now(struct mount_info *mi)
{
	struct mount_info *ext;

	if (rst_mnt_is_root(mi)) {
		pr_debug("%s: true as %d is mntns root\n", __func__, mi->mnt_id);
		return true;
	}

	/* Parent should be mounted already, that's how mnt_tree_for_each works */
	BUG_ON(mi->parent && !mi->parent->mounted);

	if (mnt_is_nodev_external(mi))
		goto shared;

	if (!mi->bind && !mi->external && (ext = mnt_get_external_bind(mi)) && !has_mounted_external_bind(mi)) {
		pr_debug("%s: false as %d's external %d is not mounted\n", __func__, mi->mnt_id, ext->mnt_id);
		return false;
	}

	/*
	 * We're the slave peer:
	 *   - Make sure the master peer is already mounted
	 *   - Make sure all children of master's share are
	 *     mounted as well to eliminate mounts duplications
	 */
	if (mi->mnt_master) {
		struct mount_info *c, *s;

		if (mi->bind == NULL)
{ pr_debug("%s: false as %d is slave with unmounted master %d\n", __func__, mi->mnt_id, mi->mnt_master->mnt_id); return false; } list_for_each_entry(c, &mi->mnt_master->children, siblings) { if (!c->mounted) { pr_debug("%s: false as %d is slave with unmounted master's children %d\n", __func__, mi->mnt_id, c->mnt_id); return false; } } list_for_each_entry(s, &mi->mnt_master->mnt_share, mnt_share) { list_for_each_entry(c, &s->children, siblings) { if (!c->mounted) { pr_debug("%s: false as %d is slave with unmounted children of master's share\n", __func__, mi->mnt_id); return false; } } } } if (!fsroot_mounted(mi) && (mi->bind == NULL && !mi->need_plugin)) { pr_debug("%s: false as %d is non-root without bind or plugin\n", __func__, mi->mnt_id); return false; } shared: /* Mount only after all parents of our propagation group mounted */ if (!list_empty(&mi->mnt_propagate)) { struct mount_info *p; list_for_each_entry(p, &mi->mnt_propagate, mnt_propagate) { BUG_ON(!p->parent); if (!p->parent->mounted) { pr_debug("%s: false as %d has unmounted parent %d of its propagation group\n", __func__, mi->mnt_id, p->parent->mnt_id); return false; } } } /* * Mount only after all children of share, which shouldn't * (but can if wrong order) propagate to us, are mounted */ if (mi->shared_id) { struct mount_info *s, *c, *p, *t; LIST_HEAD(mi_notprop); bool can = true; /* Add all children of the shared group */ list_for_each_entry(s, &mi->mnt_share, mnt_share) { list_for_each_entry(c, &s->children, siblings) { char root_path[PATH_MAX]; int ret; ret = root_path_from_parent(c, root_path, PATH_MAX); BUG_ON(ret); /* Mount is out of our root */ if (!issubpath(root_path, mi->root)) continue; list_add(&c->mnt_notprop, &mi_notprop); } } /* Delete all members of our children's propagation groups */ list_for_each_entry(c, &mi->children, siblings) { list_for_each_entry(p, &c->mnt_propagate, mnt_propagate) { list_del_init(&p->mnt_notprop); } } /* Delete all members of our propagation group */ 
list_for_each_entry(p, &mi->mnt_propagate, mnt_propagate) { list_del_init(&p->mnt_notprop); } /* Delete self */ list_del_init(&mi->mnt_notprop); /* Check not propagated mounts mounted and cleanup list */ list_for_each_entry_safe(p, t, &mi_notprop, mnt_notprop) { if (!p->mounted) { pr_debug("%s: false as %d has unmounted 'anti'-propagation mount %d\n", __func__, mi->mnt_id, p->mnt_id); can = false; } list_del_init(&p->mnt_notprop); } if (!can) return false; } return true; } static int do_mount_root(struct mount_info *mi) { if (restore_shared_options(mi, !mi->shared_id && !mi->master_id, mi->shared_id, mi->master_id)) return -1; return fetch_rt_stat(mi, service_mountpoint(mi)); } static int do_close_one(struct mount_info *mi) { close_safe(&mi->fd); return 0; } static int set_unbindable(struct mount_info *mi) { if (mount(NULL, service_mountpoint(mi), NULL, MS_UNBINDABLE, NULL)) { pr_perror("Failed setting unbindable flag on %d", mi->mnt_id); return -1; } return 0; } static int do_mount_one(struct mount_info *mi) { int ret; if (mi->mounted) return 0; if (!can_mount_now(mi)) { pr_debug("Postpone mount %s(%d)\n", mi->ns_mountpoint, mi->mnt_id); return 1; } if ((mi->parent && mi->parent != root_yard_mp) && !strcmp(mi->parent->ns_mountpoint, mi->ns_mountpoint)) { mi->parent->fd = open(service_mountpoint(mi->parent), O_PATH); if (mi->parent->fd < 0) { pr_perror("Unable to open %s", service_mountpoint(mi)); return -1; } } pr_debug("\tMounting %s %d@%s (%d)\n", mi->fstype->name, mi->mnt_id, service_mountpoint(mi), mi->need_plugin); if (rst_mnt_is_root(mi)) { int fd; unsigned long flags = MS_BIND | MS_REC; if (opts.root == NULL) { pr_err("The --root option is required to restore a mount namespace\n"); return -1; } /* do_mount_root() is called from populate_mnt_ns() */ if (root_ns_mask & CLONE_NEWUSER) { fd = open(service_mountpoint(mi), O_PATH); if (fd < 0) { pr_perror("Unable to open %s", service_mountpoint(mi)); return -1; } if (userns_call(mount_root, 0, &flags, 
sizeof(flags), fd)) { pr_err("Unable to mount %s\n", service_mountpoint(mi)); close(fd); return -1; } close(fd); } else { if (mount(opts.root, service_mountpoint(mi), NULL, flags, NULL)) { pr_perror("Unable to mount %s %s (id=%d)", opts.root, service_mountpoint(mi), mi->mnt_id); return -1; } } if (do_mount_root(mi)) return -1; mi->mounted = true; ret = 0; } else if (!mi->bind && !mi->need_plugin && !mnt_is_nodev_external(mi)) { ret = do_new_mount(mi); } else { ret = do_bind_mount(mi); } if (ret == 0 && fetch_rt_stat(mi, service_mountpoint(mi))) return -1; if (ret == 0 && propagate_mount(mi)) return -1; if (mi->fstype->code == FSTYPE__UNSUPPORTED) { struct statfs st; if (statfs(service_mountpoint(mi), &st)) { pr_perror("Unable to statfs %s", service_mountpoint(mi)); return -1; } if (st.f_type == BTRFS_SUPER_MAGIC) mi->fstype = find_fstype_by_name("btrfs"); } return ret; } static int do_umount_one(struct mount_info *mi) { if (!mi->parent) return 0; if (mount("none", service_mountpoint(mi->parent), "none", MS_REC | MS_PRIVATE, NULL)) { pr_perror("Can't mark %s as private", service_mountpoint(mi->parent)); return -1; } if (umount(service_mountpoint(mi))) { pr_perror("Can't umount at %s", service_mountpoint(mi)); return -1; } pr_info("Umounted at %s\n", service_mountpoint(mi)); return 0; } /* * If a mount overmounts other mounts, it is restored separately in the roots * yard and then moved to the right place. * * mnt_remap_entry is created for each such mount and it's added into * mnt_remap_list. The origin mount point is replaced on a new one in * roots_yard where it will be restored. The remapped mount will be * moved to the right places after restoring all mounts. 
 */
static LIST_HEAD(mnt_remap_list);
static int remap_id;

struct mnt_remap_entry {
	struct mount_info *mi; /* child is remapped into the root yards */
	struct mount_info *parent; /* the origin parent for the child*/
	struct list_head node;
};

/* Tree-walk callback: rewrite @m's mountpoint prefix to point into the roots yard. */
static int do_remap_mount(struct mount_info *m)
{
	int len;

	/* A path in root_yard has a fixed size, so it can be replaced. */
	len = print_ns_root(m->nsid, remap_id, m->mountpoint, PATH_MAX);
	m->mountpoint[len] = '/';

	return 0;
}

/* Tree-walk callback: queue @m on mnt_remap_list if it must be restored remapped. */
static int try_remap_mount(struct mount_info *m)
{
	struct mnt_remap_entry *r;

	if (!mnt_needs_remap(m))
		return 0;

	BUG_ON(!m->parent);

	r = xmalloc(sizeof(struct mnt_remap_entry));
	if (!r)
		return -1;

	r->mi = m;
	list_add_tail(&r->node, &mnt_remap_list);

	return 0;
}

/* Collect mounts needing remap and re-parent them under root_yard_mp. */
static int find_remap_mounts(struct mount_info *root)
{
	struct mnt_remap_entry *r;
	struct mount_info *m;

	/*
	 * It's impossible to change a tree without interrupting
	 * enumeration, so on the first step mounts are added
	 * into mnt_remap_list and then they are connected to root_yard_mp.
	 */
	if (mnt_tree_for_each(root, try_remap_mount))
		return -1;

	/* Move remapped mounts to root_yard */
	list_for_each_entry(r, &mnt_remap_list, node) {
		m = r->mi;
		r->parent = m->parent;
		m->parent = root_yard_mp;
		list_del(&m->siblings);
		list_add(&m->siblings, &root_yard_mp->children);

		remap_id++;
		mnt_tree_for_each(m, do_remap_mount);
		pr_debug("Restore the %d mount in %s\n", m->mnt_id, m->mountpoint);
	}

	return 0;
}

/* Move remapped mounts to places where they have to be */
static int fixup_remap_mounts(void)
{
	struct mnt_remap_entry *r;

	list_for_each_entry(r, &mnt_remap_list, node) {
		struct mount_info *m = r->mi;
		char path[PATH_MAX];
		int len;

		strncpy(path, m->mountpoint, PATH_MAX - 1);
		path[PATH_MAX - 1] = 0;
		/* Rebuild the real destination path (remap id 0 = the origin root) */
		len = print_ns_root(m->nsid, 0, path, PATH_MAX);
		path[len] = '/';

		pr_debug("Move mount %s -> %s\n", m->mountpoint, path);
		if (mount(m->mountpoint, path, NULL, MS_MOVE, NULL)) {
			pr_perror("Unable to move mount %s -> %s", m->mountpoint, path);
			return -1;
		}

		/* Insert child back to its place in the tree */
		list_del(&r->mi->siblings);
		list_add(&r->mi->siblings, &r->parent->children);
		r->mi->parent = r->parent;
	}

	return 0;
}

/*
 * pivot_root(2) into @root (or the cwd if @root is NULL), parking the old
 * root on a temporary self-bind-mounted directory and detaching it after.
 */
int cr_pivot_root(char *root)
{
	char tmp_dir_tmpl[] = "crtools-put-root.XXXXXX";
	bool tmp_dir = false;
	char *put_root = "tmp";
	int exit_code = -1;
	struct stat st;

	pr_info("Move the root to %s\n", root ?: ".");

	if (root) {
		if (chdir(root)) {
			pr_perror("chdir(%s) failed", root);
			return -1;
		}
	}

	/* Use ./tmp as put_root if present, else make a temporary directory */
	if (stat(put_root, &st) || !S_ISDIR(st.st_mode)) {
		put_root = mkdtemp(tmp_dir_tmpl);
		if (put_root == NULL) {
			pr_perror("Can't create a temporary directory");
			return -1;
		}
		tmp_dir = true;
	}

	/* put_root must be a mount point for pivot_root(), so self-bind it */
	if (mount(put_root, put_root, NULL, MS_BIND, NULL)) {
		pr_perror("Unable to mount tmpfs in %s", put_root);
		goto err_root;
	}

	if (mount(NULL, put_root, NULL, MS_PRIVATE, NULL)) {
		pr_perror("Can't remount %s with MS_PRIVATE", put_root);
		goto err_tmpfs;
	}

	if (pivot_root(".", put_root)) {
		pr_perror("pivot_root(., %s) failed", put_root);
		goto err_tmpfs;
	}

	if (mount("none", put_root,
"none", MS_REC | MS_SLAVE, NULL)) { pr_perror("Can't remount root with MS_PRIVATE"); return -1; } exit_code = 0; if (umount2(put_root, MNT_DETACH)) { pr_perror("Can't umount %s", put_root); return -1; } err_tmpfs: if (umount2(put_root, MNT_DETACH)) { pr_perror("Can't umount %s", put_root); return -1; } err_root: if (tmp_dir && rmdir(put_root)) { pr_perror("Can't remove the directory %s", put_root); return -1; } return exit_code; } struct mount_info *mnt_entry_alloc(bool rst) { struct mount_info *new; /* * We rely on xzalloc here for MOUNT_INVALID_DEV. */ BUILD_BUG_ON(MOUNT_INVALID_DEV); new = xzalloc(sizeof(struct mount_info)); if (new) { if (rst) { new->rmi = shmalloc(sizeof(struct rst_mount_info)); if (!new->rmi) { xfree(new); return NULL; } memset(new->rmi, 0, sizeof(struct rst_mount_info)); } new->mp_fd_id = -1; new->mnt_fd_id = -1; new->is_dir = -1; new->fd = -1; new->is_overmounted = -1; INIT_LIST_HEAD(&new->children); INIT_LIST_HEAD(&new->siblings); INIT_LIST_HEAD(&new->mnt_slave_list); INIT_LIST_HEAD(&new->mnt_ext_slave); INIT_LIST_HEAD(&new->mnt_share); INIT_LIST_HEAD(&new->mnt_bind); INIT_LIST_HEAD(&new->mnt_propagate); INIT_LIST_HEAD(&new->mnt_notprop); INIT_LIST_HEAD(&new->mnt_unbindable); INIT_LIST_HEAD(&new->postpone); INIT_LIST_HEAD(&new->deleted_list); } return new; } void mnt_entry_free(struct mount_info *mi) { if (mi) { xfree(mi->root); xfree(mi->mountpoint); xfree(mi->plain_mountpoint); xfree(mi->source); xfree(mi->options); xfree(mi->fsname); xfree(mi); } } /* * Helper for getting a path to where the namespace's root * is re-constructed. 
*/ int print_ns_root(struct ns_id *ns, int remap_id, char *buf, int bs) { return snprintf(buf, bs, "%s/%d-%010d", mnt_roots, ns->id, remap_id); } static int create_mnt_roots(void) { int exit_code = -1; if (mnt_roots) return 0; mnt_roots = xstrdup("/tmp/.criu.mntns.XXXXXX"); if (mnt_roots == NULL) goto out; if (mkdtemp(mnt_roots) == NULL) { pr_perror("Unable to create a temporary directory"); mnt_roots = NULL; goto out; } chmod(mnt_roots, 0777); exit_code = 0; out: return exit_code; } static int get_mp_root(MntEntry *me, struct mount_info *mi) { char *ext = NULL; BUG_ON(me->ext_mount && me->ext_key); /* Forward compatibility fixup */ if (me->ext_mount) { me->ext_key = me->root; /* * Putting the id of external mount which is provided by user, * to ->root can confuse mnt_is_external_bind and other functions * which expect to see the path in the file system to the root * of these mount (mounts_equal, mnt_build_ids_tree, * find_fsroot_mount_for, find_best_external_match, etc.) */ me->root = NO_ROOT_MOUNT; } mi->root = xstrdup(me->root); if (!mi->root) return -1; if (!me->ext_key) goto out; /* * External mount point -- get the reverse mapping * from the command line and put into root's place */ if (!strcmp(me->ext_key, AUTODETECTED_MOUNT)) { if (!opts.autodetect_ext_mounts) { pr_err("Mount %d:%s is autodetected external mount. " "Try \"--ext-mount-map auto\" to allow them.\n", mi->mnt_id, mi->ns_mountpoint); return -1; } /* * Make up an external mount entry for this * mount point, since we couldn't find a user * supplied one. * * The 'val' was put into mi->source during * dump by resolve_external_mounts(). */ ext = mi->source; } else if (!strcmp(me->ext_key, EXTERNAL_DEV_MOUNT)) { ext = EXTERNAL_DEV_MOUNT; } else { ext = ext_mount_lookup(me->ext_key); if (!ext) { pr_err("No mapping for %d:%s mountpoint\n", mi->mnt_id, mi->ns_mountpoint); return -1; } } mi->external = ext; out: pr_debug("\t\tWill mount %d from %s%s\n", mi->mnt_id, ext ?: mi->root, ext ? 
" (E)" : ""); return 0; } static int get_mp_mountpoint(char *mountpoint, struct mount_info *mi, char *root, int root_len) { int len; len = strlen(mountpoint) + root_len + 1; mi->mountpoint = xmalloc(len); if (!mi->mountpoint) return -1; /* * For bind-mounts we would also fix the root here * too, but bind-mounts restore merges mountpoint * and root paths together, so there's no need in * that. */ strcpy(mi->mountpoint, root); strcpy(mi->mountpoint + root_len, mountpoint); mi->ns_mountpoint = mi->mountpoint + root_len; mi->plain_mountpoint = get_plain_mountpoint(mi->mnt_id, NULL); if (!mi->plain_mountpoint) return -1; pr_debug("\t\tWill mount %d @ %s %s\n", mi->mnt_id, service_mountpoint(mi), mi->ns_mountpoint); return 0; } static char *mount_update_lsm_context(char *mount_opts) { cleanup_free char *before_context = NULL; char *other_options; char *context_start; char *context_end; char *old_context; char *new_options; int ret; old_context = strstr(mount_opts, CONTEXT_OPT); if (!old_context || !opts.lsm_mount_context) return xstrdup(mount_opts); /* * If the user specified a different mount_context we need * to replace the existing mount context in the mount * options with the one specified by the user. * * The original mount options will be something like: * * context="system_u:object_r:container_file_t:s0:c82,c137",inode64 * * and it needs to be replaced with opts.lsm_mount_context. * * The content between 'context=' and ',inode64' will be replaced * with opts.lsm_mount_context in quotes. */ /* Skip 'context=' */ context_start = old_context + strlen(CONTEXT_OPT); if (context_start[0] == '"' && context_start + 1 < mount_opts + strlen(mount_opts)) { /* Skip quotes */ context_end = strchr(context_start + 1, '"'); if (!context_end) { pr_err("Failed parsing mount option 'context'\n"); return NULL; } } else { context_end = context_start; } /* Find next after optionally skipping quotes. 
*/ other_options = strchr(context_end, ','); before_context = xstrdup(mount_opts); if (unlikely(!before_context)) return NULL; before_context[context_start - mount_opts] = 0; ret = asprintf(&new_options, "%s\"%s\"%s", before_context, opts.lsm_mount_context, other_options ? other_options : ""); if (unlikely(ret < 0)) return NULL; pr_debug("\t\tChanged mount 'context=' to %s\n", new_options); return new_options; } static int collect_mnt_from_image(struct mount_info **head, struct mount_info **tail, struct ns_id *nsid) { MntEntry *me = NULL; int ret, root_len = 1; struct cr_img *img; char root[PATH_MAX] = "."; img = open_image(CR_FD_MNTS, O_RSTR, nsid->id); if (!img) return -1; root_len = print_ns_root(nsid, 0, root, sizeof(root)); pr_debug("Reading mountpoint images (id %d pid %d)\n", nsid->id, (int)nsid->ns_pid); while (1) { struct mount_info *pm; ret = pb_read_one_eof(img, &me, PB_MNT); if (ret <= 0) break; pm = mnt_entry_alloc(true); if (!pm) goto err; pm->nsid = nsid; mntinfo_add_list_before(head, pm); if (!*tail) *tail = pm; pm->mnt_id = me->mnt_id; pm->parent_mnt_id = me->parent_mnt_id; pm->s_dev = me->root_dev; pm->flags = me->flags; pm->sb_flags = me->sb_flags; if (!me->has_sb_flags) { const unsigned int mflags = MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE | MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_NOATIME | MS_NODIRATIME | MS_RELATIME; /* * In old images mnt and sb flags are saved together. * Here we separate them and save the old logic about MS_RDONLY. 
*/ pm->sb_flags = pm->flags & ~mflags; pm->flags = pm->flags & mflags; } pm->shared_id = me->shared_id; pm->master_id = me->master_id; pm->need_plugin = me->with_plugin; pm->deleted = me->deleted; pm->is_ns_root = is_root(me->mountpoint); if (me->has_internal_sharing) pm->internal_sharing = me->internal_sharing; pm->source = xstrdup(me->source); if (!pm->source) goto err; pm->options = mount_update_lsm_context(me->options); if (unlikely(!pm->options)) goto err; if (me->fstype != FSTYPE__AUTO && me->fsname) { pr_err("fsname can be set only for FSTYPE__AUTO mounts\n"); goto err; } /* FIXME: abort unsupported early */ pm->fstype = decode_fstype(me->fstype); if (pm->fstype->collect && (pm->fstype->collect(pm) < 0)) goto err; if (me->fsname) { pm->fsname = xstrdup(me->fsname); if (!pm->fsname) goto err; } if (get_mp_root(me, pm)) goto err; if (get_mp_mountpoint(me->mountpoint, pm, root, root_len)) goto err; pr_debug("\t" "Read %d mp @ %s\n", pm->mnt_id, pm->ns_mountpoint); } if (me) mnt_entry__free_unpacked(me, NULL); close_image(img); return 0; err: close_image(img); return -1; } static int merge_mount_trees(void) { struct ns_id *nsid; root_yard_mp = mnt_entry_alloc(true); if (!root_yard_mp) return -1; root_yard_mp->mountpoint = mnt_roots; root_yard_mp->plain_mountpoint = xstrdup(mnt_roots); if (!root_yard_mp->plain_mountpoint) return -1; root_yard_mp->is_dir = true; root_yard_mp->mounted = true; root_yard_mp->mnt_bind_is_populated = true; root_yard_mp->is_overmounted = false; root_yard_mp->mnt_id = HELPER_MNT_ID; /* Merge mount trees together under root_yard_mp */ for (nsid = ns_ids; nsid; nsid = nsid->next) { struct mount_info *root; if (nsid->nd != &mnt_ns_desc) continue; root = nsid->mnt.mntinfo_tree; pr_debug("Mountpoint %d (@%s) moved to the root yard\n", root->mnt_id, root->ns_mountpoint); root->parent = root_yard_mp; list_add(&root->siblings, &root_yard_mp->children); } return 0; } int read_mnt_ns_img(void) { struct mount_info *pms = NULL; struct ns_id *nsid; 
if (!(root_ns_mask & CLONE_NEWNS)) { mntinfo = NULL; return 0; } for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { struct mount_info *head = NULL, *tail = NULL; if (nsid->nd != &mnt_ns_desc) continue; if (collect_mnt_from_image(&head, &tail, nsid)) return -1; nsid->mnt.mntinfo_tree = mnt_build_tree(head); if (!nsid->mnt.mntinfo_tree) return -1; /* mntns root mounts are always directories */ nsid->mnt.mntinfo_tree->is_dir = true; tail->next = pms; pms = head; } mntinfo = pms; search_bindmounts(); prepare_is_overmounted(); if (!opts.mntns_compat_mode && resolve_shared_mounts_v2()) return -1; if (merge_mount_trees()) return -1; return 0; } int rst_get_mnt_root(int mnt_id, char *path, int plen) { struct mount_info *m; if (!(root_ns_mask & CLONE_NEWNS) || mnt_id == -1) goto rroot; m = lookup_mnt_id(mnt_id); if (m == NULL) return -1; return print_ns_root(m->nsid, 0, path, plen); rroot: path[0] = '/'; path[1] = '\0'; return 1; } int mntns_maybe_create_roots(void) { if (!(root_ns_mask & CLONE_NEWNS)) return 0; return create_mnt_roots(); } static int do_restore_task_mnt_ns(struct ns_id *nsid) { int fd; fd = fdstore_get(nsid->mnt.nsfd_id); if (fd < 0) return -1; if (setns(fd, CLONE_NEWNS)) { pr_perror("Can't restore mntns"); close(fd); return -1; } close(fd); return 0; } int restore_task_mnt_ns(struct pstree_item *current) { if ((root_ns_mask & CLONE_NEWNS) == 0) return 0; if (current->ids && current->ids->has_mnt_ns_id) { struct pstree_item *parent = current->parent; unsigned int id = current->ids->mnt_ns_id; struct ns_id *nsid; /* Zombies and helpers can have ids == 0 so we skip them */ while (parent && !parent->ids) parent = parent->parent; /** * Our parent had restored the mount namespace before forking * us and if we have the same mntns we just stay there. 
*/ if (parent && id == parent->ids->mnt_ns_id) return 0; nsid = lookup_ns_by_id(id, &mnt_ns_desc); if (nsid == NULL) { pr_err("Can't find mount namespace %d\n", id); return -1; } BUG_ON(nsid->type == NS_CRIU); if (do_restore_task_mnt_ns(nsid)) return -1; } return 0; } void fini_restore_mntns(void) { struct ns_id *nsid; if (!(root_ns_mask & CLONE_NEWNS)) return; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { if (nsid->nd != &mnt_ns_desc) continue; nsid->ns_populated = true; } } /* * All nested mount namespaces are restore as sub-trees of the root namespace. */ static int populate_roots_yard(struct mount_info *cr_time) { struct mnt_remap_entry *r; char path[PATH_MAX]; struct ns_id *nsid; if (make_yard(mnt_roots)) return -1; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { if (nsid->nd != &mnt_ns_desc) continue; print_ns_root(nsid, 0, path, sizeof(path)); if (mkdir(path, 0600)) { pr_perror("Unable to create %s", path); return -1; } } /* * mnt_remap_list is filled in find_remap_mounts() and * contains mounts which has to be restored separately */ list_for_each_entry(r, &mnt_remap_list, node) { if (mkdirpat(AT_FDCWD, service_mountpoint(r->mi), 0755)) { pr_perror("Unable to create %s", service_mountpoint(r->mi)); return -1; } } if (cr_time && mkdirpat(AT_FDCWD, service_mountpoint(cr_time), 0755)) { pr_perror("Unable to create %s", service_mountpoint(cr_time)); return -1; } return 0; } static int populate_mnt_ns(void) { struct mount_info *cr_time = NULL; int ret; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED if (!opts.has_binfmt_misc && !list_empty(&binfmt_misc_list)) { /* Add to mount tree. 
Generic code will mount it later */ cr_time = add_cr_time_mount(root_yard_mp, "binfmt_misc", "binfmt_misc", 0, true); if (!cr_time) return -1; } #endif if (resolve_shared_mounts(mntinfo)) return -1; if (validate_mounts(mntinfo, false)) return -1; if (find_remap_mounts(root_yard_mp)) return -1; if (populate_roots_yard(cr_time)) return -1; if (mount_clean_path()) return -1; ret = mnt_tree_for_each(root_yard_mp, do_mount_one); mnt_tree_for_each(root_yard_mp, do_close_one); if (ret == 0) { struct mount_info *mi; /* * Mounts in delayed_unbindable list were temporary mounted as * private instead of unbindable so that do_mount_one can bind * from them, now we are ready to fix it. */ list_for_each_entry(mi, &delayed_unbindable, mnt_unbindable) if (set_unbindable(mi)) return -1; } if (ret == 0 && fixup_remap_mounts()) return -1; if (umount_clean_path()) return -1; return ret; } static int __depopulate_roots_yard(void) { int ret = 0; if (mnt_roots == NULL) return 0; if (mount("none", mnt_roots, "none", MS_REC | MS_PRIVATE, NULL)) { pr_perror("Can't remount root with MS_PRIVATE"); ret = 1; } /* * Don't exit after a first error, because this function * can be used to rollback in a error case. * Don't worry about MNT_DETACH, because files are restored after this * and nobody will not be restored from a wrong mount namespace. 
*/ if (umount2(mnt_roots, MNT_DETACH)) { pr_perror("Can't unmount %s", mnt_roots); ret = -1; } if (rmdir(mnt_roots)) { pr_perror("Can't remove the directory %s", mnt_roots); ret = -1; } return ret; } int depopulate_roots_yard(int mntns_fd, bool only_ghosts) { int ret = 0, old_cwd = -1, old_ns = -1; if (mntns_fd < 0) { ret |= try_clean_remaps(only_ghosts); cleanup_mnt_ns(); return ret; } pr_info("Switching to new ns to clean ghosts\n"); old_cwd = open(".", O_PATH); if (old_cwd < 0) { pr_perror("Unable to open cwd"); return -1; } old_ns = open_proc(PROC_SELF, "ns/mnt"); if (old_ns < 0) { pr_perror("`- Can't keep old ns"); close(old_cwd); return -1; } if (setns(mntns_fd, CLONE_NEWNS) < 0) { pr_perror("`- Can't switch"); close(old_ns); close(old_cwd); return -1; } if (try_clean_remaps(only_ghosts)) ret = -1; if (__depopulate_roots_yard()) ret = -1; if (setns(old_ns, CLONE_NEWNS) < 0) { pr_perror("Fail to switch back!"); ret = -1; } close(old_ns); if (fchdir(old_cwd)) { pr_perror("Unable to restore cwd"); ret = -1; } close(old_cwd); return ret; } void cleanup_mnt_ns(void) { if (mnt_roots == NULL) return; if (rmdir(mnt_roots)) pr_perror("Can't remove the directory %s", mnt_roots); } int prepare_mnt_ns(void) { int ret = -1, rst = -1, fd; struct ns_id ns = { .type = NS_CRIU, .ns_pid = PROC_SELF, .nd = &mnt_ns_desc }; struct ns_id *nsid; if (!(root_ns_mask & CLONE_NEWNS)) return 0; pr_info("Restoring mount namespace\n"); if (!opts.root) { struct mount_info *old; if (chdir("/")) { pr_perror("chdir(\"/\") failed"); return -1; } old = collect_mntinfo(&ns, false); if (old == NULL) return -1; /* * The new mount namespace is filled with the mountpoint * clones from the original one. We have to umount them * prior to recreating new ones. 
*/ pr_info("Cleaning mount namespace\n"); if (mnt_tree_for_each_reverse(ns.mnt.mntinfo_tree, do_umount_one)) { free_mntinfo(old); return -1; } free_mntinfo(old); } if (!opts.mntns_compat_mode) return prepare_mnt_ns_v2(); ret = populate_mnt_ns(); if (ret) return -1; rst = open_proc(PROC_SELF, "ns/mnt"); if (rst < 0) return -1; /* restore non-root namespaces */ for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { char path[PATH_MAX]; if (nsid->nd != &mnt_ns_desc) continue; /* Create the new mount namespace */ if (unshare(CLONE_NEWNS)) { pr_perror("Unable to create a new mntns"); goto err; } fd = open_proc(PROC_SELF, "ns/mnt"); if (fd < 0) goto err; if (nsid->type == NS_ROOT) { /* * We need to create a mount namespace which will be * used to clean up remap files * (depopulate_roots_yard). The namespace where mounts * was restored has to be restored as a root mount * namespace, because there are file descriptors * linked with it (e.g. to bind-mount slave pty-s). */ if (setns(rst, CLONE_NEWNS)) { pr_perror("Can't restore mntns back"); goto err; } SWAP(rst, fd); } /* Pin one with a file descriptor */ nsid->mnt.nsfd_id = fdstore_add(fd); close(fd); if (nsid->mnt.nsfd_id < 0) { pr_err("Can't add ns fd\n"); goto err; } /* Set its root */ print_ns_root(nsid, 0, path, sizeof(path) - 1); if (cr_pivot_root(path)) goto err; /* root fd is used to restore file mappings */ fd = open_proc(PROC_SELF, "root"); if (fd < 0) goto err; nsid->mnt.root_fd_id = fdstore_add(fd); if (nsid->mnt.root_fd_id < 0) { pr_err("Can't add root fd\n"); close(fd); goto err; } close(fd); /* And return back to regain the access to the roots yard */ if (setns(rst, CLONE_NEWNS)) { pr_perror("Can't restore mntns back"); goto err; } } close(rst); return ret; err: if (rst >= 0) /* coverity[check_return] */ restore_ns(rst, &mnt_ns_desc); return -1; } static int mntns_root_pid = -1; static int mntns_set_root_fd(pid_t pid, int fd) { int ret; ret = install_service_fd(ROOT_FD_OFF, fd); if (ret >= 0) mntns_root_pid 
= pid; return ret; } int __mntns_get_root_fd(pid_t pid) { int fd, pfd; int ret; char path[PATH_MAX + 1]; if (mntns_root_pid == pid) /* The required root is already opened */ return get_service_fd(ROOT_FD_OFF); if (!(root_ns_mask & CLONE_NEWNS)) { /* * If criu and tasks we dump live in the same mount * namespace, we can just open the root directory. * All paths resolution would occur relative to criu's * root. Even if it is not namespace's root, provided * file paths are resolved, we'd get consistent dump. */ fd = open("/", O_RDONLY | O_DIRECTORY); if (fd < 0) { pr_perror("Can't open root"); return -1; } goto set_root; } /* * If /proc/pid/root links on '/', it signs that a root of the task * and a root of mntns is the same. */ pfd = open_pid_proc(pid); ret = readlinkat(pfd, "root", path, sizeof(path) - 1); if (ret < 0) { close_pid_proc(); return ret; } path[ret] = '\0'; if (ret != 1 || path[0] != '/') { pr_err("The root task has another root than mntns: %s\n", path); close_pid_proc(); return -1; } fd = openat(pfd, "root", O_RDONLY | O_DIRECTORY, 0); if (fd < 0) { pr_perror("Can't open the task root"); return -1; } set_root: return mntns_set_root_fd(pid, fd); } int mntns_get_root_fd(struct ns_id *mntns) { if (!(root_ns_mask & CLONE_NEWNS)) return __mntns_get_root_fd(0); if (!mntns) return -1; /* * All namespaces are restored from the root task and during the * CR_STATE_FORKING stage the root task has two file descriptors for * each mntns. One is associated with a namespace and another one is a * root of this mntns. * * When a non-root task is forked, it enters into a proper mount * namespace, restores private mappings and forks children. Some of * these mappings can be associated with files from other namespaces. * * After the CR_STATE_FORKING stage the root task has to close all * mntns file descriptors to restore its descriptors and at this moment * we know that all tasks live in their mount namespaces. 
* * If we find that a mount namespace isn't populated, we can get its * root from the root task. */ if (!mntns->ns_populated) { int fd; fd = fdstore_get(mntns->mnt.root_fd_id); if (fd < 0) return -1; return mntns_set_root_fd(mntns->ns_pid, fd); } return __mntns_get_root_fd(mntns->ns_pid); } struct ns_id *lookup_nsid_by_mnt_id(int mnt_id) { struct mount_info *mi; /* * Kernel before 3.15 doesn't show mnt_id for file descriptors. * mnt_id isn't saved for files, if mntns isn't dumped. * In both these cases we have only one root, so here * is not matter which mount will be restored. */ if (mnt_id == -1) mi = mntinfo; else mi = lookup_mnt_id(mnt_id); return mi ? mi->nsid : NULL; } int mntns_get_root_by_mnt_id(int mnt_id) { struct ns_id *mntns = NULL; if (root_ns_mask & CLONE_NEWNS) { mntns = lookup_nsid_by_mnt_id(mnt_id); BUG_ON(mntns == NULL); } return mntns_get_root_fd(mntns); } struct collect_mntns_arg { bool need_to_validate; bool for_dump; }; static int collect_mntns(struct ns_id *ns, void *__arg) { struct collect_mntns_arg *arg = __arg; struct mount_info *pms; pms = collect_mntinfo(ns, arg->for_dump); if (!pms) return -1; if (arg->for_dump && ns->type != NS_CRIU) arg->need_to_validate = true; mntinfo_add_list(pms); return 0; } int collect_mnt_namespaces(bool for_dump) { struct collect_mntns_arg arg; int ret; arg.for_dump = for_dump; arg.need_to_validate = false; ret = walk_namespaces(&mnt_ns_desc, collect_mntns, &arg); if (ret) goto err; search_bindmounts(); #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED if (for_dump && !opts.has_binfmt_misc) { unsigned int s_dev = 0; struct ns_id *ns; for (ns = ns_ids; ns != NULL; ns = ns->next) { if (ns->type == NS_ROOT && ns->nd == &mnt_ns_desc) break; } if (ns) { ret = mount_cr_time_mount(ns, &s_dev, "binfmt_misc", "/" BINFMT_MISC_HOME, "binfmt_misc"); if (ret == -1) { goto err; } else if (ret == 0 && !add_cr_time_mount(ns->mnt.mntinfo_tree, "binfmt_misc", BINFMT_MISC_HOME, s_dev, false)) { ret = -1; goto err; } } } #endif ret = 
resolve_external_mounts(mntinfo); if (ret) goto err; if (arg.need_to_validate) { ret = -1; if (resolve_shared_mounts(mntinfo)) goto err; if (validate_mounts(mntinfo, true)) goto err; } ret = 0; err: return ret; } int dump_mnt_namespaces(void) { struct ns_id *nsid; if (!(root_ns_mask & CLONE_NEWNS)) return 0; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { if (nsid->nd != &mnt_ns_desc || nsid->type == NS_CRIU) continue; if ((nsid->type == NS_OTHER) && check_mnt_id()) { pr_err("Nested mount namespaces are not supported " "without mnt_id in fdinfo\n"); return -1; } if (dump_mnt_ns(nsid, nsid->mnt.mntinfo_list)) return -1; } return 0; } void clean_cr_time_mounts(void) { struct mount_info *mi; int ns_old, ret; for (mi = mntinfo; mi; mi = mi->next) { int cwd_fd; if (mi->mnt_id != HELPER_MNT_ID) continue; ret = switch_mnt_ns(mi->nsid->ns_pid, &ns_old, &cwd_fd); if (ret) { pr_err("Can't switch to pid's %u mnt_ns\n", mi->nsid->ns_pid); continue; } if (umount(mi->ns_mountpoint) < 0) pr_perror("Can't umount forced mount %s", mi->ns_mountpoint); if (restore_mnt_ns(ns_old, &cwd_fd)) { pr_err("cleanup_forced_mounts exiting with wrong mnt_ns\n"); return; } } } struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt"); static int call_helper_process(int (*call)(void *), void *arg) { int pid, status, exit_code = -1; /* * Running new helper process on the restore must be * done under last_pid mutex: other tasks may be restoring * threads and the PID we need there might be occupied by * this clone() call. 
*/ lock_last_pid(); pid = clone_noasan(call, CLONE_VFORK | CLONE_VM | CLONE_FILES | CLONE_IO | CLONE_SIGHAND | CLONE_SYSVSEM, arg); if (pid == -1) { pr_perror("Can't clone helper process"); goto out; } errno = 0; if (waitpid(pid, &status, __WALL) != pid) { pr_perror("Unable to wait %d", pid); goto out; } if (status) { pr_err("Bad child exit status: %d\n", status); goto out; } exit_code = 0; out: unlock_last_pid(); return exit_code; } static int ns_remount_writable(void *arg) { struct mount_info *mi = (struct mount_info *)arg; struct ns_id *ns = mi->nsid; if (do_restore_task_mnt_ns(ns)) return 1; pr_debug("Switched to mntns %u:%u\n", ns->id, ns->kid); if (mount(NULL, mi->ns_mountpoint, NULL, MS_REMOUNT | MS_BIND | (mi->flags & ~(MS_PROPAGATE | MS_RDONLY)), NULL) == -1) { pr_perror("Failed to remount %d:%s writable", mi->mnt_id, mi->ns_mountpoint); return 1; } return 0; } int try_remount_writable(struct mount_info *mi, bool ns) { int remounted = REMOUNTED_RW; /* Don't remount if we are in host mntns to be on the safe side */ if (!(root_ns_mask & CLONE_NEWNS)) return 0; if (!ns) remounted = REMOUNTED_RW_SERVICE; /* All mounts in mntinfo list should have it on restore */ BUG_ON(mi->rmi == NULL); if (mi->flags & MS_RDONLY && !(mi->rmi->remounted_rw & remounted)) { if (mnt_is_overmounted(mi)) { pr_err("The mount %d is overmounted so paths are invisible\n", mi->mnt_id); return -1; } /* There should be no ghost files on mounts with ro sb */ if (mi->sb_flags & MS_RDONLY) { pr_err("The mount %d has readonly sb\n", mi->mnt_id); return -1; } pr_info("Remount %d:%s writable\n", mi->mnt_id, service_mountpoint(mi)); if (!ns) { if (mount(NULL, service_mountpoint(mi), NULL, MS_REMOUNT | MS_BIND | (mi->flags & ~(MS_PROPAGATE | MS_RDONLY)), NULL) == -1) { pr_perror("Failed to remount %d:%s writable", mi->mnt_id, service_mountpoint(mi)); return -1; } } else { if (call_helper_process(ns_remount_writable, mi)) return -1; } mi->rmi->remounted_rw |= remounted; } return 0; } static int 
__remount_readonly_mounts(struct ns_id *ns) { struct mount_info *mi; bool mntns_set = false; for (mi = mntinfo; mi; mi = mi->next) { if (ns && mi->nsid != ns) continue; if (!(mi->rmi->remounted_rw & REMOUNTED_RW)) continue; /* * Lets enter the mount namespace lazily, only if we've found the * mount which should be remounted readonly. These saves us * from entering mntns if we have no mounts to remount in it. */ if (ns && !mntns_set) { if (do_restore_task_mnt_ns(ns)) return -1; mntns_set = true; pr_debug("Switched to mntns %u:%u\n", ns->id, ns->kid); } pr_info("Remount %d:%s back to readonly\n", mi->mnt_id, mi->ns_mountpoint); if (mount(NULL, mi->ns_mountpoint, NULL, MS_REMOUNT | MS_BIND | (mi->flags & ~MS_PROPAGATE), NULL)) { pr_perror("Failed to restore %d:%s mount flags %x", mi->mnt_id, mi->ns_mountpoint, mi->flags); return -1; } } return 0; } static int ns_remount_readonly_mounts(void *arg) { struct ns_id *nsid; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { if (nsid->nd != &mnt_ns_desc) continue; if (__remount_readonly_mounts(nsid)) return 1; } return 0; } int remount_readonly_mounts(void) { /* * Need a helper process because the root task can share fs via * CLONE_FS and we would not be able to enter mount namespaces */ return call_helper_process(ns_remount_readonly_mounts, NULL); } static struct mount_info *mnt_subtree_next(struct mount_info *mi, struct mount_info *root) { if (!list_empty(&mi->children)) return list_entry(mi->children.next, struct mount_info, siblings); while (mi->parent && mi != root) { if (mi->siblings.next == &mi->parent->children) mi = mi->parent; else return list_entry(mi->siblings.next, struct mount_info, siblings); } return NULL; } crac-criu-1.5.0/criu/namespaces.c000066400000000000000000001153401471504326700166270ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "page.h" #include "rst-malloc.h" #include 
"cr_options.h" #include "imgset.h" #include "uts_ns.h" #include "ipc_ns.h" #include "timens.h" #include "mount.h" #include "pstree.h" #include "namespaces.h" #include "net.h" #include "cgroup.h" #include "fdstore.h" #include "kerndat.h" #include "util-caps.h" #include "protobuf.h" #include "util.h" #include "images/ns.pb-c.h" #include "images/userns.pb-c.h" #include "images/pidns.pb-c.h" static struct ns_desc *ns_desc_array[] = { &net_ns_desc, &uts_ns_desc, &ipc_ns_desc, &pid_ns_desc, &user_ns_desc, &mnt_ns_desc, &time_ns_desc, &cgroup_ns_desc, }; static unsigned int join_ns_flags; static int collect_pid_namespaces(bool); int check_namespace_opts(void) { errno = EINVAL; if (join_ns_flags & opts.empty_ns) { pr_err("Conflicting flags: --join-ns and --empty-ns\n"); return -1; } if (join_ns_flags & CLONE_NEWUSER) pr_warn("join-ns with user-namespace is not fully tested and dangerous\n"); errno = 0; return 0; } static int check_int_str(char *str) { char *endptr; long val; if (str == NULL) return 0; if (*str == '\0') { str = NULL; return 0; } errno = EINVAL; val = strtol(str, &endptr, 10); if ((errno == ERANGE) || (endptr == str) || (*endptr != '\0') || (val < 0) || (val > 65535)) { str = NULL; return -1; } errno = 0; return 0; } static int check_ns_file(char *ns_file) { int pid, ret, proc_dir; if (!check_int_str(ns_file)) { pid = atoi(ns_file); if (pid <= 0) { pr_err("Invalid join_ns pid %s\n", ns_file); return -1; } proc_dir = open_pid_proc(pid); if (proc_dir < 0) { pr_err("Invalid join_ns pid: /proc/%s not found\n", ns_file); return -1; } return 0; } ret = access(ns_file, 0); if (ret < 0) { pr_perror("Can't access join-ns file %s", ns_file); return -1; } return 0; } static int set_user_extra_opts(struct join_ns *jn, char *extra_opts) { char *uid, *gid, *aux; if (extra_opts == NULL) { jn->extra_opts.user_extra.uid = NULL; jn->extra_opts.user_extra.gid = NULL; return 0; } uid = extra_opts; aux = strchr(extra_opts, ','); if (aux == NULL) { gid = NULL; } else { *aux = 
'\0'; gid = aux + 1; } if (check_int_str(uid) || check_int_str(gid)) return -1; jn->extra_opts.user_extra.uid = uid; jn->extra_opts.user_extra.gid = gid; return 0; } int join_ns_add(const char *type, char *ns_file, char *extra_opts) { struct join_ns *jn; if (check_ns_file(ns_file)) return -1; jn = xmalloc(sizeof(*jn)); if (!jn) return -1; jn->ns_file = xstrdup(ns_file); if (!jn->ns_file) { xfree(jn); return -1; } if (!strncmp(type, "net", 4)) { jn->nd = &net_ns_desc; join_ns_flags |= CLONE_NEWNET; } else if (!strncmp(type, "uts", 4)) { jn->nd = &uts_ns_desc; join_ns_flags |= CLONE_NEWUTS; } else if (!strncmp(type, "time", 5)) { jn->nd = &time_ns_desc; join_ns_flags |= CLONE_NEWTIME; } else if (!strncmp(type, "ipc", 4)) { jn->nd = &ipc_ns_desc; join_ns_flags |= CLONE_NEWIPC; } else if (!strncmp(type, "pid", 4)) { pr_err("join-ns pid namespace not supported\n"); goto err; } else if (!strncmp(type, "user", 5)) { jn->nd = &user_ns_desc; if (set_user_extra_opts(jn, extra_opts)) { pr_err("invalid user namespace extra_opts %s\n", extra_opts); goto err; } join_ns_flags |= CLONE_NEWUSER; } else if (!strncmp(type, "mnt", 4)) { jn->nd = &mnt_ns_desc; join_ns_flags |= CLONE_NEWNS; } else { pr_err("invalid namespace type %s\n", type); goto err; } list_add_tail(&jn->list, &opts.join_ns); pr_info("Added %s:%s join namespace\n", type, ns_file); return 0; err: xfree(jn->ns_file); xfree(jn); return -1; } static unsigned int parse_ns_link(char *link, size_t len, struct ns_desc *d) { unsigned long kid = 0; char *end; if (len >= d->len + 2) { if (link[d->len] == ':' && !memcmp(link, d->str, d->len)) { kid = strtoul(&link[d->len + 2], &end, 10); if (end && *end == ']') BUG_ON(kid > UINT_MAX); else kid = 0; } } return (unsigned int)kid; } bool check_ns_proc(struct fd_link *link) { unsigned int i, kid; for (i = 0; i < ARRAY_SIZE(ns_desc_array); i++) { kid = parse_ns_link(link->name + 1, link->len - 1, ns_desc_array[i]); if (!kid) continue; link->ns_d = ns_desc_array[i]; link->ns_kid = 
kid; return true; } return false; } int switch_ns(int pid, struct ns_desc *nd, int *rst) { int nsfd; int ret; nsfd = open_proc(pid, "ns/%s", nd->str); if (nsfd < 0) return -1; ret = switch_ns_by_fd(nsfd, nd, rst); close(nsfd); return ret; } int switch_ns_by_fd(int nsfd, struct ns_desc *nd, int *rst) { int ret = -1, old_ns = -1; if (rst) { old_ns = open_proc(PROC_SELF, "ns/%s", nd->str); if (old_ns < 0) goto err_ns; } ret = setns(nsfd, nd->cflag); if (ret < 0) { pr_perror("Can't setns %d/%s", nsfd, nd->str); goto err_set; } if (rst) *rst = old_ns; return 0; err_set: close_safe(&old_ns); err_ns: return -1; } int restore_ns(int rst, struct ns_desc *nd) { int ret; ret = setns(rst, nd->cflag); if (ret < 0) pr_perror("Can't restore ns back"); close(rst); return ret; } int switch_mnt_ns(int pid, int *rst, int *cwd_fd) { int fd; if (!cwd_fd) return switch_ns(pid, &mnt_ns_desc, rst); fd = open(".", O_PATH); if (fd < 0) { pr_perror("unable to open current directory"); return -1; } if (switch_ns(pid, &mnt_ns_desc, rst)) { close(fd); return -1; } *cwd_fd = fd; return 0; } int restore_mnt_ns(int rst, int *cwd_fd) { int exit_code = -1; if (restore_ns(rst, &mnt_ns_desc)) goto err_restore; if (cwd_fd && fchdir(*cwd_fd)) { pr_perror("Unable to restore current directory"); goto err_restore; } exit_code = 0; err_restore: if (cwd_fd) close_safe(cwd_fd); return exit_code; } struct ns_id *ns_ids = NULL; static unsigned int ns_next_id = 1; unsigned long root_ns_mask = 0; static void nsid_add(struct ns_id *ns, struct ns_desc *nd, unsigned int id, pid_t pid) { ns->nd = nd; ns->id = id; ns->ns_pid = pid; ns->next = ns_ids; ns_ids = ns; pr_info("Add %s ns %d pid %d\n", nd->str, ns->id, ns->ns_pid); } static struct ns_id *rst_new_ns_id(unsigned int id, pid_t pid, struct ns_desc *nd, enum ns_type type) { struct ns_id *nsid; nsid = shmalloc(sizeof(*nsid)); if (nsid) { nsid->type = type; nsid_add(nsid, nd, id, pid); nsid->ns_populated = false; if (nd == &net_ns_desc) { 
INIT_LIST_HEAD(&nsid->net.ids); INIT_LIST_HEAD(&nsid->net.links); nsid->net.netns = NULL; } } return nsid; } int rst_add_ns_id(unsigned int id, struct pstree_item *i, struct ns_desc *nd) { pid_t pid = vpid(i); struct ns_id *nsid; nsid = lookup_ns_by_id(id, nd); if (nsid) { if (pid_rst_prio(pid, nsid->ns_pid)) nsid->ns_pid = pid; return 0; } nsid = rst_new_ns_id(id, pid, nd, i == root_item ? NS_ROOT : NS_OTHER); if (nsid == NULL) return -1; return 0; } struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd) { struct ns_id *nsid; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) if (nsid->kid == kid && nsid->nd->cflag == nd->cflag) return nsid; return NULL; } struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd) { struct ns_id *nsid; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) if (nsid->id == id && nsid->nd == nd) return nsid; return NULL; } /* * For all namespaces we support, there are two supported * tasks-to-namespaces layout. * * If root task lives in the same namespace as criu does * all other tasks should live in it too and we do NOT dump * this namespace. On restore tasks inherit the respective * namespace from criu. * * If root task lives in its own namespace, then all other * tasks may live in it. Sometimes (CLONE_SUBNS) there can * be more than one namespace of that type. For this case * we dump all namespace's info and recreate them on restore. 
*/ int walk_namespaces(struct ns_desc *nd, int (*cb)(struct ns_id *, void *), void *oarg) { int ret = 0; struct ns_id *ns; for (ns = ns_ids; ns != NULL; ns = ns->next) { if (ns->nd != nd) continue; if (ns->type == NS_CRIU) { if (root_ns_mask & nd->cflag) continue; ret = cb(ns, oarg); break; } ret = cb(ns, oarg); if (ret) break; } return ret; } static unsigned int generate_ns_id(int pid, unsigned int kid, struct ns_desc *nd, struct ns_id **ns_ret) { struct ns_id *nsid; enum ns_type type; nsid = lookup_ns_by_kid(kid, nd); if (nsid) goto found; if (pid != getpid()) { type = NS_OTHER; if (pid == root_item->pid->real) { BUG_ON(root_ns_mask & nd->cflag); pr_info("Will take %s namespace in the image\n", nd->str); root_ns_mask |= nd->cflag; type = NS_ROOT; } else if (nd->cflag & ~CLONE_SUBNS) { pr_err("Can't dump nested %s namespace for %d\n", nd->str, pid); return 0; } } else type = NS_CRIU; nsid = xzalloc(sizeof(*nsid)); if (!nsid) return 0; nsid->type = type; nsid->kid = kid; nsid->ns_populated = true; nsid_add(nsid, nd, ns_next_id++, pid); if (nd == &net_ns_desc) { INIT_LIST_HEAD(&nsid->net.ids); INIT_LIST_HEAD(&nsid->net.links); } found: if (ns_ret) *ns_ret = nsid; return nsid->id; } static unsigned int __get_ns_id(int pid, struct ns_desc *nd, protobuf_c_boolean *supported, struct ns_id **ns) { int proc_dir; unsigned int kid; char ns_path[32]; struct stat st; proc_dir = open_pid_proc(pid); if (proc_dir < 0) return 0; snprintf(ns_path, sizeof(ns_path), "ns/%s", nd->str); if (fstatat(proc_dir, ns_path, &st, 0)) { if (errno == ENOENT) { /* The namespace is unsupported */ kid = 0; goto out; } pr_perror("Unable to stat %s", ns_path); return 0; } kid = st.st_ino; BUG_ON(!kid); out: if (supported) *supported = kid != 0; return generate_ns_id(pid, kid, nd, ns); } static unsigned int get_ns_id(int pid, struct ns_desc *nd, protobuf_c_boolean *supported) { return __get_ns_id(pid, nd, supported, NULL); } int dump_one_ns_file(int lfd, u32 id, const struct fd_parms *p) { struct 
cr_img *img; FileEntry fe = FILE_ENTRY__INIT; NsFileEntry nfe = NS_FILE_ENTRY__INIT; struct fd_link *link = p->link; struct ns_id *nsid; nsid = lookup_ns_by_kid(link->ns_kid, link->ns_d); if (!nsid) { pr_err("No NS ID with kid %u\n", link->ns_kid); return -1; } nfe.id = id; nfe.ns_id = nsid->id; nfe.ns_cflag = link->ns_d->cflag; nfe.flags = p->flags; fe.type = FD_TYPES__NS; fe.id = nfe.id; fe.nsf = &nfe; img = img_from_set(glob_imgset, CR_FD_FILES); return pb_write_one(img, &fe, PB_FILE); } const struct fdtype_ops nsfile_dump_ops = { .type = FD_TYPES__NS, .dump = dump_one_ns_file, }; struct ns_file_info { struct file_desc d; NsFileEntry *nfe; }; static int open_ns_fd(struct file_desc *d, int *new_fd) { struct ns_file_info *nfi = container_of(d, struct ns_file_info, d); struct pstree_item *item, *t; struct ns_desc *nd = NULL; struct ns_id *ns; int nsfd_id, fd; char path[64]; for (ns = ns_ids; ns != NULL; ns = ns->next) { if (ns->id != nfi->nfe->ns_id) continue; /* Check for CLONE_XXX as we use fdstore only if flag is set */ if (ns->nd == &net_ns_desc && (root_ns_mask & CLONE_NEWNET)) nsfd_id = ns->net.nsfd_id; else break; fd = fdstore_get(nsfd_id); if (fd < 0) { return -1; } goto out; } /* * Find out who can open us. * * FIXME I need a hash or RBtree here. 
*/ for_each_pstree_item(t) { TaskKobjIdsEntry *ids = t->ids; if (ids->pid_ns_id == nfi->nfe->ns_id) { item = t; nd = &pid_ns_desc; break; } else if (ids->net_ns_id == nfi->nfe->ns_id) { item = t; nd = &net_ns_desc; break; } else if (ids->user_ns_id == nfi->nfe->ns_id) { item = t; nd = &user_ns_desc; break; } else if (ids->ipc_ns_id == nfi->nfe->ns_id) { item = t; nd = &ipc_ns_desc; break; } else if (ids->uts_ns_id == nfi->nfe->ns_id) { item = t; nd = &uts_ns_desc; break; } else if (ids->mnt_ns_id == nfi->nfe->ns_id) { item = t; nd = &mnt_ns_desc; break; } else if (ids->cgroup_ns_id == nfi->nfe->ns_id) { item = t; nd = &cgroup_ns_desc; break; } else if (ids->time_ns_id == nfi->nfe->ns_id) { item = t; nd = &time_ns_desc; break; } } if (!nd || !item) { pr_err("Can't find suitable NS ID for %#x\n", nfi->nfe->ns_id); return -1; } if (nd->cflag != nfi->nfe->ns_cflag) { pr_err("Clone flag mismatch for %#x\n", nfi->nfe->ns_id); return -1; } snprintf(path, sizeof(path) - 1, "/proc/%d/ns/%s", vpid(item), nd->str); path[sizeof(path) - 1] = '\0'; fd = open(path, nfi->nfe->flags); if (fd < 0) { pr_perror("Can't open file %s on restore", path); return fd; } out: *new_fd = fd; return 0; } static struct file_desc_ops ns_desc_ops = { .type = FD_TYPES__NS, .open = open_ns_fd, }; static int collect_one_nsfile(void *o, ProtobufCMessage *base, struct cr_img *img) { struct ns_file_info *nfi = o; nfi->nfe = pb_msg(base, NsFileEntry); pr_info("Collected ns file ID %#x NS-ID %#x\n", nfi->nfe->id, nfi->nfe->ns_id); return file_desc_add(&nfi->d, nfi->nfe->id, &ns_desc_ops); } struct collect_image_info nsfile_cinfo = { .fd_type = CR_FD_NS_FILES, .pb_type = PB_NS_FILE, .priv_size = sizeof(struct ns_file_info), .collect = collect_one_nsfile, }; /* * Same as dump_task_ns_ids(), but * a) doesn't keep IDs (don't need them) * b) generates them for mount and netns only * mnt ones are needed for open_mount() in * inotify pred-dump * net ones are needed for parasite socket */ int 
predump_task_ns_ids(struct pstree_item *item) { int pid = item->pid->real; if (!__get_ns_id(pid, &net_ns_desc, NULL, &dmpi(item)->netns)) return -1; if (!get_ns_id(pid, &mnt_ns_desc, NULL)) return -1; return 0; } int dump_task_ns_ids(struct pstree_item *item) { int pid = item->pid->real; TaskKobjIdsEntry *ids = item->ids; ids->has_pid_ns_id = true; ids->pid_ns_id = get_ns_id(pid, &pid_ns_desc, NULL); if (!ids->pid_ns_id) { pr_err("Can't make pidns id\n"); return -1; } ids->has_net_ns_id = true; ids->net_ns_id = __get_ns_id(pid, &net_ns_desc, NULL, &dmpi(item)->netns); if (!ids->net_ns_id) { pr_err("Can't make netns id\n"); return -1; } ids->has_ipc_ns_id = true; ids->ipc_ns_id = get_ns_id(pid, &ipc_ns_desc, NULL); if (!ids->ipc_ns_id) { pr_err("Can't make ipcns id\n"); return -1; } ids->has_uts_ns_id = true; ids->uts_ns_id = get_ns_id(pid, &uts_ns_desc, NULL); if (!ids->uts_ns_id) { pr_err("Can't make utsns id\n"); return -1; } ids->time_ns_id = get_ns_id(pid, &time_ns_desc, &ids->has_time_ns_id); if (!ids->time_ns_id) { pr_err("Can't make timens id\n"); return -1; } if (ids->has_time_ns_id) { unsigned int id; protobuf_c_boolean supported = false; id = get_ns_id(pid, &time_for_children_ns_desc, &supported); if (!supported || !id) { pr_err("Can't make timens id\n"); return -1; } if (id != ids->time_ns_id) { pr_err("Can't dump nested time namespace for %d\n", pid); return -1; } } ids->has_mnt_ns_id = true; ids->mnt_ns_id = get_ns_id(pid, &mnt_ns_desc, NULL); if (!ids->mnt_ns_id) { pr_err("Can't make mntns id\n"); return -1; } ids->has_user_ns_id = true; ids->user_ns_id = get_ns_id(pid, &user_ns_desc, NULL); if (!ids->user_ns_id) { pr_err("Can't make userns id\n"); return -1; } ids->cgroup_ns_id = get_ns_id(pid, &cgroup_ns_desc, &ids->has_cgroup_ns_id); if (!ids->cgroup_ns_id) { pr_err("Can't make cgroup id\n"); return -1; } return 0; } static UsernsEntry userns_entry = USERNS_ENTRY__INIT; #define INVALID_ID (~0U) static unsigned int userns_id(unsigned int id, 
UidGidExtent **map, int n) { int i; if (!(root_ns_mask & CLONE_NEWUSER)) return id; for (i = 0; i < n; i++) { if (map[i]->lower_first <= id && map[i]->lower_first + map[i]->count > id) return map[i]->first + (id - map[i]->lower_first); } return INVALID_ID; } static unsigned int host_id(unsigned int id, UidGidExtent **map, int n) { int i; if (!(root_ns_mask & CLONE_NEWUSER)) return id; for (i = 0; i < n; i++) { if (map[i]->first <= id && map[i]->first + map[i]->count > id) return map[i]->lower_first + (id - map[i]->first); } return INVALID_ID; } static uid_t host_uid(uid_t uid) { UsernsEntry *e = &userns_entry; return host_id(uid, e->uid_map, e->n_uid_map); } static gid_t host_gid(gid_t gid) { UsernsEntry *e = &userns_entry; return host_id(gid, e->gid_map, e->n_gid_map); } uid_t userns_uid(uid_t uid) { UsernsEntry *e = &userns_entry; return userns_id(uid, e->uid_map, e->n_uid_map); } gid_t userns_gid(gid_t gid) { UsernsEntry *e = &userns_entry; return userns_id(gid, e->gid_map, e->n_gid_map); } static int parse_id_map(pid_t pid, char *name, UidGidExtent ***pb_exts) { UidGidExtent *extents = NULL; int len = 0, size = 0, ret, i; FILE *f; f = fopen_proc(pid, "%s", name); if (f == NULL) return -1; ret = -1; while (1) { UidGidExtent *ext; if (len == size) { UidGidExtent *t; size = size * 2 + 1; t = xrealloc(extents, size * sizeof(UidGidExtent)); if (t == NULL) break; extents = t; } ext = &extents[len]; uid_gid_extent__init(ext); ret = fscanf(f, "%d %d %d", &ext->first, &ext->lower_first, &ext->count); if (ret != 3) { if (ferror(f)) { pr_perror("Unable to parse extents: %d", ret); ret = -1; } else ret = 0; break; } pr_info("id_map: %d %d %d\n", ext->first, ext->lower_first, ext->count); len++; } fclose(f); if (ret) goto err; if (len) { *pb_exts = xmalloc(sizeof(UidGidExtent *) * len); if (*pb_exts == NULL) goto err; for (i = 0; i < len; i++) (*pb_exts)[i] = &extents[i]; } else { xfree(extents); *pb_exts = NULL; } return len; err: xfree(extents); return -1; } int 
collect_user_ns(struct ns_id *ns, void *oarg) { /* * User namespace is dumped before files to get uid and gid * mappings, which are used for converting local id-s to * userns id-s (userns_uid(), userns_gid()) */ if (dump_user_ns(root_item->pid->real, root_item->ids->user_ns_id)) return -1; return 0; } int collect_user_namespaces(bool for_dump) { if (!for_dump) return 0; if (!(root_ns_mask & CLONE_NEWUSER)) return 0; return walk_namespaces(&user_ns_desc, collect_user_ns, NULL); } static int check_user_ns(int pid) { int status; pid_t chld; chld = fork(); if (chld == -1) { pr_perror("Unable to fork a process"); return -1; } if (chld == 0) { struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3]; struct __user_cap_header_struct hdr; uid_t uid; gid_t gid; uid = host_uid(0); gid = host_gid(0); if (uid == INVALID_ID || gid == INVALID_ID) { pr_err("Unable to convert uid or gid\n"); exit(1); } if (prctl(PR_SET_KEEPCAPS, 1)) { pr_perror("Unable to set PR_SET_KEEPCAPS"); exit(1); } if (setresgid(gid, gid, gid)) { pr_perror("Unable to set group ID"); exit(1); } if (setgroups(0, NULL) < 0) { pr_perror("Unable to drop supplementary groups"); exit(1); } if (setresuid(uid, uid, uid)) { pr_perror("Unable to set user ID"); exit(1); } hdr.version = _LINUX_CAPABILITY_VERSION_3; hdr.pid = 0; if (capget(&hdr, data) < 0) { pr_perror("capget"); exit(1); } data[0].effective = data[0].permitted; data[1].effective = data[1].permitted; if (capset(&hdr, data) < 0) { pr_perror("capset"); exit(1); } /* * Check that we are able to enter into other namespaces * from the target userns namespace. This signs that these * namespaces were created from the target userns. 
*/ if (switch_ns(pid, &user_ns_desc, NULL)) exit(1); if ((root_ns_mask & CLONE_NEWNET) && switch_ns(pid, &net_ns_desc, NULL)) exit(1); if ((root_ns_mask & CLONE_NEWUTS) && switch_ns(pid, &uts_ns_desc, NULL)) exit(1); if ((root_ns_mask & CLONE_NEWTIME) && switch_ns(pid, &time_ns_desc, NULL)) exit(1); if ((root_ns_mask & CLONE_NEWIPC) && switch_ns(pid, &ipc_ns_desc, NULL)) exit(1); if ((root_ns_mask & CLONE_NEWNS) && switch_ns(pid, &mnt_ns_desc, NULL)) exit(1); exit(0); } if (waitpid(chld, &status, 0) != chld) { pr_perror("Unable to wait for PID %d", chld); return -1; } if (status) { pr_err("One or more namespaces doesn't belong to the target user namespace\n"); return -1; } return 0; } int dump_user_ns(pid_t pid, int ns_id) { UsernsEntry *e = &userns_entry; struct cr_img *img; int ret; ret = parse_id_map(pid, "uid_map", &e->uid_map); if (ret < 0) goto err; e->n_uid_map = ret; ret = parse_id_map(pid, "gid_map", &e->gid_map); if (ret < 0) goto err; e->n_gid_map = ret; if (check_user_ns(pid)) goto err; img = open_image(CR_FD_USERNS, O_DUMP, ns_id); if (!img) goto err; ret = pb_write_one(img, e, PB_USERNS); close_image(img); if (ret < 0) goto err; return 0; err: if (e->uid_map) { xfree(e->uid_map[0]); xfree(e->uid_map); } if (e->gid_map) { xfree(e->gid_map[0]); xfree(e->gid_map); } return -1; } void free_userns_maps(void) { if (userns_entry.n_uid_map > 0) { xfree(userns_entry.uid_map[0]); xfree(userns_entry.uid_map); } if (userns_entry.n_gid_map > 0) { xfree(userns_entry.gid_map[0]); xfree(userns_entry.gid_map); } } static int do_dump_namespaces(struct ns_id *ns) { int ret; ret = switch_ns(ns->ns_pid, ns->nd, NULL); if (ret) return ret; switch (ns->nd->cflag) { case CLONE_NEWUTS: pr_info("Dump UTS namespace %d via %d\n", ns->id, ns->ns_pid); ret = dump_uts_ns(ns->id); break; case CLONE_NEWTIME: pr_info("Dump TIME namespace %d via %d\n", ns->id, ns->ns_pid); ret = dump_time_ns(ns->id); break; case CLONE_NEWIPC: pr_info("Dump IPC namespace %d via %d\n", ns->id, 
ns->ns_pid); ret = dump_ipc_ns(ns->id); break; case CLONE_NEWNET: pr_info("Dump NET namespace info %d via %d\n", ns->id, ns->ns_pid); ret = dump_net_ns(ns); break; default: pr_err("Unknown namespace flag %x\n", ns->nd->cflag); break; } return ret; } int dump_namespaces(struct pstree_item *item, unsigned int ns_flags) { struct pid *ns_pid = item->pid; struct ns_id *ns; int pid, nr = 0; int ret = 0; /* * The setns syscall is cool, we can switch to the other * namespace and then return back to our initial one, but * for me it's much easier just to fork another task and * let it do the job, all the more so it can be done in * parallel with task dumping routine. * * However, the question how to dump sockets from the target * net namespace with this is still open */ pr_info("Dumping %d(%d)'s namespaces\n", ns_pid->ns[0].virt, ns_pid->real); if ((ns_flags & CLONE_NEWPID) && ns_pid->ns[0].virt != INIT_PID) { char *val = NULL; ns = lookup_ns_by_id(item->ids->pid_ns_id, &pid_ns_desc); if (ns) { char id[64]; snprintf(id, sizeof(id), "pid[%u]", ns->kid); val = external_lookup_by_key(id); if (IS_ERR_OR_NULL(val)) val = NULL; } if (!val) { pr_err("Can't dump a pid namespace without the process init\n"); return -1; } } for (ns = ns_ids; ns; ns = ns->next) { /* Skip current namespaces, which are in the list too */ if (ns->type == NS_CRIU) continue; switch (ns->nd->cflag) { /* No data for pid namespaces to dump */ case CLONE_NEWPID: /* Dumped explicitly with dump_mnt_namespaces() */ case CLONE_NEWNS: /* Userns is dumped before dumping tasks */ case CLONE_NEWUSER: /* handled separately in cgroup dumping code */ case CLONE_NEWCGROUP: continue; } pid = fork(); if (pid < 0) { pr_perror("Can't fork ns dumper"); return -1; } if (pid == 0) { ret = do_dump_namespaces(ns); exit(ret); } nr++; } while (nr > 0) { int status; ret = waitpid(-1, &status, 0); if (ret < 0) { pr_perror("Can't wait ns dumper"); return -1; } if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { pr_err("Namespaces 
dumping finished with error %d\n", status); return -1; } nr--; } pr_info("Namespaces dump complete\n"); return 0; } static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map) { char buf[PAGE_SIZE]; int off = 0, i; int fd; /* * We can perform only a single write (that may contain multiple * newline-delimited records) to a uid_map and a gid_map files. */ for (i = 0; i < n; i++) { int len; len = snprintf(buf + off, sizeof(buf) - off, "%u %u %u\n", extents[i]->first, extents[i]->lower_first, extents[i]->count); if (len < 0) { pr_perror("Unable to form the user/group mappings buffer"); return -1; } else if (len >= sizeof(buf) - off) { pr_err("The user/group mappings buffer truncated\n"); return -1; } off += len; } fd = open_proc_rw(pid, "%s", id_map); if (fd < 0) return -1; if (write(fd, buf, off) != off) { pr_perror("Unable to write into %s", id_map); close(fd); return -1; } close(fd); return 0; } static int usernsd_pid; inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid) { struct cmsghdr *ch; struct ucred *ucred; m->h.msg_iov = m->iov; m->h.msg_iovlen = 2; m->iov[0].iov_base = c; m->iov[0].iov_len = sizeof(*c); m->iov[1].iov_base = x; m->iov[1].iov_len = sizeof(*x); if (arg) { m->iov[2].iov_base = arg; m->iov[2].iov_len = asize; m->h.msg_iovlen++; } m->h.msg_name = NULL; m->h.msg_namelen = 0; m->h.msg_flags = 0; m->h.msg_control = &m->c; /* Need to memzero because of: * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514917 */ memzero(&m->c, sizeof(m->c)); m->h.msg_controllen = CMSG_SPACE(sizeof(struct ucred)); ch = CMSG_FIRSTHDR(&m->h); ch->cmsg_len = CMSG_LEN(sizeof(struct ucred)); ch->cmsg_level = SOL_SOCKET; ch->cmsg_type = SCM_CREDENTIALS; ucred = (struct ucred *)CMSG_DATA(ch); if (pid) ucred->pid = *pid; else ucred->pid = getpid(); ucred->uid = getuid(); ucred->gid = getgid(); if (fd >= 0) { m->h.msg_controllen += CMSG_SPACE(sizeof(int)); ch = CMSG_NXTHDR(&m->h, ch); 
BUG_ON(!ch); ch->cmsg_len = CMSG_LEN(sizeof(int)); ch->cmsg_level = SOL_SOCKET; ch->cmsg_type = SCM_RIGHTS; *((int *)CMSG_DATA(ch)) = fd; } } void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) { struct cmsghdr *ch; struct ucred *ucred; ch = CMSG_FIRSTHDR(&um->h); BUG_ON(!ch); BUG_ON(ch->cmsg_len != CMSG_LEN(sizeof(struct ucred))); BUG_ON(ch->cmsg_level != SOL_SOCKET); BUG_ON(ch->cmsg_type != SCM_CREDENTIALS); if (pid) { ucred = (struct ucred *)CMSG_DATA(ch); *pid = ucred->pid; } ch = CMSG_NXTHDR(&um->h, ch); if (ch && ch->cmsg_len == CMSG_LEN(sizeof(int))) { BUG_ON(ch->cmsg_level != SOL_SOCKET); BUG_ON(ch->cmsg_type != SCM_RIGHTS); *fd = *((int *)CMSG_DATA(ch)); } else { *fd = -1; } } static int usernsd(int sk) { pr_info("uns: Daemon started\n"); while (1) { struct unsc_msg um; static char msg[MAX_UNSFD_MSG_SIZE]; uns_call_t call; int flags, fd, ret; pid_t pid; unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0, NULL); if (recvmsg(sk, &um.h, 0) <= 0) { pr_perror("uns: recv req error"); return -1; } unsc_msg_pid_fd(&um, &pid, &fd); pr_debug("uns: daemon calls %p (%d, %d, %x)\n", call, pid, fd, flags); /* * Caller has sent us bare address of the routine it * wants to call. Since the caller is fork()-ed from the * same process as the daemon is, the latter has exactly * the same code at exactly the same address as the * former guy has. So go ahead and just call one! */ ret = call(msg, fd, pid); if (fd >= 0) close(fd); if (flags & UNS_ASYNC) { /* * Async call failed and the called doesn't know * about it. Exit now and let the stop_usernsd() * check the exit code and abort the restoration. * * We'd get there either by the end of restore or * from the next userns_call() due to failed * sendmsg() in there. */ if (ret < 0) { pr_err("uns: Async call failed. 
Exiting\n"); return -1; } continue; } if (flags & UNS_FDOUT) fd = ret; else fd = -1; unsc_msg_init(&um, &call, &ret, NULL, 0, fd, NULL); if (sendmsg(sk, &um.h, 0) <= 0) { pr_perror("uns: send resp error"); return -1; } if (fd >= 0) close(fd); } } int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, size_t arg_size, int fd) { int ret, res, sk; bool async = flags & UNS_ASYNC; struct unsc_msg um; if (unlikely(arg_size > MAX_UNSFD_MSG_SIZE)) { pr_err("uns: message size exceeded\n"); return -1; } if (!usernsd_pid) return call(arg, fd, getpid()); sk = get_service_fd(USERNSD_SK); if (sk < 0) { pr_err("Cannot get USERNSD_SK fd\n"); return -1; } pr_debug("uns: calling %s (%d, %x)\n", func_name, fd, flags); if (!async) /* * Why don't we lock for async requests? Because * they just put the request in the daemon's * queue and do not wait for the response. Thus * when daemon response there's only one client * waiting for it in recvmsg below, so he * responses to proper caller. */ mutex_lock(&task_entries->userns_sync_lock); else /* * If we want the callback to give us and FD then * we should NOT do the asynchronous call. */ BUG_ON(flags & UNS_FDOUT); /* Send the request */ unsc_msg_init(&um, &call, &flags, arg, arg_size, fd, NULL); ret = sendmsg(sk, &um.h, 0); if (ret <= 0) { pr_perror("uns: send req error"); ret = -1; goto out; } if (async) { ret = 0; goto out; } /* Get the response back */ unsc_msg_init(&um, &call, &res, NULL, 0, 0, NULL); ret = recvmsg(sk, &um.h, 0); if (ret <= 0) { pr_perror("uns: recv resp error"); ret = -1; goto out; } /* Decode the result and return */ if (flags & UNS_FDOUT) unsc_msg_pid_fd(&um, NULL, &ret); else ret = res; out: if (!async) mutex_unlock(&task_entries->userns_sync_lock); return ret; } int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)) { int sk[2]; int one = 1; /* * Seqpacket to * * a) Help daemon distinguish individual requests from * each other easily. 
Stream socket require manual * messages boundaries. * * b) Make callers note the daemon death by seeing the * disconnected socket. In case of dgram socket * callers would just get stuck in receiving the * response. */ if (socketpair(PF_UNIX, SOCK_SEQPACKET, 0, sk)) { pr_perror("Can't make usernsd socket"); return -1; } if (setsockopt(sk[0], SOL_SOCKET, SO_PASSCRED, &one, sizeof(one)) < 0) { pr_perror("failed to setsockopt"); return -1; } if (setsockopt(sk[1], SOL_SOCKET, SO_PASSCRED, &one, sizeof(1)) < 0) { pr_perror("failed to setsockopt"); return -1; } *pid = fork(); if (*pid < 0) { pr_perror("Can't unix daemon"); close(sk[0]); close(sk[1]); return -1; } if (*pid == 0) { int ret; close(sk[0]); ret = daemon_func(sk[1]); exit(ret); } close(sk[1]); return sk[0]; } static int start_usernsd(void) { int sk; if (!(root_ns_mask & CLONE_NEWUSER)) return 0; sk = start_unix_cred_daemon(&usernsd_pid, usernsd); if (sk < 0) { pr_err("failed to start usernsd\n"); return -1; } if (install_service_fd(USERNSD_SK, sk) < 0) { kill(usernsd_pid, SIGKILL); waitpid(usernsd_pid, NULL, 0); return -1; } return 0; } static int exit_usernsd(void *arg, int fd, pid_t pid) { int code = *(int *)arg; pr_info("uns: `- daemon exits w/ %d\n", code); exit(code); } int stop_usernsd(void) { int ret = 0; if (usernsd_pid) { int status = -1; sigset_t blockmask, oldmask; /* * Don't let the sigchld_handler() mess with us * calling waitpid() on the exited daemon. The * same is done in cr_system(). */ sigemptyset(&blockmask); sigaddset(&blockmask, SIGCHLD); sigprocmask(SIG_BLOCK, &blockmask, &oldmask); /* * Send a message to make sure the daemon _has_ * proceeded all its queue of asynchronous requests. * * All the restoring processes might have already * closed their USERNSD_SK descriptors, but daemon * still has its in connected state -- this is us * who hold the last reference on the peer. 
* * If daemon has exited "in advance" due to async * call or socket error, the userns_call() and the * waitpid() below would both fail and we'll see * bad exit status. */ userns_call(exit_usernsd, UNS_ASYNC, &ret, sizeof(ret), -1); waitpid(usernsd_pid, &status, 0); if (WIFEXITED(status)) ret = WEXITSTATUS(status); else ret = -1; usernsd_pid = 0; sigprocmask(SIG_SETMASK, &oldmask, NULL); if (ret != 0) pr_err("uns: daemon exited abnormally\n"); else pr_info("uns: daemon stopped\n"); } return ret; } int prepare_userns(struct pstree_item *item) { struct cr_img *img; UsernsEntry *e; int ret; img = open_image(CR_FD_USERNS, O_RSTR, item->ids->user_ns_id); if (!img) return -1; ret = pb_read_one(img, &e, PB_USERNS); close_image(img); if (ret < 0) return -1; if (write_id_map(item->pid->real, e->uid_map, e->n_uid_map, "uid_map")) return -1; if (write_id_map(item->pid->real, e->gid_map, e->n_gid_map, "gid_map")) return -1; return 0; } int collect_namespaces(bool for_dump) { int ret; ret = collect_user_namespaces(for_dump); if (ret < 0) return ret; ret = collect_mnt_namespaces(for_dump); if (ret < 0) return ret; ret = collect_net_namespaces(for_dump); if (ret < 0) return ret; ret = collect_pid_namespaces(for_dump); if (ret < 0) return ret; return 0; } int prepare_userns_creds(void) { if (!opts.unprivileged || has_cap_setuid(opts.cap_eff)) { /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */ if (setuid(0) || setgid(0) || setgroups(0, NULL)) { pr_perror("Unable to initialize id-s"); return -1; } } /* * This flag is dropped after entering userns, but is * required to access files in /proc, so put one here * temporarily. It will be set to proper value at the * very end. 
*/ if (prctl(PR_SET_DUMPABLE, 1, 0)) { int seterr = errno; pr_warn("Unable to set PR_SET_DUMPABLE (current %d): %s\n", prctl(PR_GET_DUMPABLE), strerror(seterr)); } return 0; } static int get_join_ns_fd(struct join_ns *jn) { int pid, fd; char nsf[32]; char *pnsf; pid = atoi(jn->ns_file); if (pid > 0) { snprintf(nsf, sizeof(nsf), "/proc/%d/ns/%s", pid, jn->nd->str); pnsf = nsf; } else { pnsf = jn->ns_file; } fd = open(pnsf, O_RDONLY); if (fd < 0) { pr_perror("Can't open ns file %s", pnsf); return -1; } jn->ns_fd = fd; return 0; } static int switch_join_ns(struct join_ns *jn) { struct stat st, self_st; char buf[32]; if (jn->nd == &user_ns_desc) { /* It is not permitted to use setns() to reenter the caller's current * user namespace. This prevents a caller that has dropped capabilities * from regaining those capabilities via a call to setns() */ if (fstat(jn->ns_fd, &st) == -1) { pr_perror("Can't get ns file %s stat", jn->ns_file); return -1; } snprintf(buf, sizeof(buf), "/proc/self/ns/%s", jn->nd->str); if (stat(buf, &self_st) == -1) { pr_perror("Can't get ns file %s stat", buf); return -1; } if (st.st_ino == self_st.st_ino) return 0; } if (setns(jn->ns_fd, jn->nd->cflag)) { pr_perror("Failed to setns when join-ns %s:%s", jn->nd->str, jn->ns_file); return -1; } return 0; } static int switch_user_join_ns(struct join_ns *jn) { uid_t uid; gid_t gid; if (jn == NULL) return 0; if (switch_join_ns(jn)) return -1; if (jn->extra_opts.user_extra.uid == NULL) uid = getuid(); else uid = atoi(jn->extra_opts.user_extra.uid); if (jn->extra_opts.user_extra.gid == NULL) gid = getgid(); else gid = atoi(jn->extra_opts.user_extra.gid); /* FIXME: * if err occurs in setuid/setgid, should we just alert or * return an error */ if (setuid(uid)) { pr_perror("setuid failed while joining userns"); return -1; } if (setgid(gid)) { pr_perror("setgid failed while joining userns"); return -1; } return 0; } int join_namespaces(void) { struct join_ns *jn, *user_jn = NULL; int ret = -1; 
list_for_each_entry(jn, &opts.join_ns, list) if (get_join_ns_fd(jn)) goto err_out; list_for_each_entry(jn, &opts.join_ns, list) if (jn->nd == &user_ns_desc) { user_jn = jn; } else { if (switch_join_ns(jn)) goto err_out; } if (switch_user_join_ns(user_jn)) goto err_out; ret = 0; err_out: list_for_each_entry(jn, &opts.join_ns, list) close_safe(&jn->ns_fd); return ret; } int prepare_namespace(struct pstree_item *item, unsigned long clone_flags) { pid_t pid = vpid(item); sigset_t sig_mask; int id, ret = -1; pr_info("Restoring namespaces %d flags 0x%lx\n", vpid(item), clone_flags); if (block_sigmask(&sig_mask, SIGCHLD) < 0) return -1; if ((clone_flags & CLONE_NEWUSER) && prepare_userns_creds()) return -1; /* * On netns restore we launch an IP tool, thus we * have to restore it _before_ altering the mount * tree (i.e. -- mnt_ns restoring) */ id = ns_per_id ? item->ids->uts_ns_id : pid; if ((clone_flags & CLONE_NEWUTS) && prepare_utsns(id)) goto out; id = ns_per_id ? item->ids->ipc_ns_id : pid; if ((clone_flags & CLONE_NEWIPC) && prepare_ipc_ns(id)) goto out; if (prepare_net_namespaces()) goto out; /* * This one is special -- there can be several mount * namespaces and prepare_mnt_ns handles them itself. 
*/ if (prepare_mnt_ns()) goto out; ret = 0; out: if (restore_sigmask(&sig_mask) < 0) ret = -1; return ret; } static int read_pid_ns_img(void) { struct ns_id *ns; PidnsEntry *e; for (ns = ns_ids; ns != NULL; ns = ns->next) { struct cr_img *img; int ret; if (ns->nd != &pid_ns_desc) continue; img = open_image(CR_FD_PIDNS, O_RSTR, ns->id); if (!img) return -1; ret = pb_read_one_eof(img, &e, PB_PIDNS); close_image(img); if (ret < 0) { pr_err("Can not read pidns object\n"); return -1; } if (ret > 0) ns->ext_key = e->ext_key; } return 0; } int prepare_namespace_before_tasks(void) { if (start_usernsd()) goto err_unds; if (netns_keep_nsfd()) goto err_netns; if (mntns_maybe_create_roots()) goto err_mnt; if (read_mnt_ns_img()) goto err_img; if (read_net_ns_img()) goto err_img; if (read_pid_ns_img()) goto err_img; return 0; err_img: cleanup_mnt_ns(); err_mnt: /* * Nothing, netns' descriptor will be closed * on criu exit */ err_netns: stop_usernsd(); err_unds: return -1; } struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid"); struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user"); static int collect_pid_ns(struct ns_id *ns, void *oarg) { PidnsEntry e = PIDNS_ENTRY__INIT; struct cr_img *img; int ret; char id[64], *val; pr_info("Collecting pidns %d/%d\n", ns->id, ns->ns_pid); snprintf(id, sizeof(id), "pid[%u]", ns->kid); val = external_lookup_by_key(id); if (PTR_RET(val)) return 0; /* * Only if the user marked the PID namespace as external * via --external pid[]: