pax_global_header00006660000000000000000000000064132204425400014506gustar00rootroot0000000000000052 comment=219d6acd56e62ac5e94e3b084083efdc0237d2b6 amcheck-1.3/000077500000000000000000000000001322044254000127445ustar00rootroot00000000000000amcheck-1.3/.editorconfig000066400000000000000000000003411322044254000154170ustar00rootroot00000000000000# EditorConfig, for indentation on Github # Per http://EditorConfig.org root = true # Unix-style newlines [*] end_of_line = lf charset = utf-8 # 4 space tabs for indentation [*.{c,h,y,l}] indent_style = tab indent_size = 4 amcheck-1.3/.gitignore000066400000000000000000000000361322044254000147330ustar00rootroot00000000000000*.o *.so *.swp .deps .vagrant amcheck-1.3/.travis.yml000066400000000000000000000020671322044254000150620ustar00rootroot00000000000000language: c compiler: gcc env: matrix: - PGBRANCH=master - PGBRANCH=REL_10_STABLE - PGBRANCH=REL9_6_STABLE - PGBRANCH=REL9_5_STABLE - PGBRANCH=REL9_4_STABLE before_install: - sudo service postgresql stop - CURDIR=$(pwd) - PGHOME=${CURDIR}/${PGBRANCH} - PGDATA=${PGHOME}/data - git clone https://github.com/postgres/postgres.git - cd postgres - git checkout ${PGBRANCH} - ./configure --prefix=${PGHOME} --enable-debug --enable-cassert - make -j 2 - make install - export PATH=${PATH}:${PGHOME}/bin - initdb -D ${PGDATA} --locale=C --encoding=UTF8 - pg_ctl -D ${PGDATA} -w start before_script: - cd ${CURDIR} - make USE_PGXS=1 PG_CONFIG=${PGHOME}/bin/pg_config - make USE_PGXS=1 PG_CONFIG=${PGHOME}/bin/pg_config install script: - make USE_PGXS=1 PG_CONFIG=${PGHOME}/bin/pg_config installcheck PGUSER=travis after_script: - if [ -f regression.diffs ]; then cat regression.diffs; fi - pg_ctl -D ${PGDATA} stop notifications: email: on_success: change on_failure: always amcheck-1.3/LICENSE.md000066400000000000000000000022051322044254000143470ustar00rootroot00000000000000Portions Copyright (c) 2016-2017, Peter Geoghegan Portions Copyright (c) 1996-2017, The PostgreSQL Global Development Group 
Portions Copyright (c) 1994, The Regents of the University of California Permission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies. IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
amcheck-1.3/Makefile000066400000000000000000000014551322044254000144110ustar00rootroot00000000000000short_ver = 2 long_ver = $(shell (git describe --tags --long '--match=v*' 2>/dev/null || echo $(short_ver)-0-unknown) | cut -c2-) MODULE_big = amcheck_next OBJS = bloomfilter.o verify_nbtree.o $(WIN32RES) EXTENSION = amcheck_next DATA = amcheck_next--1.sql amcheck_next--2.sql amcheck_next--1--2.sql PGFILEDESC = "amcheck_next - functions for verifying relation integrity" DOCS = README.md REGRESS = install_amcheck_next check_btree PG_CONFIG = pg_config PGXS = $(shell $(PG_CONFIG) --pgxs) include $(PGXS) DEBUILD_ROOT = /tmp/amcheck deb: mkdir -p $(DEBUILD_ROOT) && rm -rf $(DEBUILD_ROOT)/* rsync -Ca --exclude=build/* ./ $(DEBUILD_ROOT)/ cd $(DEBUILD_ROOT) && make -f debian/rules orig cd $(DEBUILD_ROOT) && debuild -us -uc -sa cp -a /tmp/amcheck_* /tmp/postgresql-[91]* build/ amcheck-1.3/README.md000066400000000000000000000470201322044254000142260ustar00rootroot00000000000000# amcheck/amcheck_next: functions for verifying PostgreSQL relation integrity Current version: 1.3 (`amcheck_next` extension/SQL version: 2) Author: Peter Geoghegan [``](mailto:pg@bowt.ie) License: [PostgreSQL license](https://opensource.org/licenses/postgresql) Supported versions: PostgreSQL 9.4+ Note that Microsoft Windows is supported, but only on point releases that have [the necessary workaround](https://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=c572599c65bfe0387563233faabecd2845073538) for [various restrictions on dynamic linking](https://postgr.es/m/508E4121.10804%40ringerc.id.au) that only exist on that platform. The minimum supported point releases are 9.4.16, 9.5.11, 9.6.7, and 10.2. ## Overview The `amcheck` module provides functions that allow you to verify the logical consistency of the structure of PostgreSQL indexes. If the structure appears to be valid, no error is raised. 
Currently, only B-Tree indexes are supported, although since in practice the majority of PostgreSQL indexes are B-Tree indexes, `amcheck` is likely to be effective as a general corruption smoke-test in production PostgreSQL installations. See [Using amcheck effectively](#using-amcheck-effectively) for information about the kinds of real-world problems `amcheck` is intended to detect. ### Project background `amcheck` is a [contrib extension module that originally appeared in PostgreSQL 10](https://www.postgresql.org/docs/current/static/amcheck.html). This externally maintained version of the extension, which is formally named `amcheck_next` to avoid conflicts with `contrib/amcheck`, provides the same functionality to earlier versions of PostgreSQL. `amcheck_next` also exists to provide additional verification checks that do not yet appear in stable PostgreSQL `contrib/amcheck` releases. It is safe (though generally not useful) to install `amcheck_next` alongside `contrib/amcheck`. ### Invariants `amcheck` provides functions that specifically verify various *invariants* in the structure of the representation of particular indexes. The correctness of the access method functions behind index scans and other important operations relies on these invariants always holding. For example, certain functions verify, among other things, that all B-Tree pages have items in "logical", sorted order (e.g., for B-Tree indexes on text, index tuples should be in collated lexical order). If that particular invariant somehow fails to hold, we can expect binary searches on the affected page to incorrectly guide index scans, resulting in wrong answers to SQL queries. Verification is performed using the same procedures as those used by index scans themselves, which may be user-defined operator class code. 
For example, B-Tree index verification relies on comparisons made with one or more B-Tree support function 1 routines, much like B-Tree index scans rely on the routines to guide the scan to a point in the underlying table; see [the PostgreSQL documentation on Index Access Methods and Operator Classes](https://www.postgresql.org/docs/current/static/xindex.html) for details of operator class support functions. ### Test status [![Build Status](https://travis-ci.org/petergeoghegan/amcheck.svg?branch=master)](https://travis-ci.org/petergeoghegan/amcheck) ## Installation ### Prebuilt packages It is recommended that `amcheck` be installed using prebuilt packages where available. #### Debian/Ubuntu The most recent `amcheck` release is available from [the PostgreSQL Community APT repository](https://apt.postgresql.org). Setup instructions can be found in the [APT section of the PostgreSQL Wiki](https://wiki.postgresql.org/wiki/Apt). Once the Community APT repository is set up, and PostgreSQL has itself been installed from a community package, installation of `amcheck` is generally a simple matter of installing the package that matches your PostgreSQL version: ```shell sudo aptitude install postgresql-10-amcheck ``` #### Redhat/CentOS/SLES The most recent `amcheck` release is available from [the PostgreSQL Community yum repository](https://yum.postgresql.org). Setup can be performed by following the [PostgreSQL yum Howto](https://yum.postgresql.org/howtoyum.php). Once the Community yum repository is set up, and PostgreSQL has itself been installed from a community package, installation of `amcheck` is generally a simple matter of installing the package that matches your PostgreSQL version: ```shell sudo yum install amcheck_next10 ``` ### Building from source #### Building using PGXS (generic) The module can be built using the standard PGXS infrastructure. For this to work, you will need to have the `pg_config` program available in your $PATH. 
If you are using a packaged PostgreSQL build and have `pg_config` available (and in your OS user's $PATH), the procedure is as follows: ```shell tar xvzf amcheck-1.3.tar.gz cd amcheck-1.3 make make install ``` Note that just because `pg_config` is located in one user's $PATH does not necessarily make it so for the root user. #### Building Debian/Ubuntu packages from source The Makefile also provides a target for building Debian packages. The target has a dependency on `debhelper`, `devscripts`, `postgresql-server-dev-all`, and the PostgreSQL source package itself (e.g. `postgresql-server-dev-9.4`). The packages can be created and installed from the amcheck directory as follows: ```shell sudo aptitude install debhelper devscripts postgresql-server-dev-all make deb sudo dpkg -i ./build/postgresql-9.4-amcheck_*.deb ``` ## Setting up PostgreSQL Once the module is built and/or installed, it may be created as a PostgreSQL extension: `mydb=# CREATE EXTENSION amcheck_next;` `amcheck` functions may be used only by superusers. ## Interface The `amcheck_next` extension has a simple interface. `amcheck_next` consists of just a few functions that can be used for verification of a named B-Tree index. Note that currently, no function inspects the structure of the underlying heap representation (table). `regclass` function arguments are used by `amcheck` to identify particular index relations. This allows `amcheck` to accept arguments using various SQL calling conventions: ```sql -- Use string literal regclass input: SELECT bt_index_check('pg_database_oid_index', true); -- Use oid regclass input (both perform equivalent verification): SELECT bt_index_check(2672, false); SELECT bt_index_check(oid, false) FROM pg_class WHERE relname = 'pg_database_oid_index'; ``` See the [PostgreSQL documentation on Object identifier types](https://www.postgresql.org/docs/current/static/datatype-oid.html) for more information. 
### `bt_index_check` ```sql bt_index_check(index regclass, heapallindexed boolean DEFAULT false) returns void ``` `bt_index_check` tests that its target, a B-Tree index, respects a variety of invariants. Example usage: ```sql SELECT bt_index_check(index => c.oid, heapallindexed => i.indisunique), c.relname, c.relpages FROM pg_index i JOIN pg_opclass op ON i.indclass[0] = op.oid JOIN pg_am am ON op.opcmethod = am.oid JOIN pg_class c ON i.indexrelid = c.oid JOIN pg_namespace n ON c.relnamespace = n.oid WHERE am.amname = 'btree' AND n.nspname = 'pg_catalog' -- Don't check temp tables, which may be from another session: AND c.relpersistence != 't' -- Function may throw an error when this is omitted: AND i.indisready AND i.indisvalid ORDER BY c.relpages DESC LIMIT 10; ``` ``` bt_index_check | relname | relpages ----------------+---------------------------------+---------- | pg_depend_reference_index | 43 | pg_depend_depender_index | 40 | pg_proc_proname_args_nsp_index | 31 | pg_description_o_c_o_index | 21 | pg_attribute_relid_attnam_index | 14 | pg_proc_oid_index | 10 | pg_attribute_relid_attnum_index | 9 | pg_amproc_fam_proc_index | 5 | pg_amop_opr_fam_index | 5 | pg_amop_fam_strat_index | 5 ``` This example shows a session that performs verification of catalog indexes. Verification of the presence of heap tuples as index tuples is requested for unique indexes only. Since no error is raised, all indexes tested appear to be logically consistent. Naturally, this query could easily be changed to call `bt_index_check` for every index in the database where verification is supported. An `AccessShareLock` is acquired on the target index and heap relation by `bt_index_check`. This lock mode is the same lock mode acquired on relations by simple `SELECT` statements. `bt_index_check` does not verify invariants that span child/parent relationships, but will verify the presence of all heap tuples as index tuples within the index when `heapallindexed` is `true`. 
When a routine, lightweight test for corruption is required in a live production environment, using `bt_index_check` often provides the best trade-off between thoroughness of verification and limiting the impact on application performance and availability. ### `bt_index_parent_check` ```sql bt_index_parent_check(index regclass, heapallindexed boolean DEFAULT false) returns void ``` `bt_index_parent_check` tests that its target, a B-Tree index, respects a variety of invariants. Optionally, when the `heapallindexed` argument is `true`, the function verifies the presence of all heap tuples that should be found within the index. The checks performed by `bt_index_parent_check` are a superset of the checks performed by `bt_index_check` when called with the same options. `bt_index_parent_check` can be thought of as a more thorough variant of `bt_index_check`: unlike `bt_index_check`, `bt_index_parent_check` also checks invariants that span parent/child relationships. `bt_index_parent_check` follows the general convention of raising an error if it finds a logical inconsistency or other problem. A `ShareLock` is required on the target index by `bt_index_parent_check` (a `ShareLock` is also acquired on the heap relation). These locks prevent concurrent data modification from `INSERT`, `UPDATE`, and `DELETE` commands. The locks also prevent the underlying relation from being concurrently processed by `VACUUM` (and other utility commands). Note that the function holds locks for as short a duration as possible, so there is no advantage to verifying each index individually in a series of transactions, unless long running queries happen to be of particular concern. `bt_index_parent_check`'s additional verification is more likely to detect various pathological cases. These cases may involve an incorrectly implemented B-Tree operator class used by the index that is checked, or, hypothetically, undiscovered bugs in the underlying B-Tree index access method code. 
Note that `bt_index_parent_check` cannot be called when Hot Standby is enabled (i.e., on read-only physical replicas), unlike `bt_index_check`. ## Optional `heapallindexed` verification When the `heapallindexed` argument to verification functions is `true`, an additional phase of verification is performed against the table associated with the target index relation. This consists of a "dummy" `CREATE INDEX` operation, which checks for the presence of all would-be new index tuples against a temporary, in-memory summarizing structure (this is built when needed during the first, standard phase). The summarizing structure "fingerprints" every tuple found within the target index. The high level principle behind `heapallindexed` verification is that a new index that is equivalent to the existing, target index must only have entries that can be found in the existing structure. The additional `heapallindexed` phase adds significant overhead: verification will typically take several times longer than it would with only the standard consistency checking of the target index's structure. However, verification will still take significantly less time than an actual `CREATE INDEX`. There is no change to the relation-level locks acquired when `heapallindexed` verification is performed. The summarizing structure is bound in size by `maintenance_work_mem`. In order to ensure that there is no more than a 2% probability of failure to detect the absence of any particular index tuple, approximately 2 bytes of memory are needed per index tuple. As less memory is made available per index tuple, the probability of missing an inconsistency increases. This is considered an acceptable trade-off, since it limits the overhead of verification very significantly, while only slightly reducing the probability of detecting a problem, especially for installations where verification is treated as a routine maintenance task. 
With many databases, even the default `maintenance_work_mem` setting of `64MB` is sufficient to have less than a 2% probability of overlooking any single absent or corrupt tuple. This will be the case when there are no indexes with more than about 30 million distinct index tuples, regardless of the overall size of any index, the total number of indexes, or anything else. False positive candidate tuple membership tests within the summarizing structure occur at random, and are very unlikely to be the same for repeat verification operations. Furthermore, within a single verification operation, each missing or malformed index tuple independently has the same chance of being detected. If there is any inconsistency at all, it isn't particularly likely to be limited to a single tuple. All of these factors favor accepting a limited per operation per tuple probability of missing corruption, in order to enable performing more thorough index to heap verification more frequently (practical concerns about the overhead of verification are likely to limit the frequency of verification). In aggregate, the probability of detecting a hardware fault or software defect actually *increases* significantly with this strategy in most real world cases. Moreover, frequent verification allows problems to be caught earlier on average, which helps to limit the overall impact of corruption, and often simplifies root cause analysis. ## Using amcheck effectively ### Causes of corruption `amcheck` can be effective at detecting various types of failure modes that data page checksums will always fail to catch. These include: * Structural inconsistencies caused by incorrect operator class implementations. This includes issues caused by the comparison rules of operating system collations changing. 
Comparisons of datums of a collatable type like `text` must be immutable (just as all comparisons used for B-Tree index scans must be immutable), which implies that operating system collation rules must never change. Though rare, updates to operating system collation rules can cause these issues. More commonly, an inconsistency in the collation order between a master server and a standby server is implicated, possibly because the *major* operating system version in use is inconsistent. Such inconsistencies will generally only arise on standby servers, and so can generally only be detected on standby servers. If a problem like this arises, it may not affect each individual index that is ordered using an affected collation, simply because *indexed* values might happen to have the same absolute ordering regardless of the behavioral inconsistency. * Structural inconsistencies between indexes and the heap relations that are indexed (when `heapallindexed` verification is performed). There is no cross-checking of indexes against their heap relation during normal operation. Symptoms of heap corruption can be very subtle. * Corruption caused by hypothetical undiscovered bugs in the underlying PostgreSQL access method code, sort code, or transaction management code. Automatic verification of the structural integrity of indexes plays a role in the general testing of new or proposed PostgreSQL features that could plausibly allow a logical inconsistency to be introduced. Verification of table structure and associated visibility and transaction status information plays a similar role. One obvious testing strategy is to call `amcheck` functions continuously when running the standard regression tests. * Filesystem or storage subsystem faults where checksums happen to simply not be enabled. Note that `amcheck` examines a page as represented in some shared memory buffer at the time of verification if there is only a shared buffer hit when accessing the block. 
Consequently, `amcheck` does not necessarily examine data read from the filesystem at the time of verification. Note that when checksums are enabled, `amcheck` may raise an error due to a checksum failure when a corrupt block is read into a buffer. * Corruption caused by faulty RAM, and the broader memory subsystem and operating system. PostgreSQL does not protect against correctable memory errors and it is assumed you will operate using RAM that uses industry standard Error Correcting Codes (ECC) or better protection. However, ECC memory is typically only immune to single-bit errors, and should not be assumed to provide *absolute* protection against failures that result in memory corruption. When `heapallindexed` verification is performed, there is generally a greatly increased chance of detecting single-bit errors, since strict binary equality is tested, and the indexed attributes within the heap are tested. ### Overhead The overhead of calling `bt_index_check` for every index on a live production system is roughly comparable to the overhead of vacuuming; like `VACUUM`, verification uses a "buffer access strategy", which limits its impact on which pages are cached within `shared_buffers`. A major design goal of `amcheck` is to support routine verification of all indexes on busy production systems. Note that `heapallindexed` verification *significantly* increases the runtime of verification. No `amcheck` routine will ever modify data, and so no pages will ever be "dirtied", which is not the case with `VACUUM`. On the other hand, `amcheck` may be required to verify a large number of indexes all at once, which is typically not a behavior that autovacuum exhibits. `amcheck` exhaustively accesses every page in each index verified. This behavior is useful in part because verification may detect a checksum failure, which may have previously gone undetected only because no process needed data from the corrupt page in question, including even an autovacuum worker process. 
Note also that `bt_index_check` and `bt_index_parent_check` access the contents of indexes in "logical" order, which, in the worst case, implies that all I/O operations are performed at random positions on the filesystem. In contrast, `VACUUM` always removes dead index tuples from B-Tree indexes while accessing the contents of B-Tree indexes in sequential order. ### Acting on information about corruption No error concerning corruption raised by `amcheck` should ever be a false positive. `amcheck` raises errors in the event of conditions that, by definition, should never happen. It seems unlikely that there could ever be a useful *general* remediation to problems it detects. In general, an explanation for the root cause of an invariant violation should be sought. [`contrib/pageinspect`](https://www.postgresql.org/docs/current/static/pageinspect.html) can play a useful role in diagnosing corruption that `amcheck` highlights. A `REINDEX` may or may not be effective in repairing corruption, depending on the exact details of how the corruption originated. In general, `amcheck` can only prove the presence of corruption; it cannot prove its absence. amcheck-1.3/Vagrantfile000066400000000000000000000004341322044254000151320ustar00rootroot00000000000000# -*- mode: ruby -*- # vi: set ft=ruby : # Vagrantfile API/syntax version. Don't touch unless you know what you're doing! VAGRANTFILE_API_VERSION = "2" Vagrant.configure("2") do |config| config.vm.box = "debian/jessie64" config.vm.provision :shell, :path => "bootstrap.sh" end amcheck-1.3/amcheck_next--1--2.sql000066400000000000000000000015471322044254000165540ustar00rootroot00000000000000/* amcheck_next--1--2.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "ALTER EXTENSION amcheck_next UPDATE TO '2'" to load this file. 
\quit -- -- bt_index_check() -- DROP FUNCTION bt_index_check(regclass); CREATE FUNCTION bt_index_check(index regclass, heapallindexed boolean DEFAULT false) RETURNS VOID AS 'MODULE_PATHNAME', 'bt_index_check_next' LANGUAGE C STRICT; -- -- bt_index_parent_check() -- DROP FUNCTION bt_index_parent_check(regclass); CREATE FUNCTION bt_index_parent_check(index regclass, heapallindexed boolean DEFAULT false) RETURNS VOID AS 'MODULE_PATHNAME', 'bt_index_parent_check_next' LANGUAGE C STRICT; -- Don't want these to be available to public REVOKE ALL ON FUNCTION bt_index_check(regclass, boolean) FROM PUBLIC; REVOKE ALL ON FUNCTION bt_index_parent_check(regclass, boolean) FROM PUBLIC; amcheck-1.3/amcheck_next--1.sql000066400000000000000000000012321322044254000163270ustar00rootroot00000000000000/* amcheck_next--1.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION amcheck_next" to load this file. \quit -- -- bt_index_check() -- CREATE FUNCTION bt_index_check(index regclass) RETURNS VOID AS 'MODULE_PATHNAME', 'bt_index_check_next' LANGUAGE C STRICT; -- -- bt_index_parent_check() -- CREATE FUNCTION bt_index_parent_check(index regclass) RETURNS VOID AS 'MODULE_PATHNAME', 'bt_index_parent_check_next' LANGUAGE C STRICT; -- Don't want these to be available to public REVOKE ALL ON FUNCTION bt_index_check(regclass) FROM PUBLIC; REVOKE ALL ON FUNCTION bt_index_parent_check(regclass) FROM PUBLIC; amcheck-1.3/amcheck_next--2.sql000066400000000000000000000014001322044254000163250ustar00rootroot00000000000000/* amcheck_next--2.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION amcheck_next" to load this file. 
\quit -- -- bt_index_check() -- CREATE FUNCTION bt_index_check(index regclass, heapallindexed boolean DEFAULT false) RETURNS VOID AS 'MODULE_PATHNAME', 'bt_index_check_next' LANGUAGE C STRICT; -- -- bt_index_parent_check() -- CREATE FUNCTION bt_index_parent_check(index regclass, heapallindexed boolean DEFAULT false) RETURNS VOID AS 'MODULE_PATHNAME', 'bt_index_parent_check_next' LANGUAGE C STRICT; -- Don't want these to be available to public REVOKE ALL ON FUNCTION bt_index_check(regclass, boolean) FROM PUBLIC; REVOKE ALL ON FUNCTION bt_index_parent_check(regclass, boolean) FROM PUBLIC; amcheck-1.3/amcheck_next.control000066400000000000000000000002421322044254000167750ustar00rootroot00000000000000# amcheck_next extension comment = 'functions for verifying relation integrity' default_version = '2' module_pathname = '$libdir/amcheck_next' relocatable = true amcheck-1.3/bloomfilter.c000066400000000000000000000232221322044254000154270ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * bloomfilter.c * Minimal Bloom filter * * A Bloom filter is a probabilistic data structure that is used to test an * element's membership of a set. False positives are possible, but false * negatives are not; a test of membership of the set returns either "possibly * in set" or "definitely not in set". This can be very space efficient when * individual elements are larger than a few bytes, because elements are hashed * in order to set bits in the Bloom filter bitset. * * Elements can be added to the set, but not removed. The more elements that * are added, the larger the probability of false positives. Caller must hint * an estimated total size of the set when its Bloom filter is initialized. * This is used to balance the use of memory against the final false positive * rate. 
* * Portions Copyright (c) 2016-2017, Peter Geoghegan * Portions Copyright (c) 1996-2017, The PostgreSQL Global Development Group * Portions Copyright (c) 1994, The Regents of the University of California * * IDENTIFICATION * amcheck_next/bloom_filter.c * *------------------------------------------------------------------------- */ #include "postgres.h" #include #include "access/hash.h" #include "bloomfilter.h" #define MAX_HASH_FUNCS 10 struct bloom_filter { /* K hash functions are used, which are randomly seeded */ int k_hash_funcs; uint32 seed; /* Bitset is sized directly in bits. It must be a power-of-two <= 2^32. */ int64 bitset_bits; unsigned char bitset[FLEXIBLE_ARRAY_MEMBER]; }; static int my_bloom_power(int64 target_bitset_bits); static int optimal_k(int64 bitset_bits, int64 total_elems); static void k_hashes(bloom_filter *filter, uint32 *hashes, unsigned char *elem, size_t len); static uint32 sdbmhash(unsigned char *elem, size_t len); /* * Create Bloom filter in caller's memory context. This should get a false * positive rate of between 1% and 2% when bitset is not constrained by memory. * * total_elems is an estimate of the final size of the set. It ought to be * approximately correct, but we can cope well with it being off by perhaps a * factor of five or more. See "Bloom Filters in Probabilistic Verification" * (Dillinger & Manolios, 2004) for details of why this is the case. * * bloom_work_mem is sized in KB, in line with the general work_mem convention. * * The Bloom filter behaves non-deterministically when caller passes a random * seed value. This ensures that the same false positives will not occur from * one run to the next, which is useful to some callers. * * Notes on appropriate use: * * To keep the implementation simple and predictable, the underlying bitset is * always sized as a power-of-two number of bits, and the largest possible * bitset is 512MB. 
The implementation is therefore well suited to data * synchronization problems between unordered sets, where predictable * performance is more important than worst case guarantees around false * positives. Another problem that the implementation is well suited for is * cache filtering where good performance already relies upon having a * relatively small and/or low cardinality set of things that are interesting * (with perhaps many more uninteresting things that never populate the * filter). */ bloom_filter * bloom_create(int64 total_elems, int bloom_work_mem, uint32 seed) { bloom_filter *filter; int bloom_power; int64 bitset_bytes; int64 bitset_bits; /* * Aim for two bytes per element; this is sufficient to get a false * positive rate below 1%, independent of the size of the bitset or total * number of elements. Also, if rounding down the size of the bitset to * the next lowest power of two turns out to be a significant drop, the * false positive rate still won't exceed 2% in almost all cases. */ bitset_bytes = Min(bloom_work_mem * 1024L, total_elems * 2); /* Minimum allowable size is 1MB */ bitset_bytes = Max(1024L * 1024L, bitset_bytes); /* Size in bits should be the highest power of two within budget */ bloom_power = my_bloom_power(bitset_bytes * BITS_PER_BYTE); /* bitset_bits is int64 because 2^32 is greater than UINT32_MAX */ bitset_bits = INT64CONST(1) << bloom_power; bitset_bytes = bitset_bits / BITS_PER_BYTE; /* Allocate bloom filter as all-zeroes */ filter = palloc0(offsetof(bloom_filter, bitset) + sizeof(unsigned char) * bitset_bytes); filter->k_hash_funcs = optimal_k(bitset_bits, total_elems); /* * Hash caller's seed value. We don't trust caller to provide values * uniformly distributed within the range of 0 - PG_UINT32_MAX. 
*/ filter->seed = DatumGetUInt32(hash_uint32(seed)); filter->bitset_bits = bitset_bits; return filter; } /* * Free Bloom filter */ void bloom_free(bloom_filter *filter) { pfree(filter); } /* * Add element to Bloom filter */ void bloom_add_element(bloom_filter *filter, unsigned char *elem, size_t len) { uint32 hashes[MAX_HASH_FUNCS]; int i; k_hashes(filter, hashes, elem, len); /* Map a bit-wise address to a byte-wise address + bit offset */ for (i = 0; i < filter->k_hash_funcs; i++) { filter->bitset[hashes[i] >> 3] |= 1 << (hashes[i] & 7); } } /* * Test if Bloom filter definitely lacks element. * * Returns true if the element is definitely not in the set of elements * observed by bloom_add_element(). Otherwise, returns false, indicating that * element is probably present in set. */ bool bloom_lacks_element(bloom_filter *filter, unsigned char *elem, size_t len) { uint32 hashes[MAX_HASH_FUNCS]; int i; k_hashes(filter, hashes, elem, len); /* Map a bit-wise address to a byte-wise address + bit offset */ for (i = 0; i < filter->k_hash_funcs; i++) { if (!(filter->bitset[hashes[i] >> 3] & (1 << (hashes[i] & 7)))) return true; } return false; } /* * What proportion of bits are currently set? * * Returns proportion, expressed as a multiplier of filter size. * * This is a useful, generic indicator of whether or not a Bloom filter has * summarized the set optimally within the available memory budget. If return * value exceeds 0.5 significantly, then that's either because there was a * dramatic underestimation of set size by the caller, or because available * work_mem is very low relative to the size of the set (less than 2 bits per * element). * * The value returned here should generally be close to 0.5, even when we have * more than enough memory to ensure a false positive rate within target 1% to * 2% band, since more hash functions are used as more memory is available per * element. 
*/ double bloom_prop_bits_set(bloom_filter *filter) { int bitset_bytes = filter->bitset_bits / BITS_PER_BYTE; int64 bits_set = 0; int i; for (i = 0; i < bitset_bytes; i++) { unsigned char byte = filter->bitset[i]; while (byte) { bits_set++; byte &= (byte - 1); } } return bits_set / (double) filter->bitset_bits; } /* * Which element in the sequence of powers-of-two is less than or equal to * target_bitset_bits? * * Value returned here must be generally safe as the basis for actual bitset * size. * * Bitset is never allowed to exceed 2 ^ 32 bits (512MB). This is sufficient * for the needs of all current callers, and allows us to use 32-bit hash * functions. It also makes it easy to stay under the MaxAllocSize restriction * (caller needs to leave room for non-bitset fields that appear before * flexible array member, so a 1GB bitset would use an allocation that just * exceeds MaxAllocSize). */ static int my_bloom_power(int64 target_bitset_bits) { int bloom_power = -1; while (target_bitset_bits > 0 && bloom_power < 32) { bloom_power++; target_bitset_bits >>= 1; } return bloom_power; } /* * Determine optimal number of hash functions based on size of filter in bits, * and projected total number of elements. The optimal number is the number * that minimizes the false positive rate. */ static int optimal_k(int64 bitset_bits, int64 total_elems) { int k = round(log(2.0) * bitset_bits / total_elems); return Max(1, Min(k, MAX_HASH_FUNCS)); } /* * Generate k hash values for element. * * Caller passes array, which is filled-in with k values determined by hashing * caller's element. * * Only 2 real independent hash functions are actually used to support an * interface of up to MAX_HASH_FUNCS hash functions; "enhanced double hashing" * is used to make this work. See Dillinger & Manolios for details of why * that's okay. "Building a Better Bloom Filter" by Kirsch & Mitzenmacher also * has detailed analysis of the algorithm. 
*/ static void k_hashes(bloom_filter *filter, uint32 *hashes, unsigned char *elem, size_t len) { uint32 hasha, hashb; int i; hasha = DatumGetUInt32(hash_any(elem, len)); hashb = (filter->k_hash_funcs > 1 ? sdbmhash(elem, len) : 0); /* * Mix seed value using XOR. Mixing with addition instead would defeat the * purpose of having a seed (false positives would never change for a given * set of input elements). */ hasha ^= filter->seed; /* Apply "MOD m" to avoid losing bits/out-of-bounds array access */ hasha = hasha % filter->bitset_bits; hashb = hashb % filter->bitset_bits; /* First hash */ hashes[0] = hasha; /* Subsequent hashes */ for (i = 1; i < filter->k_hash_funcs; i++) { hasha = (hasha + hashb) % filter->bitset_bits; hashb = (hashb + i) % filter->bitset_bits; /* Accumulate hash value for caller */ hashes[i] = hasha; } } /* * Hash function is taken from sdbm, a public-domain reimplementation of the * ndbm database library. */ static uint32 sdbmhash(unsigned char *elem, size_t len) { uint32 hash = 0; int i; for (i = 0; i < len; elem++, i++) { hash = (*elem) + (hash << 6) + (hash << 16) - hash; } return hash; } amcheck-1.3/bloomfilter.h000066400000000000000000000016771322044254000154460ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * bloomfilter.h * Minimal Bloom filter * * Portions Copyright (c) 2016-2017, Peter Geoghegan * Portions Copyright (c) 1996-2017, The PostgreSQL Global Development Group * Portions Copyright (c) 1994, The Regents of the University of California * * IDENTIFICATION * amcheck_next/bloom_filter.h * *------------------------------------------------------------------------- */ #ifndef _BLOOMFILTER_H_ #define _BLOOMFILTER_H_ typedef struct bloom_filter bloom_filter; extern bloom_filter *bloom_create(int64 total_elems, int bloom_work_mem, uint32 seed); extern void bloom_free(bloom_filter *filter); extern void bloom_add_element(bloom_filter *filter, unsigned char *elem, size_t len); 
extern bool bloom_lacks_element(bloom_filter *filter, unsigned char *elem, size_t len); extern double bloom_prop_bits_set(bloom_filter *filter); #endif amcheck-1.3/bootstrap.sh000066400000000000000000000004341322044254000153160ustar00rootroot00000000000000#!/usr/bin/env bash echo "deb http://apt.postgresql.org/pub/repos/apt/ jessie-pgdg main" > /etc/apt/sources.list.d/pgdg.list wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - apt-get update apt-get install -y postgresql-server-dev-all devscripts amcheck-1.3/build/000077500000000000000000000000001322044254000140435ustar00rootroot00000000000000amcheck-1.3/build/.gitignore000066400000000000000000000001071322044254000160310ustar00rootroot00000000000000# Ignore everything in this directory * # Except this file !.gitignore amcheck-1.3/debian/000077500000000000000000000000001322044254000141665ustar00rootroot00000000000000amcheck-1.3/debian/changelog000066400000000000000000000027071322044254000160460ustar00rootroot00000000000000amcheck (1.3-1) unstable; urgency=medium * Fix bug in seeding Bloom filter -- Peter Geoghegan Tue, 26 Dec 2017 12:46:17 +0000 amcheck (1.2-1) unstable; urgency=medium * Fix unportable use of typedef -- Peter Geoghegan Sat, 21 Oct 2017 19:05:57 -0700 amcheck (1.1-1) unstable; urgency=medium * Add heapallindexed verification * Documentation updates -- Peter Geoghegan Thu, 19 Oct 2017 12:11:59 -0700 amcheck (1.0-1) unstable; urgency=medium * Correct 0.3-1 date string in changelog * Normalize copyright assignment, and make debian copyright file consistent with LICENSE.md * Create extension with the name amcheck_next, to avoid ambiguity. * Remove Travis support. * Replace tests with upstream contrib/amcheck tests. 
-- Peter Geoghegan Sat, 07 Oct 2017 09:28:38 -0700 amcheck (0.3-1) unstable; urgency=low * Tweaks to some diagnostic messages * Detect disagreement between parent/child B-Tree level -- Peter Geoghegan Fri, 16 Jun 2017 17:31:41 -0700 amcheck (0.2-1) unstable; urgency=low * Support for PostgreSQL 9.6 * Reduction in verbosity of diagnostic messages * Bugfix: Omit right block in cross-page check diagnostic message * Travis support -- Peter Geoghegan Thu, 13 Oct 2016 14:21:49 -0700 amcheck (0.1-1) unstable; urgency=low * Initial release -- Peter Geoghegan Fri, 29 Apr 2016 11:28:09 -0700 amcheck-1.3/debian/compat000066400000000000000000000000021322044254000153640ustar00rootroot000000000000009 amcheck-1.3/debian/control000066400000000000000000000044131322044254000155730ustar00rootroot00000000000000Source: amcheck Priority: extra Maintainer: Peter Geoghegan Build-Depends: debhelper (>= 9), postgresql-server-dev-all (>= 171~) Standards-Version: 3.9.6 Section: database Homepage: https://github.com/petergeoghegan/amcheck Vcs-Git: https://github.com/petergeoghegan/amcheck.git Vcs-Browser: https://github.com/petergeoghegan/amcheck XS-Testsuite: autopkgtest Package: postgresql-9.4-amcheck Section: libs Architecture: any Depends: ${shlibs:Depends}, ${misc:Depends}, postgresql-9.4 Description: PostgreSQL extension that verifies indexes This extension verifies the logical consistency of PostgreSQL B-Tree indexes. The extension consists of SQL-callable functions. When no error is raised during a call to these verification functions, no logical inconsistency was detected. This is useful as a general smoke test to detect corruption. Package: postgresql-9.5-amcheck Section: libs Architecture: any Depends: ${shlibs:Depends}, ${misc:Depends}, postgresql-9.5 Description: PostgreSQL extension that verifies indexes This extension verifies the logical consistency of PostgreSQL B-Tree indexes. The extension consists of SQL-callable functions. 
When no error is raised during a call to these verification functions, no logical inconsistency was detected. This is useful as a general smoke test to detect corruption. Package: postgresql-9.6-amcheck Section: libs Architecture: any Depends: ${shlibs:Depends}, ${misc:Depends}, postgresql-9.6 Description: PostgreSQL extension that verifies indexes This extension verifies the logical consistency of PostgreSQL B-Tree indexes. The extension consists of SQL-callable functions. When no error is raised during a call to these verification functions, no logical inconsistency was detected. This is useful as a general smoke test to detect corruption. Package: postgresql-10-amcheck Section: libs Architecture: any Depends: ${shlibs:Depends}, ${misc:Depends}, postgresql-10 Description: PostgreSQL extension that verifies indexes This extension verifies the logical consistency of PostgreSQL B-Tree indexes. The extension consists of SQL-callable functions. When no error is raised during a call to these verification functions, no logical inconsistency was detected. This is useful as a general smoke test to detect corruption. amcheck-1.3/debian/control.in000066400000000000000000000015471322044254000162050ustar00rootroot00000000000000Source: amcheck Priority: extra Maintainer: Peter Geoghegan Build-Depends: debhelper (>= 9), postgresql-server-dev-all (>= 171~) Standards-Version: 4.1.1 Section: database Homepage: https://github.com/petergeoghegan/amcheck Vcs-Git: https://github.com/petergeoghegan/amcheck.git Vcs-Browser: https://github.com/petergeoghegan/amcheck XS-Testsuite: autopkgtest Package: postgresql-PGVERSION-amcheck Section: libs Architecture: any Depends: ${shlibs:Depends}, ${misc:Depends}, postgresql-PGVERSION Description: PostgreSQL extension that verifies indexes This extension verifies the logical consistency of PostgreSQL B-Tree indexes. The extension consists of SQL-callable functions. 
When no error is raised during a call to these verification functions, no logical inconsistency was detected. This is useful as a general smoke test to detect corruption. amcheck-1.3/debian/copyright000066400000000000000000000024571322044254000161310ustar00rootroot00000000000000Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: amcheck Source: https://github.com/petergeoghegan/amcheck Files: * Copyright: 2016-2017, Peter Geoghegan 1996-2017, The PostgreSQL Global Development Group 1994, The Regents of the University of California License: PostgreSQL Permission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies. . IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. . THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
amcheck-1.3/debian/pgversions000066400000000000000000000001471322044254000163120ustar00rootroot00000000000000# Only support versions with robust approach to B-Tree page deletion and # concurrent page splits 9.4+ amcheck-1.3/debian/rules000077500000000000000000000012771322044254000152550ustar00rootroot00000000000000#!/usr/bin/make -f PKGVERS = $(shell dpkg-parsechangelog | awk -F '[:-]' '/^Version:/ { print substr($$2, 2) }') EXCLUDE = --exclude-vcs --exclude=debian --exclude=build include /usr/share/postgresql-common/pgxs_debian_control.mk clean: debian/control .PHONY: debian/control override_dh_auto_build: # do nothing override_dh_auto_install: # do nothing override_dh_install: # build all supported versions +pg_buildext loop postgresql-%v-amcheck # remove docs that belong elsewhere rm -rf debian/*/usr/share/doc/postgresql-doc-* override_dh_installdocs: dh_installdocs --all README.md orig: debian/control clean cd .. && tar czf amcheck_$(PKGVERS).orig.tar.gz $(EXCLUDE) amcheck %: dh $@ amcheck-1.3/debian/source/000077500000000000000000000000001322044254000154665ustar00rootroot00000000000000amcheck-1.3/debian/source/format000066400000000000000000000000041322044254000166730ustar00rootroot000000000000001.0 amcheck-1.3/debian/tests/000077500000000000000000000000001322044254000153305ustar00rootroot00000000000000amcheck-1.3/debian/tests/control000066400000000000000000000002351322044254000167330ustar00rootroot00000000000000Depends: @, postgresql-server-dev-all, postgresql-contrib-9.4, postgresql-contrib-9.5, postgresql-contrib-9.6 Tests: installcheck Restrictions: allow-stderr amcheck-1.3/debian/tests/control.in000066400000000000000000000001631322044254000173400ustar00rootroot00000000000000Depends: @, postgresql-server-dev-all, postgresql-contrib-PGVERSION Tests: installcheck Restrictions: allow-stderr amcheck-1.3/debian/tests/installcheck000077500000000000000000000000541322044254000177210ustar00rootroot00000000000000#!/bin/sh set -e pg_buildext installcheck 
amcheck-1.3/debian/watch000066400000000000000000000001521322044254000152150ustar00rootroot00000000000000version=3 https://github.com/petergeoghegan/amcheck/releases /petergeoghegan/amcheck/archive/v(.*).tar.gz amcheck-1.3/expected/000077500000000000000000000000001322044254000145455ustar00rootroot00000000000000amcheck-1.3/expected/check_btree.out000066400000000000000000000054701322044254000175420ustar00rootroot00000000000000-- minimal test, basically just verifying that amcheck CREATE TABLE bttest_a(id int8); CREATE TABLE bttest_b(id int8); INSERT INTO bttest_a SELECT * FROM generate_series(1, 100000); INSERT INTO bttest_b SELECT * FROM generate_series(100000, 1, -1); CREATE INDEX bttest_a_idx ON bttest_a USING btree (id); CREATE INDEX bttest_b_idx ON bttest_b USING btree (id); CREATE ROLE bttest_role; -- verify permissions are checked (error due to function not callable) SET ROLE bttest_role; SELECT bt_index_check('bttest_a_idx'::regclass); ERROR: permission denied for function bt_index_check SELECT bt_index_parent_check('bttest_a_idx'::regclass); ERROR: permission denied for function bt_index_parent_check RESET ROLE; -- we, intentionally, don't check relation permissions - it's useful -- to run this cluster-wide with a restricted account, and as tested -- above explicit permission has to be granted for that. 
GRANT EXECUTE ON FUNCTION bt_index_check(regclass, boolean) TO bttest_role; GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass, boolean) TO bttest_role; SET ROLE bttest_role; SELECT bt_index_check('bttest_a_idx'); bt_index_check ---------------- (1 row) SELECT bt_index_parent_check('bttest_a_idx'); bt_index_parent_check ----------------------- (1 row) RESET ROLE; -- verify plain tables are rejected (error) SELECT bt_index_check('bttest_a'); ERROR: "bttest_a" is not an index SELECT bt_index_parent_check('bttest_a'); ERROR: "bttest_a" is not an index -- verify non-existing indexes are rejected (error) SELECT bt_index_check(17); ERROR: could not open relation with OID 17 SELECT bt_index_parent_check(17); ERROR: could not open relation with OID 17 -- normal check outside of xact SELECT bt_index_check('bttest_a_idx'); bt_index_check ---------------- (1 row) -- more expansive tests SELECT bt_index_check('bttest_a_idx', true); bt_index_check ---------------- (1 row) SELECT bt_index_parent_check('bttest_b_idx', true); bt_index_parent_check ----------------------- (1 row) BEGIN; SELECT bt_index_check('bttest_a_idx'); bt_index_check ---------------- (1 row) SELECT bt_index_parent_check('bttest_b_idx'); bt_index_parent_check ----------------------- (1 row) -- make sure we don't have any leftover locks SELECT * FROM pg_locks WHERE relation = ANY(ARRAY['bttest_a', 'bttest_a_idx', 'bttest_b', 'bttest_b_idx']::regclass[]) AND pid = pg_backend_pid(); locktype | database | relation | page | tuple | virtualxid | transactionid | classid | objid | objsubid | virtualtransaction | pid | mode | granted | fastpath ----------+----------+----------+------+-------+------------+---------------+---------+-------+----------+--------------------+-----+------+---------+---------- (0 rows) COMMIT; -- cleanup DROP TABLE bttest_a; DROP TABLE bttest_b; DROP OWNED BY bttest_role; -- permissions DROP ROLE bttest_role; 
amcheck-1.3/expected/install_amcheck_next.out000066400000000000000000000000371322044254000214550ustar00rootroot00000000000000CREATE EXTENSION amcheck_next; amcheck-1.3/sql/000077500000000000000000000000001322044254000135435ustar00rootroot00000000000000amcheck-1.3/sql/check_btree.sql000066400000000000000000000036001322044254000165210ustar00rootroot00000000000000-- minimal test, basically just verifying that amcheck CREATE TABLE bttest_a(id int8); CREATE TABLE bttest_b(id int8); INSERT INTO bttest_a SELECT * FROM generate_series(1, 100000); INSERT INTO bttest_b SELECT * FROM generate_series(100000, 1, -1); CREATE INDEX bttest_a_idx ON bttest_a USING btree (id); CREATE INDEX bttest_b_idx ON bttest_b USING btree (id); CREATE ROLE bttest_role; -- verify permissions are checked (error due to function not callable) SET ROLE bttest_role; SELECT bt_index_check('bttest_a_idx'::regclass); SELECT bt_index_parent_check('bttest_a_idx'::regclass); RESET ROLE; -- we, intentionally, don't check relation permissions - it's useful -- to run this cluster-wide with a restricted account, and as tested -- above explicit permission has to be granted for that. 
GRANT EXECUTE ON FUNCTION bt_index_check(regclass, boolean) TO bttest_role; GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass, boolean) TO bttest_role; SET ROLE bttest_role; SELECT bt_index_check('bttest_a_idx'); SELECT bt_index_parent_check('bttest_a_idx'); RESET ROLE; -- verify plain tables are rejected (error) SELECT bt_index_check('bttest_a'); SELECT bt_index_parent_check('bttest_a'); -- verify non-existing indexes are rejected (error) SELECT bt_index_check(17); SELECT bt_index_parent_check(17); -- normal check outside of xact SELECT bt_index_check('bttest_a_idx'); -- more expansive tests SELECT bt_index_check('bttest_a_idx', true); SELECT bt_index_parent_check('bttest_b_idx', true); BEGIN; SELECT bt_index_check('bttest_a_idx'); SELECT bt_index_parent_check('bttest_b_idx'); -- make sure we don't have any leftover locks SELECT * FROM pg_locks WHERE relation = ANY(ARRAY['bttest_a', 'bttest_a_idx', 'bttest_b', 'bttest_b_idx']::regclass[]) AND pid = pg_backend_pid(); COMMIT; -- cleanup DROP TABLE bttest_a; DROP TABLE bttest_b; DROP OWNED BY bttest_role; -- permissions DROP ROLE bttest_role; amcheck-1.3/sql/install_amcheck_next.sql000066400000000000000000000000371322044254000204430ustar00rootroot00000000000000CREATE EXTENSION amcheck_next; amcheck-1.3/verify_nbtree.c000066400000000000000000001570351322044254000157660ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * verify_nbtree.c * Verifies the integrity of nbtree indexes based on invariants. * * For B-Tree indexes, verification includes checking that each page in the * target index has items in logical order as reported by an insertion scankey * (the insertion scankey sort-wise NULL semantics are needed for * verification). * * When index-to-heap verification is requested, a Bloom filter is used to * fingerprint all tuples in the target index, as the index is traversed to * verify its structure. 
A heap scan later verifies the presence in the heap
 * of all index tuples fingerprinted within the Bloom filter.
 *
 *
 * Portions Copyright (c) 2016-2017, Peter Geoghegan
 * Portions Copyright (c) 1996-2017, The PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, The Regents of the University of California
 *
 * IDENTIFICATION
 *	  amcheck_next/verify_nbtree.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/htup_details.h"
#include "access/nbtree.h"
#include "access/transam.h"
#include "bloomfilter.h"
#include "catalog/index.h"
#include "catalog/pg_am.h"
#include "commands/tablecmds.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "utils/memutils.h"
#include "utils/snapmgr.h"


PG_MODULE_MAGIC;

/*
 * A B-Tree cannot possibly have this many levels, since there must be one
 * block per level, which is bound by the range of BlockNumber:
 */
#define InvalidBtreeLevel	((uint32) InvalidBlockNumber)

/*
 * State associated with verifying a B-Tree index
 *
 * target is the point of reference for a verification operation.
 *
 * Other B-Tree pages may be allocated, but those are always auxiliary (e.g.,
 * they are current target's child pages).  Conceptually, problems are only
 * ever found in the current target page (or for a particular heap tuple during
 * heapallindexed verification).  Each page found by verification's left/right,
 * top/bottom scan becomes the target exactly once.
 */
typedef struct BtreeCheckState
{
	/*
	 * Unchanging state, established at start of verification:
	 */

	/* B-Tree Index Relation and associated heap relation */
	Relation	rel;
	Relation	heaprel;
	/* ShareLock held on heap/index, rather than AccessShareLock? */
	bool		readonly;
	/* Also verifying heap has no unindexed tuples? */
	bool		heapallindexed;
	/* Per-page context (reset between target pages) */
	MemoryContext targetcontext;
	/* Buffer access strategy */
	BufferAccessStrategy checkstrategy;

	/*
	 * Mutable state, for verification of particular page:
	 */

	/* Current target page */
	Page		target;
	/* Target block number */
	BlockNumber targetblock;
	/* Target page's LSN */
	XLogRecPtr	targetlsn;

	/*
	 * Mutable state, for optional heapallindexed verification:
	 */

	/* Bloom filter fingerprints B-Tree index */
	bloom_filter *filter;
	/* Debug counter */
	int64		heaptuplespresent;
} BtreeCheckState;

/*
 * Starting point for verifying an entire B-Tree index level
 */
typedef struct BtreeLevel
{
	/* Level number (0 is leaf page level). */
	uint32		level;

	/* Left most block on level.  Scan of level begins here. */
	BlockNumber leftmost;

	/* Is this level reported as "true" root level by meta page? */
	bool		istruerootlevel;
} BtreeLevel;

PG_FUNCTION_INFO_V1(bt_index_check_next);
PG_FUNCTION_INFO_V1(bt_index_parent_check_next);

/* Forward declarations of internal verification routines */
static void bt_index_check_internal(Oid indrelid, bool parentcheck,
						bool heapallindexed);
static inline void btree_index_checkable(Relation rel);
static void bt_check_every_level(Relation rel, Relation heaprel,
					 bool readonly, bool heapallindexed);
static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
							 BtreeLevel level);
static void bt_target_page_check(BtreeCheckState *state);
static ScanKey bt_right_page_check_scankey(BtreeCheckState *state);
static void bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
				  ScanKey targetkey);
static void bt_tuple_present_callback(Relation index, HeapTuple htup,
						  Datum *values, bool *isnull,
						  bool tupleIsAlive, void *checkstate);
static inline bool offset_is_negative_infinity(BTPageOpaque opaque,
							OffsetNumber offset);
static inline bool invariant_leq_offset(BtreeCheckState *state,
					 ScanKey key,
					 OffsetNumber upperbound);
static inline bool invariant_geq_offset(BtreeCheckState *state,
					 ScanKey key,
					 OffsetNumber lowerbound);
static inline bool
invariant_leq_nontarget_offset(BtreeCheckState *state, Page other, ScanKey key, OffsetNumber upperbound); static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum); /* * bt_index_check(index regclass, heapallindexed boolean) * * Note that the symbol name is appended with "_next", to avoid symbol clashes * with contrib/amcheck. * * Verify integrity of B-Tree index. * * Acquires AccessShareLock on heap & index relations. Does not consider * invariants that exist between parent/child pages. Optionally verifies * that heap does not contain any unindexed or incorrectly indexed tuples. */ Datum bt_index_check_next(PG_FUNCTION_ARGS) { Oid indrelid = PG_GETARG_OID(0); bool heapallindexed = false; if (PG_NARGS() == 2) heapallindexed = PG_GETARG_BOOL(1); bt_index_check_internal(indrelid, false, heapallindexed); PG_RETURN_VOID(); } /* * bt_index_parent_check(index regclass, heapallindexed boolean) * * Note that the symbol name is appended with "_next", to avoid symbol clashes * with contrib/amcheck. * * Verify integrity of B-Tree index. * * Acquires ShareLock on heap & index relations. Verifies that downlinks in * parent pages are valid lower bounds on child pages. Optionally verifies * that heap does not contain any unindexed or incorrectly indexed tuples. */ Datum bt_index_parent_check_next(PG_FUNCTION_ARGS) { Oid indrelid = PG_GETARG_OID(0); bool heapallindexed = false; if (PG_NARGS() == 2) heapallindexed = PG_GETARG_BOOL(1); bt_index_check_internal(indrelid, true, heapallindexed); PG_RETURN_VOID(); } /* * Helper for bt_index_[parent_]check, coordinating the bulk of the work. */ static void bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed) { Oid heapid; Relation indrel; Relation heaprel; LOCKMODE lockmode; if (parentcheck) lockmode = ShareLock; else lockmode = AccessShareLock; /* * We must lock table before index to avoid deadlocks. However, if the * passed indrelid isn't an index then IndexGetRelation() will fail. 
* Rather than emitting a not-very-helpful error message, postpone * complaining, expecting that the is-it-an-index test below will fail. * * In hot standby mode this will raise an error when parentcheck is true. */ heapid = IndexGetRelation(indrelid, true); if (OidIsValid(heapid)) heaprel = heap_open(heapid, lockmode); else heaprel = NULL; /* * Open the target index relations separately (like relation_openrv(), but * with heap relation locked first to prevent deadlocking). In hot * standby mode this will raise an error when parentcheck is true. */ indrel = index_open(indrelid, lockmode); /* * Since we did the IndexGetRelation call above without any lock, it's * barely possible that a race against an index drop/recreation could have * netted us the wrong table. */ if (heaprel == NULL || heapid != IndexGetRelation(indrelid, false)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_TABLE), errmsg("could not open parent table of index %s", RelationGetRelationName(indrel)))); /* Relation suitable for checking as B-Tree? */ btree_index_checkable(indrel); /* Check index, possibly against table it is an index on */ bt_check_every_level(indrel, heaprel, parentcheck, heapallindexed); /* * Release locks early. That's ok here because nothing in the called * routines will trigger shared cache invalidations to be sent, so we can * relax the usual pattern of only releasing locks after commit. */ index_close(indrel, lockmode); if (heaprel) heap_close(heaprel, lockmode); } /* * Basic checks about the suitability of a relation for checking as a B-Tree * index. * * NB: Intentionally not checking permissions, the function is normally not * callable by non-superusers. If granted, it's useful to be able to check a * whole cluster. 
 */
static inline void
btree_index_checkable(Relation rel)
{
	/* Must be an index relation whose access method is nbtree */
	if (rel->rd_rel->relkind != RELKIND_INDEX ||
		rel->rd_rel->relam != BTREE_AM_OID)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("only B-Tree indexes are supported as targets for verification"),
				 errdetail("Relation \"%s\" is not a B-Tree index.",
						   RelationGetRelationName(rel))));

	/* Another session's temporary relation cannot be accessed here */
	if (RELATION_IS_OTHER_TEMP(rel))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot access temporary tables of other sessions"),
				 errdetail("Index \"%s\" is associated with temporary relation.",
						   RelationGetRelationName(rel))));

	/* Index must be marked valid in the catalogs */
	if (!IndexIsValid(rel->rd_index))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot check index \"%s\"",
						RelationGetRelationName(rel)),
				 errdetail("Index is not valid")));
}

/*
 * Main entry point for B-Tree SQL-callable functions.  Walks the B-Tree in
 * logical order, verifying invariants as it goes.  Optionally, verification
 * checks if the heap relation contains any tuples that are not represented in
 * the index but should be.
 *
 * It is the caller's responsibility to acquire appropriate heavyweight lock on
 * the index relation, and advise us if extra checks are safe when a ShareLock
 * is held.  (A lock of the same type must also have been acquired on the heap
 * relation.)
 *
 * A ShareLock is generally assumed to prevent any kind of physical
 * modification to the index structure, including modifications that VACUUM may
 * make.  This does not include setting of the LP_DEAD bit by concurrent index
 * scans, although that is just metadata that is not able to directly affect
 * any check performed here.  Any concurrent process that might act on the
 * LP_DEAD bit being set (recycle space) requires a heavyweight lock that
 * cannot be held while we hold a ShareLock.  (Besides, even if that could
 * happen, the ad-hoc recycling when a page might otherwise split is performed
 * per-page, and requires an exclusive buffer lock, which wouldn't cause us
 * trouble.
_bt_delitems_vacuum() may only delete leaf items, and so the extra
 * parent/child check cannot be affected.)
 */
static void
bt_check_every_level(Relation rel, Relation heaprel, bool readonly,
					 bool heapallindexed)
{
	BtreeCheckState *state;
	Page		metapage;
	BTMetaPageData *metad;
	uint32		previouslevel;
	BtreeLevel	current;

	/*
	 * RecentGlobalXmin assertion matches index_getnext_tid().  See note on
	 * RecentGlobalXmin/B-Tree page deletion.
	 *
	 * We also rely on TransactionXmin having been initialized by now.
	 */
	Assert(TransactionIdIsValid(RecentGlobalXmin));
	Assert(TransactionIdIsNormal(TransactionXmin));

	/*
	 * Initialize state for entire verification operation
	 */
	state = palloc(sizeof(BtreeCheckState));
	state->rel = rel;
	state->heaprel = heaprel;
	state->readonly = readonly;
	state->heapallindexed = heapallindexed;

	if (state->heapallindexed)
	{
		int64		total_elems;
		uint32		seed;

		/* Size Bloom filter based on estimated number of tuples in index */
		total_elems = (int64) state->rel->rd_rel->reltuples;
		/* Random seed relies on backend srandom() call to avoid repetition */
		seed = random();
		/* Create Bloom filter to fingerprint index */
		state->filter = bloom_create(total_elems, maintenance_work_mem, seed);
		state->heaptuplespresent = 0;
	}

	/* Create context for page */
	state->targetcontext = AllocSetContextCreate(CurrentMemoryContext,
												 "amcheck context",
#if PG_VERSION_NUM >= 110000
												 ALLOCSET_DEFAULT_SIZES);
#else
												 ALLOCSET_DEFAULT_MINSIZE,
												 ALLOCSET_DEFAULT_INITSIZE,
												 ALLOCSET_DEFAULT_MAXSIZE);
#endif
	state->checkstrategy = GetAccessStrategy(BAS_BULKREAD);

	/* Get true root block from meta-page */
	metapage = palloc_btree_page(state, BTREE_METAPAGE);
	metad = BTPageGetMeta(metapage);

	/*
	 * Certain deletion patterns can result in "skinny" B-Tree indexes, where
	 * the fast root and true root differ.
	 *
	 * Start from the true root, not the fast root, unlike conventional index
	 * scans.  This approach is more thorough, and removes the risk of
	 * following a stale fast root from the meta page.
	 */
	if (metad->btm_fastroot != metad->btm_root)
		ereport(DEBUG1,
				(errcode(ERRCODE_NO_DATA),
				 errmsg("harmless fast root mismatch in index %s",
						RelationGetRelationName(rel)),
				 errdetail_internal("Fast root block %u (level %u) differs from true root block %u (level %u).",
									metad->btm_fastroot, metad->btm_fastlevel,
									metad->btm_root, metad->btm_level)));

	/*
	 * Starting at the root, verify every level.  Move left to right, top to
	 * bottom.  Note that there may be no pages other than the meta page (meta
	 * page can indicate that root is P_NONE when the index is totally empty).
	 */
	previouslevel = InvalidBtreeLevel;
	current.level = metad->btm_level;
	current.leftmost = metad->btm_root;
	current.istruerootlevel = true;
	while (current.leftmost != P_NONE)
	{
		/*
		 * Verify this level, and get left most page for next level down, if
		 * not at leaf level
		 */
		current = bt_check_level_from_leftmost(state, current);

		if (current.leftmost == InvalidBlockNumber)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" has no valid pages on level below %u or first level",
							RelationGetRelationName(rel), previouslevel)));

		previouslevel = current.level;
	}

	/*
	 * * Heap contains unindexed/malformed tuples check *
	 */
	if (state->heapallindexed)
	{
		IndexInfo  *indexinfo;

		if (state->readonly)
			elog(DEBUG1, "verifying presence of all required tuples in index \"%s\"",
				 RelationGetRelationName(rel));
		else
			elog(DEBUG1, "verifying presence of required tuples in index \"%s\" with xmin before %u",
				 RelationGetRelationName(rel), TransactionXmin);

		indexinfo = BuildIndexInfo(state->rel);

		/*
		 * Force use of MVCC snapshot (reuse CONCURRENTLY infrastructure) when
		 * only AccessShareLocks held.  It seems like a good idea to not
		 * diverge from expected heap lock strength.
		 */
		indexinfo->ii_Concurrent = !state->readonly;

		/*
		 * Don't wait for uncommitted tuple xact commit/abort when index is a
		 * unique index (or an index used by an exclusion constraint).  It is
		 * supposed to be impossible to get duplicates with the already-defined
		 * unique index in place.  Our relation-level locks prevent races
		 * resulting in false positive corruption errors where an IndexTuple
		 * insertion was just missed, but we still test its heap tuple.  (While
		 * this would not be true for !readonly verification, it doesn't matter
		 * because CREATE INDEX CONCURRENTLY index build heap scanning has no
		 * special treatment for unique indexes to avoid.)
		 *
		 * Not waiting can only affect verification of indexes on system
		 * catalogs, where heavyweights locks can be dropped before transaction
		 * commit.  If anything, avoiding waiting slightly improves test
		 * coverage.
		 */
		indexinfo->ii_Unique = false;
		indexinfo->ii_ExclusionOps = NULL;
		indexinfo->ii_ExclusionProcs = NULL;
		indexinfo->ii_ExclusionStrats = NULL;

		IndexBuildHeapScan(state->heaprel, state->rel, indexinfo, true,
						   bt_tuple_present_callback, (void *) state);

		ereport(DEBUG1,
				(errmsg_internal("finished verifying presence of " INT64_FORMAT " tuples (proportion of bits set: %f) from table \"%s\"",
								 state->heaptuplespresent,
								 bloom_prop_bits_set(state->filter),
								 RelationGetRelationName(heaprel))));

		bloom_free(state->filter);
	}

	/* Be tidy: */
	MemoryContextDelete(state->targetcontext);
}

/*
 * Given a left-most block at some level, move right, verifying each page
 * individually (with more verification across pages for "readonly"
 * callers).  Caller should pass the true root page as the leftmost initially,
 * working their way down by passing what is returned for the last call here
 * until level 0 (leaf page level) was reached.
 *
 * Returns state for next call, if any.  This includes left-most block number
 * one level lower that should be passed on next level/call, which is set to
 * P_NONE on last call here (when leaf level is verified).  Level numbers
 * follow the nbtree convention: higher levels have higher numbers, because new
 * levels are added only due to a root page split.
 * Note that prior to the
 * first root page split, the root is also a leaf page, so there is always a
 * level 0 (leaf level), and it's always the last level processed.
 *
 * Note on memory management: State's per-page context is reset here, between
 * each call to bt_target_page_check().
 */
static BtreeLevel
bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
{
	/* State to establish early, concerning entire level */
	BTPageOpaque opaque;
	MemoryContext oldcontext;
	BtreeLevel	nextleveldown;

	/* Variables for iterating across level using right links */
	BlockNumber leftcurrent = P_NONE;
	BlockNumber current = level.leftmost;

	/* Initialize return state */
	nextleveldown.leftmost = InvalidBlockNumber;
	nextleveldown.level = InvalidBtreeLevel;
	nextleveldown.istruerootlevel = false;

	/* Use page-level context for duration of this call */
	oldcontext = MemoryContextSwitchTo(state->targetcontext);

	elog(DEBUG2, "verifying level %u%s", level.level,
		 level.istruerootlevel ?
		 " (true root level)" :
		 level.level == 0 ? " (leaf level)" : "");

	do
	{
		/* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */
		CHECK_FOR_INTERRUPTS();

		/* Initialize state for this iteration */
		state->targetblock = current;
		state->target = palloc_btree_page(state, state->targetblock);
		state->targetlsn = PageGetLSN(state->target);

		opaque = (BTPageOpaque) PageGetSpecialPointer(state->target);

		if (P_IGNORE(opaque))
		{
			/* Half-dead/deleted pages are skipped, but a rightmost page must
			 * never be ignorable: there would be nowhere left to step to. */
			if (P_RIGHTMOST(opaque))
				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg("block %u fell off the end of index \"%s\"",
								current,
								RelationGetRelationName(state->rel))));
			else
				ereport(DEBUG1,
						(errcode(ERRCODE_NO_DATA),
						 errmsg("block %u of index \"%s\" ignored",
								current,
								RelationGetRelationName(state->rel))));
			goto nextpage;
		}
		else if (nextleveldown.leftmost == InvalidBlockNumber)
		{
			/*
			 * A concurrent page split could make the caller supplied leftmost
			 * block no longer contain the leftmost page, or no longer be the
			 * true root, but where that isn't possible due to heavyweight
			 * locking, check that the first valid page meets caller's
			 * expectations.
			 */
			if (state->readonly)
			{
				if (!P_LEFTMOST(opaque))
					ereport(ERROR,
							(errcode(ERRCODE_INDEX_CORRUPTED),
							 errmsg("block %u is not leftmost in index \"%s\"",
									current,
									RelationGetRelationName(state->rel))));

				if (level.istruerootlevel && !P_ISROOT(opaque))
					ereport(ERROR,
							(errcode(ERRCODE_INDEX_CORRUPTED),
							 errmsg("block %u is not true root in index \"%s\"",
									current,
									RelationGetRelationName(state->rel))));
			}

			/*
			 * Before beginning any non-trivial examination of level, prepare
			 * state for next bt_check_level_from_leftmost() invocation for
			 * the next level down (if any).
			 *
			 * There should be at least one non-ignorable page per level,
			 * unless this is the leaf level, which is assumed by caller to be
			 * final level.
			 */
			if (!P_ISLEAF(opaque))
			{
				IndexTuple	itup;
				ItemId		itemid;

				/* Internal page -- downlink gets leftmost on next level */
				itemid = PageGetItemId(state->target, P_FIRSTDATAKEY(opaque));
				itup = (IndexTuple) PageGetItem(state->target, itemid);
				nextleveldown.leftmost = ItemPointerGetBlockNumber(&(itup->t_tid));
				nextleveldown.level = opaque->btpo.level - 1;
			}
			else
			{
				/*
				 * Leaf page -- final level caller must process.
				 *
				 * Note that this could also be the root page, if there has
				 * been no root page split yet.
				 */
				nextleveldown.leftmost = P_NONE;
				nextleveldown.level = InvalidBtreeLevel;
			}

			/*
			 * Finished setting up state for this call/level.  Control will
			 * never end up back here in any future loop iteration for this
			 * level.
			 */
		}

		if (state->readonly && opaque->btpo_prev != leftcurrent)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("left link/right link pair in index \"%s\" not in agreement",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Block=%u left block=%u left link from block=%u.",
										current, leftcurrent,
										opaque->btpo_prev)));

		/* Check level, which must be valid for non-ignorable page */
		if (level.level != opaque->btpo.level)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
										current, level.level,
										opaque->btpo.level)));

		/* Verify invariants for page */
		bt_target_page_check(state);

nextpage:
		/* Ignorable (half-dead/deleted) pages jump straight here, skipping
		 * all per-page invariant checks */

		/* Try to detect circular links */
		if (current == leftcurrent || current == opaque->btpo_prev)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("circular link chain found in block %u of index \"%s\"",
							current, RelationGetRelationName(state->rel))));

		leftcurrent = current;
		current = opaque->btpo_next;

		/* Free page and associated memory for this iteration */
		MemoryContextReset(state->targetcontext);
	}
	/* Rightmost page on level has btpo_next == P_NONE, ending the walk */
	while (current != P_NONE);

	/*
Don't change context for caller */
	MemoryContextSwitchTo(oldcontext);

	return nextleveldown;
}

/*
 * Function performs the following checks on target page, or pages ancillary to
 * target page:
 *
 * - That every "real" data item is less than or equal to the high key, which
 *	 is an upper bound on the items on the pages (where there is a high key at
 *	 all -- pages that are rightmost lack one).
 *
 * - That within the page, every "real" item is less than or equal to the item
 *	 immediately to its right, if any (i.e., that the items are in order within
 *	 the page, so that the binary searches performed by index scans are sane).
 *
 * - That the last item stored on the page is less than or equal to the first
 *	 "real" data item on the page to the right (if such a first item is
 *	 available).
 *
 * Furthermore, when state passed shows ShareLock held, and target page is
 * internal page, function also checks:
 *
 * - That all child pages respect downlinks lower bound.
 *
 * This is also where heapallindexed callers use their Bloom filter to
 * fingerprint IndexTuples.
 *
 * Note: Memory allocated in this routine is expected to be released by caller
 * resetting state->targetcontext.
 */
static void
bt_target_page_check(BtreeCheckState *state)
{
	OffsetNumber offset;
	OffsetNumber max;
	BTPageOpaque topaque;

	topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
	max = PageGetMaxOffsetNumber(state->target);

	elog(DEBUG2, "verifying %u items on %s block %u", max,
		 P_ISLEAF(topaque) ? "leaf" : "internal", state->targetblock);

	/*
	 * Loop over page items, starting from first non-highkey item, not high
	 * key (if any).  Also, immediately skip "negative infinity" real item (if
	 * any).
	 */
	for (offset = P_FIRSTDATAKEY(topaque);
		 offset <= max;
		 offset = OffsetNumberNext(offset))
	{
		ItemId		itemid;
		IndexTuple	itup;
		ScanKey		skey;

		CHECK_FOR_INTERRUPTS();

		/*
		 * Don't try to generate scankey using "negative infinity" garbage
		 * data
		 */
		if (offset_is_negative_infinity(topaque, offset))
			continue;

		/* Build insertion scankey for current page offset */
		itemid = PageGetItemId(state->target, offset);
		itup = (IndexTuple) PageGetItem(state->target, itemid);
		skey = _bt_mkscankey(state->rel, itup);

		/* Fingerprint leaf page tuples (those that point to the heap) */
		if (state->heapallindexed && P_ISLEAF(topaque) && !ItemIdIsDead(itemid))
			bloom_add_element(state->filter, (unsigned char *) itup,
							  IndexTupleSize(itup));

		/*
		 * * High key check *
		 *
		 * If there is a high key (if this is not the rightmost page on its
		 * entire level), check that high key actually is upper bound on all
		 * page items.
		 *
		 * We prefer to check all items against high key rather than checking
		 * just the last and trusting that the operator class obeys the
		 * transitive law (which implies that all previous items also
		 * respected the high key invariant if they pass the item order
		 * check).
		 *
		 * Ideally, we'd compare every item in the index against every other
		 * item in the index, and not trust opclass obedience of the
		 * transitive law to bridge the gap between children and their
		 * grandparents (as well as great-grandparents, and so on).  We don't
		 * go to those lengths because that would be prohibitively expensive,
		 * and probably not markedly more effective in practice.
		 */
		if (!P_RIGHTMOST(topaque) &&
			!invariant_leq_offset(state, skey, P_HIKEY))
		{
			char	   *itid,
					   *htid;

			itid = psprintf("(%u,%u)", state->targetblock, offset);
			htid = psprintf("(%u,%u)",
							ItemPointerGetBlockNumber(&(itup->t_tid)),
							ItemPointerGetOffsetNumber(&(itup->t_tid)));

			/* targetlsn is reported split into the standard %X/%X LSN form */
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("high key invariant violated for index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.",
										itid,
										P_ISLEAF(topaque) ? "heap" : "index",
										htid,
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn)));
		}

		/*
		 * * Item order check *
		 *
		 * Check that items are stored on page in logical order, by checking
		 * current item is less than or equal to next item (if any).
		 */
		if (OffsetNumberNext(offset) <= max &&
			!invariant_leq_offset(state, skey,
								  OffsetNumberNext(offset)))
		{
			char	   *itid,
					   *htid,
					   *nitid,
					   *nhtid;

			itid = psprintf("(%u,%u)", state->targetblock, offset);
			htid = psprintf("(%u,%u)",
							ItemPointerGetBlockNumber(&(itup->t_tid)),
							ItemPointerGetOffsetNumber(&(itup->t_tid)));
			nitid = psprintf("(%u,%u)", state->targetblock,
							 OffsetNumberNext(offset));

			/* Reuse itup to get pointed-to heap location of second item */
			itemid = PageGetItemId(state->target, OffsetNumberNext(offset));
			itup = (IndexTuple) PageGetItem(state->target, itemid);
			nhtid = psprintf("(%u,%u)",
							 ItemPointerGetBlockNumber(&(itup->t_tid)),
							 ItemPointerGetOffsetNumber(&(itup->t_tid)));

			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("item order invariant violated for index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Lower index tid=%s (points to %s tid=%s) "
										"higher index tid=%s (points to %s tid=%s) "
										"page lsn=%X/%X.",
										itid,
										P_ISLEAF(topaque) ? "heap" : "index",
										htid,
										nitid,
										P_ISLEAF(topaque) ? "heap" : "index",
										nhtid,
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn)));
		}

		/*
		 * * Last item check *
		 *
		 * Check last item against next/right page's first data item's when
		 * last item on page is reached.  This additional check will detect
		 * transposed pages iff the supposed right sibling page happens to
		 * belong before target in the key space.  (Otherwise, a subsequent
		 * heap verification will probably detect the problem.)
		 *
		 * This check is similar to the item order check that will have
		 * already been performed for every other "real" item on target page
		 * when last item is checked.  The difference is that the next item
		 * (the item that is compared to target's last item) needs to come
		 * from the next/sibling page.  There may not be such an item
		 * available from sibling for various reasons, though (e.g., target
		 * is the rightmost page on level).
		 */
		else if (offset == max)
		{
			ScanKey		rightkey;

			/* Get item in next/right page */
			rightkey = bt_right_page_check_scankey(state);

			if (rightkey &&
				!invariant_geq_offset(state, rightkey, max))
			{
				/*
				 * As explained at length in bt_right_page_check_scankey(),
				 * there is a known !readonly race that could account for
				 * apparent violation of invariant, which we must check for
				 * before actually proceeding with raising error.  Our canary
				 * condition is that target page was deleted.
				 */
				if (!state->readonly)
				{
					/* Get fresh copy of target page */
					state->target = palloc_btree_page(state,
													  state->targetblock);
					/* Note that we deliberately do not update target LSN */
					topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);

					/*
					 * All !readonly checks now performed; just return
					 */
					if (P_IGNORE(topaque))
						return;
				}

				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg("cross page item order invariant violated for index \"%s\"",
								RelationGetRelationName(state->rel)),
						 errdetail_internal("Last item on page tid=(%u,%u) page lsn=%X/%X.",
											state->targetblock, offset,
											(uint32) (state->targetlsn >> 32),
											(uint32) state->targetlsn)));
			}
		}

		/*
		 * * Downlink check *
		 *
		 * Additional check of child items iff this is an internal page and
		 * caller holds a ShareLock.  This happens for every downlink (item)
		 * in target excluding the negative-infinity downlink (again, this is
		 * because it has no useful value to compare).
		 */
		if (!P_ISLEAF(topaque) && state->readonly)
		{
			BlockNumber childblock = ItemPointerGetBlockNumber(&(itup->t_tid));

			bt_downlink_check(state, childblock, skey);
		}
	}
}

/*
 * Return a scankey for an item on page to right of current target (or the
 * first non-ignorable page), sufficient to check ordering invariant on last
 * item in current target page.  Returned scankey relies on local memory
 * allocated for the child page, which caller cannot pfree().  Caller's memory
 * context should be reset between calls here.
 *
 * This is the first data item, and so all adjacent items are checked against
 * their immediate sibling item (which may be on a sibling page, or even a
 * "cousin" page at parent boundaries where target's rightlink points to page
 * with different parent page).  If no such valid item is available, return
 * NULL instead.
 *
 * Note that !readonly callers must reverify that target page has not
 * been concurrently deleted.
 */
static ScanKey
bt_right_page_check_scankey(BtreeCheckState *state)
{
	BTPageOpaque opaque;
	ItemId		rightitem;
	BlockNumber targetnext;
	Page		rightpage;
	OffsetNumber nline;

	/* Determine target's next block number */
	opaque = (BTPageOpaque) PageGetSpecialPointer(state->target);

	/* If target is already rightmost, no right sibling; nothing to do here */
	if (P_RIGHTMOST(opaque))
		return NULL;

	/*
	 * General notes on concurrent page splits and page deletion:
	 *
	 * Routines like _bt_search() don't require *any* page split interlock
	 * when descending the tree, including something very light like a buffer
	 * pin.  That's why it's okay that we don't either.  This avoidance of any
	 * need to "couple" buffer locks is the raison d' etre of the Lehman & Yao
	 * algorithm, in fact.
	 *
	 * That leaves deletion.  A deleted page won't actually be recycled by
	 * VACUUM early enough for us to fail to at least follow its right link
	 * (or left link, or downlink) and find its sibling, because recycling
	 * does not occur until no possible index scan could land on the page.
	 * Index scans can follow links with nothing more than their snapshot as
	 * an interlock and be sure of at least that much.  (See page
	 * recycling/RecentGlobalXmin notes in nbtree README.)
	 *
	 * Furthermore, it's okay if we follow a rightlink and find a half-dead or
	 * dead (ignorable) page one or more times.  There will either be a
	 * further right link to follow that leads to a live page before too long
	 * (before passing by parent's rightmost child), or we will find the end
	 * of the entire level instead (possible when parent page is itself the
	 * rightmost on its level).
	 */
	targetnext = opaque->btpo_next;
	for (;;)
	{
		CHECK_FOR_INTERRUPTS();

		rightpage = palloc_btree_page(state, targetnext);
		opaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);

		/* First live page (or the level's rightmost page) ends the walk */
		if (!P_IGNORE(opaque) || P_RIGHTMOST(opaque))
			break;

		/* We landed on a deleted page, so step right to find a live page */
		targetnext = opaque->btpo_next;
		ereport(DEBUG1,
				(errcode(ERRCODE_NO_DATA),
				 errmsg("level %u leftmost page of index \"%s\" was found deleted or half dead",
						opaque->btpo.level,
						RelationGetRelationName(state->rel)),
				 errdetail_internal("Deleted page found when building scankey from right sibling.")));

		/* Be slightly more pro-active in freeing this memory, just in case */
		pfree(rightpage);
	}

	/*
	 * No ShareLock held case -- why it's safe to proceed.
	 *
	 * Problem:
	 *
	 * We must avoid false positive reports of corruption when caller treats
	 * item returned here as an upper bound on target's last item.  In
	 * general, false positives are disallowed.  Avoiding them here when
	 * caller is !readonly is subtle.
	 *
	 * A concurrent page deletion by VACUUM of the target page can result in
	 * the insertion of items on to this right sibling page that would
	 * previously have been inserted on our target page.  There might have
	 * been insertions that followed the target's downlink after it was made
	 * to point to right sibling instead of target by page deletion's first
	 * phase.  The inserters insert items that would belong on target page.
	 * This race is very tight, but it's possible.  This is our only problem.
	 *
	 * Non-problems:
	 *
	 * We are not hindered by a concurrent page split of the target; we'll
	 * never land on the second half of the page anyway.  A concurrent split
	 * of the right page will also not matter, because the first data item
	 * remains the same within the left half, which we'll reliably land on.
	 * If we had to skip over ignorable/deleted pages, it cannot matter
	 * because their key space has already been atomically merged with the
	 * first non-ignorable page we eventually find (doesn't matter whether
	 * the page we eventually find is a true sibling or a cousin of target,
	 * which we go into below).
	 *
	 * Solution:
	 *
	 * Caller knows that it should reverify that target is not ignorable
	 * (half-dead or deleted) when cross-page sibling item comparison appears
	 * to indicate corruption (invariant fails).  This detects the single race
	 * condition that exists for caller.  This is correct because the
	 * continued existence of target block as non-ignorable (not half-dead or
	 * deleted) implies that target page was not merged into from the right by
	 * deletion; the key space at or after target never moved left.  Target's
	 * parent either has the same downlink to target as before, or a <=
	 * downlink due to deletion at the left of target.  Target either has the
	 * same highkey as before, or a highkey <= before when there is a page
	 * split.  (The rightmost concurrently-split-from-target-page page will
	 * still have the same highkey as target was originally found to have,
	 * which for our purposes is equivalent to target's highkey itself never
	 * changing, since we reliably skip over
	 * concurrently-split-from-target-page pages.)
	 *
	 * In simpler terms, we allow that the key space of the target may expand
	 * left (the key space can move left on the left side of target only), but
	 * the target key space cannot expand right and get ahead of us without
	 * our detecting it.  The key space of the target cannot shrink, unless it
	 * shrinks to zero due to the deletion of the original page, our canary
	 * condition.  (To be very precise, we're a bit stricter than that because
	 * it might just have been that the target page split and only the
	 * original target page was deleted.  We can be more strict, just not more
	 * lax.)
	 *
	 * Top level tree walk caller moves on to next page (makes it the new
	 * target) following recovery from this race.  (cf.  The rationale for
	 * child/downlink verification needing a ShareLock within
	 * bt_downlink_check(), where page deletion is also the main source of
	 * trouble.)
	 *
	 * Note that it doesn't matter if right sibling page here is actually a
	 * cousin page, because in order for the key space to be readjusted in a
	 * way that causes us issues in next level up (guiding problematic
	 * concurrent insertions to the cousin from the grandparent rather than to
	 * the sibling from the parent), there'd have to be page deletion of
	 * target's parent page (affecting target's parent's downlink in target's
	 * grandparent page).  Internal page deletion only occurs when there are
	 * no child pages (they were all fully deleted), and caller is checking
	 * that the target's parent has at least one non-deleted (so
	 * non-ignorable) child: the target page.  (Note that the first phase of
	 * deletion atomically marks the page to be deleted half-dead/ignorable at
	 * the same time downlink in its parent is removed, so caller will
	 * definitely not fail to detect that this happened.)
	 *
	 * This trick is inspired by the method backward scans use for dealing
	 * with concurrent page splits; concurrent page deletion is a problem that
	 * similarly receives special consideration sometimes (it's possible that
	 * the backwards scan will re-read its "original" block after failing to
	 * find a right-link to it, having already moved in the opposite direction
	 * (right/"forwards") a few times to try to locate one).  Just like us,
	 * that happens only to determine if there was a concurrent page deletion
	 * of a reference page, and just like us if there was a page deletion of
	 * that reference page it means we can move on from caring about the
	 * reference page.  See the nbtree README for a full description of how
	 * that works.
 */
	nline = PageGetMaxOffsetNumber(rightpage);

	/*
	 * Get first data item, if any
	 */
	if (P_ISLEAF(opaque) && nline >= P_FIRSTDATAKEY(opaque))
	{
		/* Return first data item (if any) */
		rightitem = PageGetItemId(rightpage, P_FIRSTDATAKEY(opaque));
	}
	else if (!P_ISLEAF(opaque) &&
			 nline >= OffsetNumberNext(P_FIRSTDATAKEY(opaque)))
	{
		/*
		 * Return first item after the internal page's "negative infinity"
		 * item
		 */
		rightitem = PageGetItemId(rightpage,
								  OffsetNumberNext(P_FIRSTDATAKEY(opaque)));
	}
	else
	{
		/*
		 * No first item.  Page is probably empty leaf page, but it's also
		 * possible that it's an internal page with only a negative infinity
		 * item.
		 */
		ereport(DEBUG1,
				(errcode(ERRCODE_NO_DATA),
				 errmsg("%s block %u of index \"%s\" has no first data item",
						P_ISLEAF(opaque) ? "leaf" : "internal", targetnext,
						RelationGetRelationName(state->rel))));
		return NULL;
	}

	/*
	 * Return first real item scankey.  Note that this relies on right page
	 * memory remaining allocated.
	 */
	return _bt_mkscankey(state->rel,
						 (IndexTuple) PageGetItem(rightpage, rightitem));
}

/*
 * Checks one of target's downlink against its child page.
 *
 * Conceptually, the target page continues to be what is checked here.  The
 * target block is still blamed in the event of finding an invariant violation.
 * The downlink insertion into the target is probably where any problem raised
 * here arises, and there is no such thing as a parent link, so doing the
 * verification this way around is much more practical.
 */
static void
bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
				  ScanKey targetkey)
{
	OffsetNumber offset;
	OffsetNumber maxoffset;
	Page		child;
	BTPageOpaque copaque;

	/*
	 * Caller must have ShareLock on target relation, because of
	 * considerations around page deletion by VACUUM.
	 *
	 * NB: In general, page deletion deletes the right sibling's downlink, not
	 * the downlink of the page being deleted; the deleted page's downlink is
	 * reused for its sibling.  The key space is thereby consolidated between
	 * the deleted page and its right sibling.  (We cannot delete a parent
	 * page's rightmost child unless it is the last child page, and we intend
	 * to also delete the parent itself.)
	 *
	 * If this verification happened without a ShareLock, the following race
	 * condition could cause false positives:
	 *
	 * In general, concurrent page deletion might occur, including deletion of
	 * the left sibling of the child page that is examined here.  If such a
	 * page deletion were to occur, closely followed by an insertion into the
	 * newly expanded key space of the child, a window for the false positive
	 * opens up: the stale parent/target downlink originally followed to get
	 * to the child legitimately ceases to be a lower bound on all items in
	 * the page, since the key space was concurrently expanded "left".
	 * (Insertion followed the "new" downlink for the child, not our now-stale
	 * downlink, which was concurrently physically removed in target/parent as
	 * part of deletion's first phase.)
	 *
	 * Note that while the cross-page-same-level last item check uses a trick
	 * that allows it to perform verification for !readonly callers, a similar
	 * trick seems difficult here.  The trick that that other check uses is,
	 * in essence, to lock down race conditions to those that occur due to
	 * concurrent page deletion of the target; that's a race that can be
	 * reliably detected before actually reporting corruption.
	 *
	 * On the other hand, we'd need to lock down race conditions involving
	 * deletion of child's left page, for long enough to read the child page
	 * into memory (in other words, a scheme with concurrently held buffer
	 * locks on both child and left-of-child pages).  That's unacceptable for
	 * amcheck functions on general principle, though.
	 */
	Assert(state->readonly);

	/*
	 * Verify child page has the downlink key from target page (its parent) as
	 * a lower bound.
	 *
	 * Check all items, rather than checking just the first and trusting that
	 * the operator class obeys the transitive law.
	 */
	child = palloc_btree_page(state, childblock);
	copaque = (BTPageOpaque) PageGetSpecialPointer(child);
	maxoffset = PageGetMaxOffsetNumber(child);

	for (offset = P_FIRSTDATAKEY(copaque);
		 offset <= maxoffset;
		 offset = OffsetNumberNext(offset))
	{
		/*
		 * Skip comparison of target page key against "negative infinity"
		 * item, if any.  Checking it would indicate that it's not an upper
		 * bound, but that's only because of the hard-coding within
		 * _bt_compare().
		 */
		if (offset_is_negative_infinity(copaque, offset))
			continue;

		if (!invariant_leq_nontarget_offset(state, child,
											targetkey, offset))
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("down-link lower bound invariant violated for index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Parent block=%u child index tid=(%u,%u) parent page lsn=%X/%X.",
										state->targetblock, childblock, offset,
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn)));
	}

	pfree(child);
}

/*
 * Per-tuple callback from IndexBuildHeapScan, used to determine if index has
 * all the entries that definitely should have been observed in leaf pages of
 * the target index (that is, all IndexTuples that were fingerprinted by our
 * Bloom filter).  All heapallindexed checks occur here.
 *
 * Theory of operation:
 *
 * The redundancy between an index and the table it indexes provides a good
 * opportunity to detect corruption, especially corruption within the table.
 * The high level principle behind the verification performed here is that any
 * IndexTuple that should be in an index following a fresh CREATE INDEX (based
 * on the same index definition) should also have been in the original,
 * existing index, which should have used exactly the same representation
 * (Index tuple formation is assumed to be deterministic, and IndexTuples are
 * assumed immutable; while the LP_DEAD bit is mutable, that's ItemId metadata,
 * which is not fingerprinted).  There will often be some dead-to-everyone
 * IndexTuples fingerprinted by the Bloom filter, but we only try to detect the
 * *absence of needed tuples*, so that's okay.
 *
 * Since the overall structure of the index has already been verified, the most
 * likely explanation for error here is a corrupt heap page (could be logical
 * or physical corruption).  Index corruption may still be detected here,
 * though.  Only readonly callers will have verified that left links and right
 * links are in agreement, and so it's possible that a leaf page transposition
 * within index is actually the source of corruption detected here (for
 * !readonly callers).  The checks performed only for readonly callers might
 * more accurately frame the problem as a cross-page invariant issue (this
 * could even be due to recovery not replaying all WAL records).  The !readonly
 * ERROR message raised here includes a HINT about retrying with readonly
 * verification, just in case it's a cross-page invariant issue, though that
 * isn't particularly likely.
 *
 * IndexBuildHeapScan() expects to be able to find the root tuple when a
 * heap-only tuple (the live tuple at the end of some HOT chain) needs to be
 * indexed, in order to replace the actual tuple's TID with the root tuple's
 * TID (which is what we're actually passed back here).  The index build heap
 * scan code will raise an error when a tuple that claims to be the root of the
 * heap-only tuple's HOT chain cannot be located.  This catches cases where the
 * original root item offset/root tuple for a HOT chain indicates (for whatever
 * reason) that the entire HOT chain is dead, despite the fact that the latest
 * heap-only tuple should be indexed.  When this happens, sequential scans may
 * always give correct answers, and all indexes may be considered structurally
 * consistent (i.e. the nbtree structural checks would not detect corruption).
 * It may be the case that only index scans give wrong answers, and yet heap or
 * SLRU corruption is the real culprit.  (While it's true that LP_DEAD bit
 * setting will probably also leave the index in a corrupt state before too
 * long, the problem is nonetheless that there is heap corruption.)
 *
 * Note also that heap-only tuple handling within IndexBuildHeapScan() detects
 * index tuples that contain the wrong values.  This can happen when there is
 * no superseding index tuple due to a faulty assessment of HOT safety.
 * Because the latest tuple's contents are used with the root TID, an error
 * will be raised when a tuple with the same TID but different (correct)
 * attribute values is passed back to us.  (Faulty assessment of HOT-safety was
 * behind the CREATE INDEX CONCURRENTLY bug that was fixed in February of
 * 2017.)
 */
static void
bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values,
						  bool *isnull, bool tupleIsAlive, void *checkstate)
{
	BtreeCheckState *state = (BtreeCheckState *) checkstate;
	IndexTuple	itup;

	Assert(state->heapallindexed);

	/* Must recheck visibility when only AccessShareLock held */
	if (!state->readonly)
	{
		TransactionId xmin;

		/*
		 * Don't test for presence in index where xmin not at least old
		 * enough that we know for sure that absence of index tuple wasn't
		 * just due to some transaction performing insertion after our
		 * verifying index traversal began.  (Actually, the cut-off used is a
		 * point where preceding write transactions must have
		 * committed/aborted.  We should have already fingerprinted all index
		 * tuples for all such preceding transactions, because the cut-off
		 * was established before our index traversal even began.)
		 *
		 * You might think that the fact that an MVCC snapshot is used by the
		 * heap scan (due to our indicating that this is the first scan of a
		 * CREATE INDEX CONCURRENTLY index build) would make this test
		 * redundant.  That's not quite true, because with current
		 * IndexBuildHeapScan() interface caller cannot do the MVCC snapshot
		 * acquisition itself.  Heap tuple coverage is thereby similar to the
		 * coverage we could get by using earliest transaction snapshot
		 * directly.  It's easier to do this than to adopt the
		 * IndexBuildHeapScan() interface to our narrow requirements.
		 */
		Assert(tupleIsAlive);
		xmin = HeapTupleHeaderGetXmin(htup->t_data);
		if (!TransactionIdPrecedes(xmin, TransactionXmin))
			return;
	}

	/*
	 * Generate an index tuple.
	 *
	 * Note that we rely on deterministic index_form_tuple() TOAST
	 * compression.  If index_form_tuple() was ever enhanced to compress
	 * datums out-of-line, or otherwise varied when or how compression was
	 * applied, our assumption would break, leading to false positive reports
	 * of corruption.  For now, we don't decompress/normalize toasted values
	 * as part of fingerprinting.
	 */
	itup = index_form_tuple(RelationGetDescr(index), values, isnull);
	itup->t_tid = htup->t_self;

	/* Probe Bloom filter -- tuple should be present */
	if (bloom_lacks_element(state->filter, (unsigned char *) itup,
							IndexTupleSize(itup)))
		ereport(ERROR,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"",
						ItemPointerGetBlockNumber(&(itup->t_tid)),
						ItemPointerGetOffsetNumber(&(itup->t_tid)),
						RelationGetRelationName(state->heaprel),
						RelationGetRelationName(state->rel)),
				 !state->readonly ?
				 errhint("Retrying verification using the function bt_index_parent_check() might provide a more specific error.") : 0));

	state->heaptuplespresent++;
	pfree(itup);
}

/*
 * Is particular offset within page (whose special state is passed by caller)
 * the page negative-infinity item?
 *
 * As noted in comments above _bt_compare(), there is special handling of the
 * first data item as a "negative infinity" item.  The hard-coding within
 * _bt_compare() makes comparing this item for the purposes of verification
 * pointless at best, since the IndexTuple only contains a valid TID (a
 * reference TID to child page).
 */
static inline bool
offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset)
{
	/*
	 * For internal pages only, the first item after high key, if any, is
	 * negative infinity item.  Internal pages always have a negative infinity
	 * item, whereas leaf pages never have one.  This implies that negative
	 * infinity item is either first or second line item, or there is none
	 * within page.
	 *
	 * Right-most pages don't have a high key, but could be said to
	 * conceptually have a "positive infinity" high key.  Thus, there is a
	 * symmetry between down link items in parent pages, and high keys in
	 * children.  Together, they represent the part of the key space that
	 * belongs to each page in the index.  For example, all children of the
	 * root page will have negative infinity as a lower bound from root
	 * negative infinity downlink, and positive infinity as an upper bound
	 * (implicitly, from "imaginary" positive infinity high key in root).
	 */
	return !P_ISLEAF(opaque) && offset == P_FIRSTDATAKEY(opaque);
}

/*
 * Does the invariant hold that the key is less than or equal to a given upper
 * bound offset item?
 *
 * If this function returns false, convention is that caller throws error due
 * to corruption.
*/ static inline bool invariant_leq_offset(BtreeCheckState *state, ScanKey key, OffsetNumber upperbound) { int16 natts = state->rel->rd_rel->relnatts; int32 cmp; cmp = _bt_compare(state->rel, natts, key, state->target, upperbound); return cmp <= 0; } /* * Does the invariant hold that the key is greater than or equal to a given * lower bound offset item? * * If this function returns false, convention is that caller throws error due * to corruption. */ static inline bool invariant_geq_offset(BtreeCheckState *state, ScanKey key, OffsetNumber lowerbound) { int16 natts = state->rel->rd_rel->relnatts; int32 cmp; cmp = _bt_compare(state->rel, natts, key, state->target, lowerbound); return cmp >= 0; } /* * Does the invariant hold that the key is less than or equal to a given upper * bound offset item, with the offset relating to a caller-supplied page that * is not the current target page? Caller's non-target page is typically a * child page of the target, checked as part of checking a property of the * target page (i.e. the key comes from the target). * * If this function returns false, convention is that caller throws error due * to corruption. */ static inline bool invariant_leq_nontarget_offset(BtreeCheckState *state, Page nontarget, ScanKey key, OffsetNumber upperbound) { int16 natts = state->rel->rd_rel->relnatts; int32 cmp; cmp = _bt_compare(state->rel, natts, key, nontarget, upperbound); return cmp <= 0; } /* * Given a block number of a B-Tree page, return page in palloc()'d memory. * While at it, perform some basic checks of the page. * * There is never an attempt to get a consistent view of multiple pages using * multiple concurrent buffer locks; in general, we only acquire a single pin * and buffer lock at a time, which is often all that the nbtree code requires. * * Operating on a copy of the page is useful because it prevents control * getting stuck in an uninterruptible state when an underlying operator class * misbehaves. 
*/ static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) { Buffer buffer; Page page; BTPageOpaque opaque; page = palloc(BLCKSZ); /* * We copy the page into local storage to avoid holding pin on the buffer * longer than we must. */ buffer = ReadBufferExtended(state->rel, MAIN_FORKNUM, blocknum, RBM_NORMAL, state->checkstrategy); LockBuffer(buffer, BT_READ); /* * Perform the same basic sanity checking that nbtree itself performs for * every page: */ _bt_checkpage(state->rel, buffer); /* Only use copy of page in palloc()'d memory */ memcpy(page, BufferGetPage(buffer), BLCKSZ); UnlockReleaseBuffer(buffer); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (opaque->btpo_flags & BTP_META && blocknum != BTREE_METAPAGE) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("invalid meta page found at block %u in index \"%s\"", blocknum, RelationGetRelationName(state->rel)))); /* Check page from block that ought to be meta page */ if (blocknum == BTREE_METAPAGE) { BTMetaPageData *metad = BTPageGetMeta(page); if (!(opaque->btpo_flags & BTP_META) || metad->btm_magic != BTREE_MAGIC) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" meta page is corrupt", RelationGetRelationName(state->rel)))); if (metad->btm_version != BTREE_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("version mismatch in index \"%s\": file version %d, code version %d", RelationGetRelationName(state->rel), metad->btm_version, BTREE_VERSION))); } /* * Deleted pages have no sane "level" field, so can only check non-deleted * page level */ if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && opaque->btpo.level != 0) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("invalid leaf page level %u for block %u in index \"%s\"", opaque->btpo.level, blocknum, RelationGetRelationName(state->rel)))); if (blocknum != BTREE_METAPAGE && !P_ISLEAF(opaque) && !P_ISDELETED(opaque) && opaque->btpo.level == 0) ereport(ERROR, 
(errcode(ERRCODE_INDEX_CORRUPTED), errmsg("invalid internal page level 0 for block %u in index \"%s\"", opaque->btpo.level, RelationGetRelationName(state->rel)))); if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque)) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("internal page block %u in index \"%s\" has garbage items", blocknum, RelationGetRelationName(state->rel)))); return page; }