==> pg_squeeze-REL1_8_0/.dir-locals.el <==
((c-mode . ((c-basic-offset . 4)
	    (c-file-style . "bsd")
	    (fill-column . 78)
	    (indent-tabs-mode . t)
	    (tab-width . 4))))

==> pg_squeeze-REL1_8_0/.github/workflows/regression.yml <==
name: Build

on: [push, pull_request]

jobs:
  build:
    runs-on: ubuntu-latest

    defaults:
      run:
        shell: sh

    strategy:
      matrix:
        pgversion:
          - 17
          - 16
          - 15
          - 14
          - 13
          - 12

    env:
      PGVERSION: ${{ matrix.pgversion }}

    steps:
      - name: checkout
        uses: actions/checkout@v3

      - name: install pg
        run: |
          sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -v $PGVERSION -p -i
          sudo -u postgres createuser -s "$USER"

      - name: build
        run: |
          make PROFILE="-Werror"
          sudo -E make install

      - name: test
        run: |
          sudo pg_conftool set shared_preload_libraries pg_squeeze
          sudo pg_conftool set wal_level logical
          sudo pg_ctlcluster $PGVERSION main restart
          make installcheck

      - name: show regression diffs
        if: ${{ failure() }}
        run: |
          cat regression.diffs

==> pg_squeeze-REL1_8_0/.gitignore <==
*~
*.o
*.so
results/
GPATH
GRTAGS
GTAGS

==> pg_squeeze-REL1_8_0/LICENSE <==
Copyright (c) 2016-2023, CYBERTEC PostgreSQL International GmbH

Permission to use, copy, modify, and distribute this software and its
documentation for any purpose, without fee, and without a written agreement
is hereby granted, provided that the above copyright notice and this
paragraph and the following two paragraphs appear in all copies.

IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR DIRECT,
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST
PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT
NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS,
AND CYBERTEC International GmbH HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE,
SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.

==> pg_squeeze-REL1_8_0/Makefile <==
PG_CONFIG ?= pg_config

MODULE_big = pg_squeeze
OBJS = pg_squeeze.o concurrent.o worker.o pgstatapprox.o $(WIN32RES)
PGFILEDESC = "pg_squeeze - a tool to remove unused space from a relation."
EXTENSION = pg_squeeze
DATA = pg_squeeze--1.2.sql pg_squeeze--1.2--1.3.sql pg_squeeze--1.3--1.4.sql \
	pg_squeeze--1.4--1.5.sql pg_squeeze--1.5--1.6.sql pg_squeeze--1.6--1.7.sql \
	pg_squeeze--1.7--1.8.sql
DOCS = pg_squeeze.md

REGRESS = squeeze

PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)

==> pg_squeeze-REL1_8_0/NEWS <==
Release 1.8.0
=============

This release fixes a bug in the squeeze.squeeze_table() function which can
cause data corruption. The problem can occur if other clients run DDL
statements or squeeze other tables around the time the function starts its
execution.

Automatic launching by the scheduler worker (as described in the "Register
table for regular processing" section of README) is not affected.

Release 1.7.0
=============

New features
------------

1. Make the code compatible with PostgreSQL server version 17.

2. Do not copy the values of dropped columns into the new table storage
   (file). So far, only whole rows (the deleted ones) were considered to
   contribute to the table bloat. Now we also reclaim the space occupied by
   dropped columns.

3. Let the squeeze.squeeze_table() function raise an ERROR. So far, the
   function only inserted a record into the squeeze.log table if the
   processing was successful, and into squeeze.errors if it failed. Now, in
   the case of failure, the function also prints the error message to the
   console.

4. Do not let the squeeze workers run if there is currently no work for
   them. For automatic processing, one scheduler worker per database needs
   to run, but the actual squeeze worker is only started if a particular
   table appears to be bloated. Once done with the processing, the squeeze
   worker exits instead of waiting for another task.

5. Improved parallelism of the squeeze workers. Multiple worker processes
   (one per table) can be used since release 1.6, but so far it was still
   possible that the work got serialized due to the logical decoding
   setup. Those limitations have been relaxed in 1.7.

Bug fixes
---------

1. Fixed broken setup of "shared memory hooks". This could cause a server
   crash if other extensions installed their hooks as well.
   (https://github.com/cybertec-postgresql/pg_squeeze/issues/68)

2. Fixed evaluation of the squeeze.max_xlock_time configuration
   variable. Due to this bug, pg_squeeze behaved as if the timeout was set
   to a much lower value. (The bug had no effect if the variable was set to
   0, which is the default.)

3. Fixed permission checks for the squeeze.pgstattuple_approx()
   function. Like other functions of this extension, the SUPERUSER role
   attribute is not needed for execution; the REPLICATION attribute is
   sufficient.

4. Update BRIN indexes when appropriate. If a row was updated during the
   table processing and only the attributes of "summarizing" (BRIN) indexes
   changed, pg_squeeze could fail to update those indexes. This bug only
   affects pg_squeeze on PostgreSQL 16.

Release 1.6.1
=============

Bug fixes
---------

1. Pass the function argument as a Datum rather than as the "raw"
   timestamp. The bug can cause a crash on platforms where timestamp is
   passed by reference (which usually means 32-bit systems).

Release 1.6.0
=============

New features
------------

1. Make the code compatible with PostgreSQL server version 16.

2. Allow processing of multiple tables at a time, even if they are in the
   same database.

3. Enhanced monitoring - see the squeeze.get_active_workers() function as
   well as the new columns of the squeeze.log table.

Bug fixes
---------
1. Fixed the mechanism that checks the number of workers per database. This
   also fixes the stop_worker() function so that it does not miss workers
   anymore.

Note
----

The behavior of the squeeze_table() function changes in this version, see
README for more information.

Release 1.5.0
=============

This release only updates the code so that it is compatible with PostgreSQL
server version 15.

Release 1.4.1
=============

Bug fixes
---------

1. Adapted the code to the changed API of the PostgreSQL core code. The
   signature of NewHeapCreateToastTable() changed between PG 14beta3 and
   14rc1.

Release 1.4.0
=============

Bug fixes
---------

1. Fixed determination of the WAL insert position before processing the
   concurrent changes. Without this fix, some data changes performed during
   the initial load by sessions which have synchronous_commit=off can be
   lost.

New features
------------

1. Decode WAL even during the initial load, although the changes decoded
   cannot be applied during the load. This allows WAL segments to be
   recycled during the load. Note that a huge amount of logical changes can
   be generated by the initial load itself, but these do not have to be
   decoded. Thus only changes done by other transactions need to be stored,
   and applied as soon as the initial load has completed.

2. Process even tables with FULL replica identity, as long as they do have
   a primary key.

Release 1.3.1
=============

Bug fixes
---------

1. Fixed failure to run the CREATE EXTENSION command on PostgreSQL server
   v10. The problem is that PG10 does not support arrays of domain type.

Release 1.3.0
=============

New features
------------

1. Support for PostgreSQL 13.

2. Enhanced scheduling. Instead of passing an array of timetz values, the
   DBA can now specify the schedule in a way similar to crontab, i.e. days
   can be specified too. Note that the "ALTER EXTENSION pg_squeeze UPDATE"
   command does not handle conversion of the schedule - instead it deletes
   the existing contents of the "squeeze"."tables" configuration table.

   Another enhancement is that two background workers are now used: one
   that checks the schedule and another one that checks the amount of bloat
   and possibly squeezes the tables. This makes the scheduling more
   accurate.

Bug fixes
---------

1. Release the replication slot if ERROR is encountered by the background
   worker during the call of the squeeze_table() function. For interactive
   calls, pg_squeeze does not have to care because PG core does.

2. Release the lock on the pg_squeeze extension (which ensures that no more
   than one instance of each background worker can be launched) when
   exiting due to pg_cancel_backend().

Release 1.2.0
=============

New features
------------

1. Support for PostgreSQL 12.

2. Do not hide old row versions from VACUUM for longer than necessary. The
   "initial load" phase of the squeeze process uses a "historic snapshot"
   to retrieve the table data as it was when the squeeze_table() function
   started. This data must be protected from VACUUM. pg_squeeze uses a
   replication slot for this purpose, so it's not possible to protect only
   one particular table from VACUUM. Since this approach affects the whole
   cluster, the protection needs to be released as soon as the initial load
   has completed.

3. Try harder to make WAL segments available for archiving. Data changes
   performed during the squeeze process are retrieved from WAL. Now we
   check more often whether a particular WAL segment is still needed by
   pg_squeeze.

Release 1.1.1
=============

Bug fixes
---------

1. Fixed failure to find the equality operator for index scan.
   The failure was observed when the column type (e.g. varchar) differed
   from the operator class input type (e.g. text).

2. Use index reloptions in the correct format when creating indexes on the
   new table. The bug could cause a crash, however it should not lead to
   corruption of the original reloptions since the new catalog entries are
   eventually dropped (only the storage of the new index is used).

Release 1.1.0
=============

New features
------------

1. Instead of configuring "first_check" and "task_interval", the
   administrator now passes a "schedule" array of timestamps at which the
   table should be squeezed.

2. A table no longer needs to have the user_catalog_table storage option
   set before it can be passed to the squeeze_table() function. Now the
   INSERT ... ON CONFLICT ... command no longer raises an ERROR when trying
   to access a table that is just being squeezed. Another advantage of this
   change is that calls of the squeeze_table() function are now easier.

3. No longer disable autovacuum for the table being squeezed. The
   additional code complexity is probably not worth it, and it does not
   stop an already running VACUUM. It's the responsibility of the DBA not
   to allow VACUUM to waste resources on a table which should be treated by
   pg_squeeze.

Bug fixes
---------

1. Close the pg_class relation and its scan if exiting early (i.e. a
   catalog change was detected).

2. Defend against race conditions that allow for applying of the concurrent
   changes which have already been captured by the initial load. In such a
   case the squeeze_table() function can raise an ERROR due to an attempt
   to insert a duplicate value of the identity key, or due to a failure to
   update or delete a row that no longer exists.

3. Use the correct snapshot for the initial load. So far the
   squeeze_table() function could have used an obsolete snapshot if some
   transaction(s) started after the snapshot builder reached the
   SNAPBUILD_FULL_SNAPSHOT state and committed before
   SNAPBUILD_CONSISTENT. Both the initial load and the logical decoding
   used to miss such transactions. This problem could only happen after the
   bug fix #2 had been implemented. Before that, at least the logical
   decoding captured the problematic transactions because it (the decoding)
   started earlier than necessary. No data loss was reported so far, but we
   still recommend you to upgrade pg_squeeze to the version that includes
   this fix.

4. Do not use the "initial load memory context" to fetch tuples from table
   or index. If the load takes place in multiple "batches" (i.e. tuplesort
   is not used), that memory context gets freed before the next batch
   starts and the call of the squeeze_table() function results in SEGFAULT.

5. Do not close the relation cache entry for an index if some field is yet
   to be accessed.

6. Include composite types in the checks of concurrent catalog changes. If
   a table has a column of a composite data type and the type is altered
   during the squeeze, the squeeze_table() function raises an ERROR.

Release 1.0.1
=============

Bug fixes
---------

1. Try harder to avoid out-of-memory conditions during the initial table
   load.

==> pg_squeeze-REL1_8_0/README.md <==
PostgreSQL extension that removes unused space from a table and optionally
sorts tuples according to a particular index (as if the [CLUSTER][2] command
was executed concurrently with regular reads / writes). In fact, we try to
replace the [`pg_repack`][1] extension.

While providing very similar functionality, `pg_squeeze` takes a different
approach as it:

1. Implements the functionality purely on the server side.

2. Utilizes recent improvements of the PostgreSQL database server.

While (1) makes both configuration and use simpler (compared to
[pg_repack][1], which uses both server and client side code), it also allows
for a rather smooth implementation of unattended processing using
[background workers][3]. As for (2), one important difference (besides the
use of background workers) is that we use [logical decoding][4] instead of
triggers to capture concurrent changes.
# INSTALL

Install PostgreSQL before proceeding, and make sure the `pg_config` binary
is available; it is typically included in `-dev` and `-devel` packages.

```bash
git clone https://github.com/cybertec-postgresql/pg_squeeze.git
cd pg_squeeze
make
make install
```

Add these to `postgresql.conf`:

```
wal_level = logical
max_replication_slots = 1 # ... or add 1 to the current value.
shared_preload_libraries = 'pg_squeeze' # ... or add the library to the existing ones.
```

Restart the cluster, and invoke:

```
CREATE EXTENSION pg_squeeze;
```

*Note: when upgrading a database cluster with pg_squeeze installed (either
using `pg_dumpall`/restore or `pg_upgrade`), make sure that the new cluster
has `pg_squeeze` in `shared_preload_libraries` *before* you upgrade,
otherwise the upgrade will fail.*

# Register table for regular processing

First, make sure that your table has either a primary key or a unique
constraint. This is necessary to process changes that other transactions
might make while `pg_squeeze` is doing its work.

To make the `pg_squeeze` extension aware of the table, you need to insert a
record into the `squeeze.tables` table. Once added, statistics of the table
are checked periodically. Whenever the table meets the criteria to be
"squeezed", a "task" is added to a queue. The tasks are processed
sequentially, in the order they were created.

The simplest "registration" looks like:

```
INSERT INTO squeeze.tables (tabschema, tabname, schedule)
VALUES ('public', 'foo', ('{30}', '{22}', NULL, NULL, '{3, 5}'));
```

Additional columns can be specified optionally, for example:

```
INSERT INTO squeeze.tables (
    tabschema,
    tabname,
    schedule,
    free_space_extra,
    vacuum_max_age,
    max_retry
)
VALUES (
    'public',
    'bar',
    ('{30}', '{22}', NULL, NULL, '{3, 5}'),
    30,
    '2 hours',
    2
);
```

Following is the complete description of the table metadata.

* `tabschema` and `tabname` are schema and table name respectively.

* `schedule` column tells when the table should be checked, and possibly
  squeezed. The schedule is described by a value of the following composite
  data type, which resembles a [crontab][6] entry:

  ```
  CREATE TYPE schedule AS (
      minutes minute[],
      hours hour[],
      days_of_month dom[],
      months month[],
      days_of_week dow[]
  );
  ```

  Here, `minutes` (0-59) and `hours` (0-23) specify the time of the check
  within a day, while `days_of_month` (1-31), `months` (1-12) and
  `days_of_week` (0-7, where both 0 and 7 stand for Sunday) determine the
  day of the check.

  The check is performed if `minute`, `hour` and `month` all match the
  current timestamp, while a NULL value means any minute, hour and month
  respectively. As for `days_of_month` and `days_of_week`, at least one of
  these needs to match the current timestamp, or both need to be NULL for
  the check to take place.

  For example, the entries above tell that table `public`.`bar` should be
  checked every Wednesday and Friday at 22:30.

* `free_space_extra` is the minimum percentage of `extra free space` needed
  to trigger processing of the table. The `extra` adjective refers to the
  fact that free space derived from `fillfactor` is not a reason to squeeze
  the table. For example, if `fillfactor` is equal to 60, then at least 40
  percent of each page should stay free during normal operation. If you
  want to ensure that 70 percent of free space makes pg_squeeze interested
  in the table, set `free_space_extra` to 30 (that is, 70 percent required
  to be free minus the 40 percent free due to the `fillfactor`).

  The default value of `free_space_extra` is 50.

* `min_size` is the minimum disk space in megabytes the table must occupy
  to be eligible for processing. The default value is 8.

* `vacuum_max_age` is the maximum time since the completion of the last
  VACUUM to consider the free space map (FSM) fresh. Once this interval has
  elapsed, the portion of dead tuples might be significant and so more
  effort than simply checking the FSM needs to be spent to evaluate the
  potential effect of `pg_squeeze`. The default value is 1 hour.

* `max_retry` is the maximum number of extra attempts to squeeze a table if
  the first processing of the corresponding task failed. A typical reason
  to retry the processing is that the table definition changed while the
  table was being squeezed. If the number of retries is reached, processing
  of the table is considered complete. The next task is created as soon as
  the next scheduled time is reached.

  The default value of `max_retry` is 0 (i.e. do not retry).

* `clustering_index` is an existing index of the processed table. Once the
  processing is finished, tuples of the table will be physically sorted by
  the key of this index.

* `rel_tablespace` is an existing tablespace the table should be moved
  into. NULL means that the table should stay where it is.

* `ind_tablespaces` is a two-dimensional array in which each row specifies
  the tablespace mapping of an index. The first and the second columns
  represent index name and tablespace name respectively. All indexes for
  which no mapping is specified will stay in the original tablespace.

  Regarding tablespaces, one special case is worth mentioning: if a
  tablespace is specified for the table but not for the indexes, the table
  gets moved to that tablespace but the indexes stay in the original one
  (i.e. the tablespace of the table is not the default for the indexes, as
  one might expect).

* `skip_analyze` indicates that table processing should not be followed by
  the ANALYZE command. The default value is `false`, meaning ANALYZE is
  performed by default.

`squeeze.tables` **is the only table the user should modify. If you want to
change anything else, make sure you perfectly understand what you are
doing.**
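The registration can be adjusted later with a plain UPDATE of
`squeeze.tables`. As a small sketch (reusing the table registered above),
the following moves the checks of `public`.`bar` to 2:00 and 14:00 every
day:

```
UPDATE squeeze.tables
SET schedule = ('{0}', '{2, 14}', NULL, NULL, NULL)
WHERE tabschema = 'public' AND tabname = 'bar';
```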
# Ad-hoc processing for any table

It's also possible to squeeze tables manually without registering
(i.e. without inserting the corresponding record into `squeeze.tables`),
and without prior checking of the actual bloat.

Function signature:

```
squeeze.squeeze_table(
    tabchema name,
    tabname name,
    clustering_index name,
    rel_tablespace name,
    ind_tablespaces name[]
)
```

Sample execution:

```
SELECT squeeze.squeeze_table('public', 'pgbench_accounts');
```

Note that the function is not transactional: it only starts a background
worker, tells it which table it should process and exits. A rollback of the
transaction the function was called in does not revert the changes done by
the worker.
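The optional arguments can be passed too. A hypothetical call (the index
and tablespace names below are made up for illustration) that also clusters
the rows and moves the table might look like this:

```
-- Squeeze "public"."bar", physically sort the rows by the index
-- "bar_pkey" and move the table to the tablespace "archive_space";
-- the indexes stay in their original tablespace.
SELECT squeeze.squeeze_table('public', 'bar', 'bar_pkey',
                             'archive_space', NULL);
```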
# Enable / disable table processing

To enable processing of bloated tables, run this statement as superuser:

```
SELECT squeeze.start_worker();
```

The function starts a background worker (the `scheduler worker`) that
periodically checks which of the registered tables should be checked for
bloat, and creates a task for each. Another worker (a `squeeze worker`) is
launched whenever a task exists for a particular database.

If the scheduler worker is already running for the current database, the
function does not report an error, but the new worker will exit
immediately.

If the workers are running for the current database, you can use the
following statement to stop them:

```
SELECT squeeze.stop_worker();
```

**Only the functions mentioned in this documentation are considered user
interface. If you want to call any other one, make sure you perfectly
understand what you're doing.**

If you want the background workers to start automatically during startup of
the whole PostgreSQL cluster, add entries like this to the
`postgresql.conf` file:

```
squeeze.worker_autostart = 'my_database your_database'
squeeze.worker_role = postgres
```

Next time you start the cluster, two or more workers (i.e. one `scheduler
worker` and one or more `squeeze workers`) will be launched for
`my_database`, and the same for `your_database`. If you take this approach,
note that any worker will either refuse to start or will stop without doing
any work if either:

1. the `pg_squeeze` extension does not exist in the database, or

2. the `squeeze.worker_role` parameter specifies a role which does not have
   superuser privileges.

*The functions/configuration variables explained above use the singular
form of the word `worker`, although there are actually two workers. This is
because only one worker existed in previous versions of pg_squeeze, which
ensured both scheduling and execution of the tasks. This implementation
change is probably not worth forcing all users to adjust their
configuration files during an upgrade.*

# Control the impact on other backends

Although the table being squeezed is available for both read and write
operations by other transactions most of the time, an exclusive lock is
needed to finalize the processing. If pg_squeeze occasionally seems to
block access to tables too much, consider setting the
`squeeze.max_xlock_time` GUC parameter. For example:

```
SET squeeze.max_xlock_time TO 100;
```

This tells pg_squeeze that the exclusive lock shouldn't be held for more
than 0.1 second (100 milliseconds). If more time is needed for the final
stage, pg_squeeze releases the exclusive lock, processes changes committed
by other transactions in between, and tries the final stage again. An error
is reported if the lock duration is exceeded a few more times. If that
happens, you should either increase the setting or schedule processing of
the problematic table to a different time of day, when the write activity
is lower.

# Running multiple workers per database

If you think that a single squeeze worker does not cope with the load,
consider setting the `squeeze.workers_per_database` configuration variable
to a value higher than 1. Then the `pg_squeeze` extension will be able to
process multiple tables at a time - one table per squeeze worker. However,
be aware that this setting affects all databases in which you actively use
the `pg_squeeze` extension. The total number of all the squeeze workers in
the cluster (including the "scheduler workers") cannot exceed the in-core
configuration variable `max_worker_processes`.

# Monitoring

* `squeeze.log` table contains one entry per successfully squeezed
  table. The columns `tabschema` and `tabname` identify the processed
  table. The columns `started` and `finished` tell when the processing
  started and finished. `ins_initial` is the number of tuples inserted into
  the new table storage during the "initial load stage", i.e. the number of
  tuples present in the table before the processing started. On the other
  hand, `ins`, `upd` and `del` are the numbers of tuples inserted, updated
  and deleted by applications during the table processing. (These
  "concurrent data changes" must also be incorporated into the squeezed
  table, otherwise they'd get lost.)

* `squeeze.errors` table contains errors that happened during squeezing. A
  common problem reported here is that someone changed the definition
  (e.g. added or removed a column) of the table whose processing was just
  in progress.

* `squeeze.get_active_workers()` function returns a table of squeeze
  workers which are just processing tables in the current database. The
  `pid` column contains the system PID of the worker process. The other
  columns have the same meaning as their counterparts in the `squeeze.log`
  table. While the `squeeze.log` table only shows information on the
  completed squeeze operations, the `squeeze.get_active_workers()` function
  lets you check the progress during the processing (see the example
  queries below).
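For instance, a quick way to watch the progress and to review recent runs
(a sketch using only the objects described above):

```
-- Tables currently being squeezed in this database, with live counters.
SELECT * FROM squeeze.get_active_workers();

-- The most recent successfully completed squeeze operations.
SELECT tabschema, tabname, started, finished, ins_initial, ins, upd, del
FROM squeeze.log
ORDER BY finished DESC
LIMIT 10;
```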
# Unregister table

If a particular table should no longer be subject to periodical squeezes,
simply delete the corresponding row from the `squeeze.tables` table.

It's also a good practice to unregister a table that you're going to drop,
although the background worker does unregister non-existing tables
periodically.

# Upgrade

Make sure to install PostgreSQL and `pg_config`, see the
[install](#install) section.

```bash
make # Compile the newer version.
pg_ctl -D /path/to/cluster stop # Stop the cluster.
make install
pg_ctl -D /path/to/cluster start # Start the cluster.
```

Connect to each database containing `pg_squeeze` and run this command:

```
ALTER EXTENSION pg_squeeze UPDATE;
```

# Upgrade from 1.2.x

**As there's no straightforward way to migrate the scheduling information
(see the notes on the `schedule` column of the `squeeze.tables` table)
automatically, and as the `schedule` column must not contain NULL values,
the upgrade deletes the contents of the `squeeze.tables` table. Please
export the table contents to a file before you perform the upgrade and
configure the checks of those tables again as soon as the upgrade is
done.**

# Concurrency

1. The extension does not prevent other transactions from altering a table
   at certain stages of the processing. If a "disruptive command"
   (i.e. `ALTER TABLE`, `VACUUM FULL`, `CLUSTER` or `TRUNCATE`) manages to
   commit before the squeeze could finish, the `squeeze_table()` function
   aborts and all changes done to the table are rolled back. The
   `max_retry` column of the `squeeze.tables` table determines how many
   times the squeeze worker will retry. Besides that, a change of schedule
   might help you to avoid disruptions.

2. Like [`pg_repack`][1], `pg_squeeze` also changes visibility of rows and
   thus allows for the MVCC-unsafe behavior described in the first
   paragraph of [mvcc-caveats][5].

Disk Space Requirements
-----------------------

Performing a full-table squeeze requires free disk space about twice as
large as the target table and its indexes. For example, if the total size
of the tables and indexes to be squeezed is 1GB, an additional 2GB of disk
space is required.
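As a rough sketch (the table name below is only an example), the expected
extra space can be estimated before squeezing:

```
-- About twice the total size of the table (including indexes and TOAST)
-- should be available before squeezing it.
SELECT pg_size_pretty(2 * pg_total_relation_size('public.pgbench_accounts'))
       AS approximate_free_space_needed;
```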
[1]: https://reorg.github.io/pg_repack/ [2]: https://www.postgresql.org/docs/13/static/sql-cluster.html [3]: https://www.postgresql.org/docs/13/static/bgworker.html [4]: https://www.postgresql.org/docs/13/static/logicaldecoding.html [5]: https://www.postgresql.org/docs/13/static/mvcc-caveats.html [6]: https://www.freebsd.org/cgi/man.cgi?query=crontab&sektion=5&apropos=0&manpath=FreeBSD+12.1-RELEASE+and+Ports pg_squeeze-REL1_8_0/concurrent.c000066400000000000000000000543241474466764000167540ustar00rootroot00000000000000/*----------------------------------------------------------------------------------- * * concurrent.c * Module to handle changes that took place while new table was being * created * * Copyright (c) 2016-2024, CYBERTEC PostgreSQL International GmbH * *----------------------------------------------------------------------------------- */ #include "pg_squeeze.h" #if PG_VERSION_NUM >= 130000 #include "access/heaptoast.h" #endif #include "executor/executor.h" #include "replication/decode.h" #include "utils/rel.h" #if PG_VERSION_NUM < 150000 extern PGDLLIMPORT int wal_segment_size; #endif static void apply_concurrent_changes(DecodingOutputState *dstate, Relation relation, ScanKey key, int nkeys, IndexInsertState *iistate, struct timeval *must_complete); static bool processing_time_elapsed(struct timeval *utmost); static void plugin_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, bool is_init); static void plugin_shutdown(LogicalDecodingContext *ctx); static void plugin_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn); static void plugin_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, XLogRecPtr commit_lsn); static void plugin_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, Relation rel, ReorderBufferChange *change); static void store_change(LogicalDecodingContext *ctx, ConcurrentChangeKind kind, HeapTuple tuple); static HeapTuple get_changed_tuple(ConcurrentChange *change); static bool plugin_filter(LogicalDecodingContext *ctx, RepOriginId origin_id); /* * Decode and apply concurrent changes. If there are too many of them, split * the processing into multiple iterations so that the intermediate storage * (tuplestore) is not likely to be written to disk. * * See check_catalog_changes() for explanation of lock_held argument. * * Returns true if must_complete is NULL or if managed to complete by the time * *must_complete indicates. */ bool process_concurrent_changes(LogicalDecodingContext *ctx, XLogRecPtr end_of_wal, CatalogState *cat_state, Relation rel_dst, ScanKey ident_key, int ident_key_nentries, IndexInsertState *iistate, LOCKMODE lock_held, struct timeval *must_complete) { DecodingOutputState *dstate; bool done; dstate = (DecodingOutputState *) ctx->output_writer_private; /* * If some changes could not be applied due to time constraint, make sure * the tuplestore is empty before we insert new tuples into it. */ if (dstate->nchanges > 0) apply_concurrent_changes(dstate, rel_dst, ident_key, ident_key_nentries, iistate, NULL); Assert(dstate->nchanges == 0); done = false; while (!done) { exit_if_requested(); done = decode_concurrent_changes(ctx, end_of_wal, must_complete); if (processing_time_elapsed(must_complete)) /* Caller is responsible for applying the changes. */ return false; if (dstate->nchanges == 0) continue; /* Make sure the changes are still applicable. */ check_catalog_changes(cat_state, lock_held); /* * XXX Consider if it's possible to check *must_complete and stop * processing partway through. 
Partial cleanup of the tuplestore seems * non-trivial. */ apply_concurrent_changes(dstate, rel_dst, ident_key, ident_key_nentries, iistate, must_complete); if (processing_time_elapsed(must_complete)) /* Like above. */ return false; } return true; } /* * Decode logical changes from the XLOG sequence up to end_of_wal. * * Returns true iff done (for now), i.e. no more changes below the end_of_wal * can be decoded. */ bool decode_concurrent_changes(LogicalDecodingContext *ctx, XLogRecPtr end_of_wal, struct timeval *must_complete) { DecodingOutputState *dstate; ResourceOwner resowner_old; #if PG_VERSION_NUM < 130000 /* Workaround for XLogBeginRead() in setup_decoding(). */ static bool first_time = true; XLogRecPtr startptr; #endif /* * Invalidate the "present" cache before moving to "(recent) history". * * Note: The cache entry of the transient relation is not affected * (because it was created by the current transaction), but the tuple * descriptor shouldn't change anyway (as opposed to index info, which we * change at some point). Moreover, tuples of the transient relation * should not actually be deconstructed: reorderbuffer.c records the * tuples, but - as it never receives the corresponding commit record - * does not examine them in detail. */ InvalidateSystemCaches(); dstate = (DecodingOutputState *) ctx->output_writer_private; resowner_old = CurrentResourceOwner; CurrentResourceOwner = dstate->resowner; PG_TRY(); { while (ctx->reader->EndRecPtr < end_of_wal) { XLogRecord *record; XLogSegNo segno_new; char *errm = NULL; XLogRecPtr end_lsn; #if PG_VERSION_NUM < 130000 if (first_time) { startptr = MyReplicationSlot->data.restart_lsn; first_time = false; } else startptr = InvalidXLogRecPtr; #endif record = XLogReadRecord(ctx->reader, #if PG_VERSION_NUM < 130000 startptr, #endif &errm); if (errm) elog(ERROR, "%s", errm); if (record != NULL) LogicalDecodingProcessRecord(ctx, ctx->reader); if (processing_time_elapsed(must_complete)) break; /* * If WAL segment boundary has been crossed, inform PG core that * we no longer need the previous segment. */ end_lsn = ctx->reader->EndRecPtr; XLByteToSeg(end_lsn, segno_new, wal_segment_size); if (segno_new != squeeze_current_segment) { LogicalConfirmReceivedLocation(end_lsn); elog(DEBUG1, "pg_squeeze: confirmed receive location %X/%X", (uint32) (end_lsn >> 32), (uint32) end_lsn); squeeze_current_segment = segno_new; } exit_if_requested(); } InvalidateSystemCaches(); CurrentResourceOwner = resowner_old; } PG_CATCH(); { InvalidateSystemCaches(); CurrentResourceOwner = resowner_old; PG_RE_THROW(); } PG_END_TRY(); elog(DEBUG1, "pg_squeeze: %.0f changes decoded but not applied yet", dstate->nchanges); return ctx->reader->EndRecPtr >= end_of_wal; } /* * Apply changes that happened during the initial load. * * Scan key is passed by caller, so it does not have to be constructed * multiple times. Key entries have all fields initialized, except for * sk_argument. */ static void apply_concurrent_changes(DecodingOutputState *dstate, Relation relation, ScanKey key, int nkeys, IndexInsertState *iistate, struct timeval *must_complete) { TupleTableSlot *slot; TupleTableSlot *ind_slot; Form_pg_index ident_form; int2vector *ident_indkey; HeapTuple tup_old = NULL; BulkInsertState bistate = NULL; if (dstate->nchanges == 0) return; /* Info needed to retrieve key values from heap tuple. */ ident_form = iistate->ident_index->rd_index; ident_indkey = &ident_form->indkey; /* TupleTableSlot is needed to pass the tuple to ExecInsertIndexTuples(). 
*/ slot = MakeSingleTupleTableSlot(dstate->tupdesc, &TTSOpsHeapTuple); iistate->econtext->ecxt_scantuple = slot; /* A slot to fetch tuples from identity index. */ ind_slot = table_slot_create(relation, NULL); /* * In case functions in the index need the active snapshot and caller * hasn't set one. */ PushActiveSnapshot(GetTransactionSnapshot()); while (tuplestore_gettupleslot(dstate->tstore, true, false, dstate->tsslot)) { bool shouldFree; HeapTuple tup_change, tup, tup_exist; char *change_raw; ConcurrentChange *change; bool isnull[1]; Datum values[1]; Assert(dstate->nchanges > 0); dstate->nchanges--; /* Get the change from the single-column tuple. */ tup_change = ExecFetchSlotHeapTuple(dstate->tsslot, false, &shouldFree); heap_deform_tuple(tup_change, dstate->tupdesc_change, values, isnull); Assert(!isnull[0]); /* This is bytea, but char* is easier to work with. */ change_raw = (char *) DatumGetByteaP(values[0]); change = (ConcurrentChange *) VARDATA(change_raw); /* * Do not keep buffer pinned for insert if the current change is * something else. */ if (change->kind != PG_SQUEEZE_CHANGE_INSERT && bistate != NULL) { FreeBulkInsertState(bistate); bistate = NULL; } tup = get_changed_tuple(change); if (change->kind == PG_SQUEEZE_CHANGE_UPDATE_OLD) { Assert(tup_old == NULL); tup_old = tup; } else if (change->kind == PG_SQUEEZE_CHANGE_INSERT) { List *recheck; Assert(tup_old == NULL); /* * If the next change will also be INSERT, we'll try to use the * same buffer. */ if (bistate == NULL) bistate = GetBulkInsertState(); heap_insert(relation, tup, GetCurrentCommandId(true), 0, bistate); /* Update indexes. */ ExecStoreHeapTuple(tup, slot, false); recheck = ExecInsertIndexTuples( #if PG_VERSION_NUM >= 140000 iistate->rri, #endif slot, iistate->estate, #if PG_VERSION_NUM >= 140000 false, /* update */ #endif false, /* noDupErr */ NULL, /* specConflict */ NIL /* arbiterIndexes */ #if PG_VERSION_NUM >= 160000 , false /* onlySummarizing */ #endif ); /* * If recheck is required, it must have been preformed on the * source relation by now. (All the logical changes we process * here are already committed.) */ list_free(recheck); pfree(tup); /* Update the progress information. */ SpinLockAcquire(&MyWorkerSlot->mutex); MyWorkerSlot->progress.ins += 1; SpinLockRelease(&MyWorkerSlot->mutex); } else if (change->kind == PG_SQUEEZE_CHANGE_UPDATE_NEW || change->kind == PG_SQUEEZE_CHANGE_DELETE) { HeapTuple tup_key; IndexScanDesc scan; int i; ItemPointerData ctid; if (change->kind == PG_SQUEEZE_CHANGE_UPDATE_NEW) { tup_key = tup_old != NULL ? tup_old : tup; } else { Assert(tup_old == NULL); tup_key = tup; } /* * Find the tuple to be updated or deleted. * * XXX As no other transactions are engaged, SnapshotSelf might * seem to prevent us from wasting values of the command counter * (as we do not update catalog here, cache invalidation is not * the reason to increment the counter). However, heap_update() * does require CommandCounterIncrement(). */ scan = index_beginscan(relation, iistate->ident_index, GetActiveSnapshot(), nkeys, 0); index_rescan(scan, key, nkeys, NULL, 0); /* Use the incoming tuple to finalize the scan key. 
*/ for (i = 0; i < scan->numberOfKeys; i++) { ScanKey entry; bool isnull; int16 attno_heap; entry = &scan->keyData[i]; attno_heap = ident_indkey->values[i]; entry->sk_argument = heap_getattr(tup_key, attno_heap, relation->rd_att, &isnull); Assert(!isnull); } if (index_getnext_slot(scan, ForwardScanDirection, ind_slot)) { bool shouldFreeInd; tup_exist = ExecFetchSlotHeapTuple(ind_slot, false, &shouldFreeInd); /* TTSOpsBufferHeapTuple has .get_heap_tuple != NULL. */ Assert(!shouldFreeInd); } else tup_exist = NULL; if (tup_exist == NULL) elog(ERROR, "Failed to find target tuple"); ItemPointerCopy(&tup_exist->t_self, &ctid); index_endscan(scan); if (change->kind == PG_SQUEEZE_CHANGE_UPDATE_NEW) { #if PG_VERSION_NUM >= 160000 TU_UpdateIndexes update_indexes; #endif simple_heap_update(relation, &ctid, tup #if PG_VERSION_NUM >= 160000 , &update_indexes #endif ); /* * In PG < 16, change of any indexed attribute makes HOT * impossible, Therefore HOT update implies that no index * needs to be updated. * * In PG >= 16, if only attributes of "summarizing indexes" * change, HOT update is still possible. Therefore HOT update * might still require some indexes (in particular, the * summarizing ones) to be updated. */ #if PG_VERSION_NUM >= 160000 if (update_indexes != TU_None) #else if (!HeapTupleIsHeapOnly(tup)) #endif { List *recheck; ExecStoreHeapTuple(tup, slot, false); /* * XXX Consider passing update=true, however it requires * es_range_table to be initialized. Is it worth the * complexity? */ recheck = ExecInsertIndexTuples( #if PG_VERSION_NUM >= 140000 iistate->rri, #endif slot, iistate->estate, #if PG_VERSION_NUM >= 140000 false, /* update */ #endif false, /* noDupErr */ NULL, /* specConflict */ NIL /* arbiterIndexes */ #if PG_VERSION_NUM >= 160000 , /* onlySummarizing */ (update_indexes == TU_Summarizing) #endif ); list_free(recheck); } /* Update the progress information. */ SpinLockAcquire(&MyWorkerSlot->mutex); MyWorkerSlot->progress.upd += 1; SpinLockRelease(&MyWorkerSlot->mutex); } else { simple_heap_delete(relation, &ctid); /* Update the progress information. */ SpinLockAcquire(&MyWorkerSlot->mutex); MyWorkerSlot->progress.del += 1; SpinLockRelease(&MyWorkerSlot->mutex); } if (tup_old != NULL) { pfree(tup_old); tup_old = NULL; } pfree(tup); } else elog(ERROR, "Unrecognized kind of change: %d", change->kind); /* If there's any change, make it visible to the next iteration. */ if (change->kind != PG_SQUEEZE_CHANGE_UPDATE_OLD) { CommandCounterIncrement(); UpdateActiveSnapshotCommandId(); } /* TTSOpsMinimalTuple has .get_heap_tuple==NULL. */ Assert(shouldFree); pfree(tup_change); /* * If there is a limit on the time of completion, check it * now. However, make sure the loop does not break if tup_old was set * in the previous iteration. In such a case we could not resume the * processing in the next call. */ if (must_complete && tup_old == NULL && processing_time_elapsed(must_complete)) /* The next call will process the remaining changes. */ break; } /* If we could not apply all the changes, the next call will do. */ if (dstate->nchanges == 0) tuplestore_clear(dstate->tstore); PopActiveSnapshot(); /* Cleanup. 
*/ if (bistate != NULL) FreeBulkInsertState(bistate); ExecDropSingleTupleTableSlot(slot); ExecDropSingleTupleTableSlot(ind_slot); } static bool processing_time_elapsed(struct timeval *utmost) { struct timeval now; if (utmost == NULL) return false; gettimeofday(&now, NULL); if (now.tv_sec < utmost->tv_sec) return false; if (now.tv_sec > utmost->tv_sec) return true; return now.tv_usec >= utmost->tv_usec; } IndexInsertState * get_index_insert_state(Relation relation, Oid ident_index_id) { EState *estate; int i; IndexInsertState *result; result = (IndexInsertState *) palloc0(sizeof(IndexInsertState)); estate = CreateExecutorState(); result->econtext = GetPerTupleExprContext(estate); result->rri = (ResultRelInfo *) palloc(sizeof(ResultRelInfo)); InitResultRelInfo(result->rri, relation, 0, 0, 0); ExecOpenIndices(result->rri, false); /* * Find the relcache entry of the identity index so that we spend no extra * effort to open / close it. */ for (i = 0; i < result->rri->ri_NumIndices; i++) { Relation ind_rel; ind_rel = result->rri->ri_IndexRelationDescs[i]; if (ind_rel->rd_id == ident_index_id) result->ident_index = ind_rel; } if (result->ident_index == NULL) elog(ERROR, "Failed to open identity index"); /* Only initialize fields needed by ExecInsertIndexTuples(). */ #if PG_VERSION_NUM < 140000 estate->es_result_relations = estate->es_result_relation_info = result->rri; estate->es_num_result_relations = 1; #endif result->estate = estate; return result; } void free_index_insert_state(IndexInsertState *iistate) { ExecCloseIndices(iistate->rri); FreeExecutorState(iistate->estate); pfree(iistate->rri); pfree(iistate); } void _PG_output_plugin_init(OutputPluginCallbacks *cb) { AssertVariableIsOfType(&_PG_output_plugin_init, LogicalOutputPluginInit); cb->startup_cb = plugin_startup; cb->begin_cb = plugin_begin_txn; cb->change_cb = plugin_change; cb->commit_cb = plugin_commit_txn; cb->filter_by_origin_cb = plugin_filter; cb->shutdown_cb = plugin_shutdown; } /* initialize this plugin */ static void plugin_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, bool is_init) { ctx->output_plugin_private = NULL; /* Probably unnecessary, as we don't use the SQL interface ... */ opt->output_type = OUTPUT_PLUGIN_BINARY_OUTPUT; if (ctx->output_plugin_options != NIL) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("This plugin does not expect any options"))); } } static void plugin_shutdown(LogicalDecodingContext *ctx) { } /* * As we don't release the slot during processing of particular table, there's * no room for SQL interface, even for debugging purposes. Therefore we need * neither OutputPluginPrepareWrite() nor OutputPluginWrite() in the plugin * callbacks. (Although we might want to write custom callbacks, this API * seems to be unnecessarily generic for our purposes.) */ /* BEGIN callback */ static void plugin_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn) { } /* COMMIT callback */ static void plugin_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, XLogRecPtr commit_lsn) { } /* * Callback for individual changed tuples */ static void plugin_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, Relation relation, ReorderBufferChange *change) { DecodingOutputState *dstate; dstate = (DecodingOutputState *) ctx->output_writer_private; /* Only interested in one particular relation. 
*/ if (relation->rd_id != dstate->relid) return; /* Decode entry depending on its type */ switch (change->action) { case REORDER_BUFFER_CHANGE_INSERT: { HeapTuple newtuple; newtuple = change->data.tp.newtuple != NULL ? #if PG_VERSION_NUM >= 170000 change->data.tp.newtuple : NULL; #else &change->data.tp.newtuple->tuple : NULL; #endif /* * Identity checks in the main function should have made this * impossible. */ if (newtuple == NULL) elog(ERROR, "Incomplete insert info."); store_change(ctx, PG_SQUEEZE_CHANGE_INSERT, newtuple); } break; case REORDER_BUFFER_CHANGE_UPDATE: { HeapTuple oldtuple, newtuple; oldtuple = change->data.tp.oldtuple != NULL ? #if PG_VERSION_NUM >= 170000 change->data.tp.oldtuple : NULL; #else &change->data.tp.oldtuple->tuple : NULL; #endif newtuple = change->data.tp.newtuple != NULL ? #if PG_VERSION_NUM >= 170000 change->data.tp.newtuple : NULL; #else &change->data.tp.newtuple->tuple : NULL; #endif if (newtuple == NULL) elog(ERROR, "Incomplete update info."); if (oldtuple != NULL) store_change(ctx, PG_SQUEEZE_CHANGE_UPDATE_OLD, oldtuple); store_change(ctx, PG_SQUEEZE_CHANGE_UPDATE_NEW, newtuple); } break; case REORDER_BUFFER_CHANGE_DELETE: { HeapTuple oldtuple; oldtuple = change->data.tp.oldtuple ? #if PG_VERSION_NUM >= 170000 change->data.tp.oldtuple : NULL; #else &change->data.tp.oldtuple->tuple : NULL; #endif if (oldtuple == NULL) elog(ERROR, "Incomplete delete info."); store_change(ctx, PG_SQUEEZE_CHANGE_DELETE, oldtuple); } break; default: /* Should not come here */ Assert(0); break; } } /* Store concurrent data change. */ static void store_change(LogicalDecodingContext *ctx, ConcurrentChangeKind kind, HeapTuple tuple) { DecodingOutputState *dstate; char *change_raw; ConcurrentChange *change; MemoryContext oldcontext; bool flattened = false; Size size; Datum values[1]; bool isnull[1]; char *dst; dstate = (DecodingOutputState *) ctx->output_writer_private; /* * ReorderBufferCommit() stores the TOAST chunks in its private memory * context and frees them after having called apply_change(). Therefore we * need flat copy (including TOAST) that we eventually copy into the * memory context which is available to decode_concurrent_changes(). */ if (HeapTupleHasExternal(tuple)) { /* * toast_flatten_tuple_to_datum() might be more convenient but we * don't want the decompression it does. */ tuple = toast_flatten_tuple(tuple, dstate->tupdesc); flattened = true; } size = MAXALIGN(VARHDRSZ) + sizeof(ConcurrentChange) + tuple->t_len; /* XXX Isn't there any function / macro to do this? */ if (size >= 0x3FFFFFFF) elog(ERROR, "Change is too big."); oldcontext = MemoryContextSwitchTo(ctx->context); change_raw = (char *) palloc(size); MemoryContextSwitchTo(oldcontext); SET_VARSIZE(change_raw, size); change = (ConcurrentChange *) VARDATA(change_raw); /* * Copy the tuple. * * CAUTION: change->tup_data.t_data must be fixed on retrieval! */ memcpy(&change->tup_data, tuple, sizeof(HeapTupleData)); dst = (char *) change + sizeof(ConcurrentChange); memcpy(dst, tuple->t_data, tuple->t_len); /* The other field. */ change->kind = kind; /* The data has been copied. */ if (flattened) pfree(tuple); /* Store as tuple of 1 bytea column. */ values[0] = PointerGetDatum(change_raw); isnull[0] = false; tuplestore_putvalues(dstate->tstore, dstate->tupdesc_change, values, isnull); /* Accounting. */ dstate->nchanges++; /* Cleanup. */ pfree(change_raw); } /* * Retrieve tuple from a change structure. As for the change, no alignment is * assumed. 
*/ static HeapTuple get_changed_tuple(ConcurrentChange *change) { HeapTupleData tup_data; HeapTuple result; char *src; /* * Ensure alignment before accessing the fields. (This is why we can't use * heap_copytuple() instead of this function.) */ memcpy(&tup_data, &change->tup_data, sizeof(HeapTupleData)); result = (HeapTuple) palloc(HEAPTUPLESIZE + tup_data.t_len); memcpy(result, &tup_data, sizeof(HeapTupleData)); result->t_data = (HeapTupleHeader) ((char *) result + HEAPTUPLESIZE); src = (char *) change + sizeof(ConcurrentChange); memcpy(result->t_data, src, result->t_len); return result; } /* * A filter that recognizes changes produced by the initial load. */ static bool plugin_filter(LogicalDecodingContext *ctx, RepOriginId origin_id) { DecodingOutputState *dstate; dstate = (DecodingOutputState *) ctx->output_writer_private; /* dstate is not initialized during decoding setup - should it be? */ if (dstate && dstate->rorigin != InvalidRepOriginId && origin_id == dstate->rorigin) return true; return false; } pg_squeeze-REL1_8_0/expected/000077500000000000000000000000001474466764000162175ustar00rootroot00000000000000pg_squeeze-REL1_8_0/expected/squeeze.out000066400000000000000000000024771474466764000204430ustar00rootroot00000000000000CREATE EXTENSION pg_squeeze; CREATE TABLE a(i int PRIMARY KEY, j int); INSERT INTO a(i, j) SELECT x, x FROM generate_series(1, 10) AS g(x); -- The trivial case. SELECT squeeze.squeeze_table('public', 'a', NULL); squeeze_table --------------- (1 row) SELECT * FROM a; i | j ----+---- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 (10 rows) -- Clustering by index. CREATE INDEX a_i_idx_desc ON a(i DESC); SELECT squeeze.squeeze_table('public', 'a', 'a_i_idx_desc'); squeeze_table --------------- (1 row) SELECT * FROM a; i | j ----+---- 10 | 10 9 | 9 8 | 8 7 | 7 6 | 6 5 | 5 4 | 4 3 | 3 2 | 2 1 | 1 (10 rows) -- Involve TOAST. CREATE TABLE b(i int PRIMARY KEY, t text); INSERT INTO b(i, t) SELECT x, repeat(x::text, 1024) FROM generate_series(1, 10) AS g(x) GROUP BY x; SELECT reltoastrelid > 0 FROM pg_class WHERE relname='b'; ?column? ---------- t (1 row) -- Copy the data into another table so we can check later. CREATE TABLE b_copy (LIKE b INCLUDING ALL); INSERT INTO b_copy(i, t) SELECT i, t FROM b; -- Squeeze. SELECT squeeze.squeeze_table('public', 'b', NULL); squeeze_table --------------- (1 row) -- Compare. SELECT b.t = b_copy.t FROM b, b_copy WHERE b.i = b_copy.i; ?column? ---------- t t t t t t t t t t (10 rows) pg_squeeze-REL1_8_0/pg_squeeze--1.2--1.3.sql000066400000000000000000000253541474466764000202460ustar00rootroot00000000000000/* pg_squeeze--1.2--1.3.sql */ -- complain if script is sourced in psql, rather than via ALTER EXTENSION \echo Use "ALTER EXTENSION pg_squeeze UPDATE TO '1.3'" to load this file. 
\quit ALTER TABLE tables_internal DROP COLUMN class_id; ALTER TABLE tables_internal DROP COLUMN class_id_toast; ALTER TABLE tables_internal DROP COLUMN free_space; ALTER TABLE tables_internal DROP COLUMN last_task_created; DELETE FROM tables; ALTER TABLE tables DROP COLUMN schedule; CREATE DOMAIN minute AS int CHECK (VALUE BETWEEN 0 AND 59); CREATE DOMAIN hour AS int CHECK (VALUE BETWEEN 0 AND 23); CREATE DOMAIN dom AS int CHECK (VALUE BETWEEN 1 AND 31); CREATE DOMAIN month AS int CHECK (VALUE BETWEEN 1 AND 12); CREATE DOMAIN dow AS int CHECK (VALUE BETWEEN 0 AND 7); CREATE TYPE schedule AS ( minutes minute[], hours hour[], days_of_month dom[], months month[], days_of_week dow[]); ALTER TABLE tables ADD COLUMN schedule schedule NOT NULL; DROP TABLE tasks; CREATE DOMAIN task_state AS TEXT CHECK(VALUE IN ('new', 'ready', 'processed')); CREATE TABLE tasks ( id serial NOT NULL PRIMARY KEY, table_id int NOT NULL REFERENCES tables ON DELETE CASCADE, -- Task creation time. created timestamptz NOT NULL DEFAULT now(), -- The latest known free space in the underlying table. Note that it -- includes dead tuples, since these are eliminated by squeeze_table() -- function. free_space double precision, -- How many times did we try to process the task? The common use case -- is that a concurrent DDL broke the processing. tried int NOT NULL DEFAULT 0, -- Either squeezed or skipped by the "squeeze worker" (because there's -- not enough free space or the table is not big enough). state task_state NOT NULL DEFAULT 'new' ); DROP FUNCTION add_new_tasks(); DROP FUNCTION start_next_task(); DROP FUNCTION cleanup_task(a_task_id int); DROP FUNCTION process_current_task(); DROP FUNCTION stop_worker(); CREATE VIEW scheduled_for_now AS SELECT i.table_id, t.tabschema, t.tabname FROM squeeze.tables_internal i, pg_catalog.pg_stat_user_tables s, squeeze.tables t, pg_class c, pg_namespace n WHERE (t.tabschema, t.tabname) = (s.schemaname, s.relname) AND i.table_id = t.id AND n.nspname = t.tabschema AND c.relnamespace = n.oid AND c.relname = t.tabname AND ( ((t.schedule).minutes ISNULL OR EXTRACT(minute FROM now())::int = ANY((t.schedule).minutes)) AND ((t.schedule).hours ISNULL OR EXTRACT(hour FROM now())::int = ANY((t.schedule).hours)) AND ((t.schedule).months ISNULL OR EXTRACT(month FROM now())::int = ANY((t.schedule).months)) AND ( -- At least one of the "days_of_month" and -- "days_of_week" components must -- match. However if one matches, NULL value -- of the other must not be considered "any -- day of month/week". Instead, NULL can only -- cause a match if both components have it. ((t.schedule).days_of_month ISNULL AND (t.schedule).days_of_week ISNULL) OR EXTRACT(day FROM now())::int = ANY((t.schedule).days_of_month) OR EXTRACT(dow FROM now())::int = ANY((t.schedule).days_of_week) OR -- Sunday can be expressed as both 0 and 7. EXTRACT(isodow FROM now())::int = ANY((t.schedule).days_of_week) ) ); CREATE FUNCTION check_schedule() RETURNS void LANGUAGE sql AS $$ -- Delete the processed tasks, but ignore those scheduled and -- processed in the current minute - we don't want to schedule those -- again now. DELETE FROM squeeze.tasks t WHERE state = 'processed' AND (EXTRACT(HOUR FROM now()) <> EXTRACT(HOUR FROM t.created) OR EXTRACT(MINUTE FROM now()) <> EXTRACT(MINUTE FROM t.created)); -- Create task where schedule does match. 
INSERT INTO squeeze.tasks(table_id) SELECT i.table_id FROM squeeze.tables_internal i, pg_catalog.pg_stat_user_tables s, squeeze.tables t, pg_class c, pg_namespace n WHERE (t.tabschema, t.tabname) = (s.schemaname, s.relname) AND i.table_id = t.id AND n.nspname = t.tabschema AND c.relnamespace = n.oid AND c.relname = t.tabname -- Is there a matching schedule? AND EXISTS ( SELECT * FROM squeeze.scheduled_for_now WHERE table_id = i.table_id ) -- Ignore tables for which a task currently exists. AND NOT t.id IN (SELECT table_id FROM squeeze.tasks); $$; -- Update new tasks with the information on free space in the corresponding -- tables. CREATE OR REPLACE FUNCTION update_free_space_info() RETURNS void LANGUAGE sql AS $$ -- If VACUUM completed recenly enough, we consider the percentage of -- dead tuples negligible and so retrieve the free space from FSM. UPDATE squeeze.tasks k SET free_space = 100 * squeeze.get_heap_freespace(c.oid) FROM squeeze.tables t, squeeze.tables_internal i, pg_catalog.pg_class c, pg_catalog.pg_namespace n, pg_catalog.pg_stat_user_tables s WHERE k.state = 'new' AND k.table_id = t.id AND i.table_id = t.id AND t.tabname = c.relname AND c.relnamespace = n.oid AND t.tabschema = n.nspname AND (t.tabschema, t.tabname) = (s.schemaname, s.relname) AND ( (s.last_vacuum >= now() - t.vacuum_max_age) OR (s.last_autovacuum >= now() - t.vacuum_max_age) ) AND -- Each processing makes the previous VACUUM unimportant. ( i.last_task_finished ISNULL OR i.last_task_finished < s.last_vacuum OR i.last_task_finished < s.last_autovacuum ); -- If VACUUM didn't run recently or there's no FSM, take the more -- expensive approach. UPDATE squeeze.tasks k SET free_space = a.approx_free_percent + a.dead_tuple_percent FROM squeeze.tables t, pg_catalog.pg_class c, pg_catalog.pg_namespace n, squeeze.pgstattuple_approx(c.oid) a WHERE k.state = 'new' AND k.free_space ISNULL AND k.table_id = t.id AND t.tabname = c.relname AND c.relnamespace = n.oid AND t.tabschema = n.nspname; $$; CREATE FUNCTION dispatch_new_tasks() RETURNS void LANGUAGE sql AS $$ -- First, get rid of tables not big enough for processing. UPDATE squeeze.tasks k SET state = 'processed' FROM squeeze.tables t, pg_catalog.pg_class c, pg_catalog.pg_namespace n WHERE k.state = 'new' AND k.table_id = t.id AND t.tabname = c.relname AND c.relnamespace = n.oid AND t.tabschema = n.nspname AND pg_catalog.pg_relation_size(c.oid, 'main') < t.min_size * 1048576; SELECT squeeze.update_free_space_info(); -- Make the actual decision. -- -- Ignore tasks having NULL in free_space - those have been created -- after update_free_space_info() had finished, so the should waite -- for the next run of dispatch_new_tasks(). 
UPDATE squeeze.tasks k SET state = CASE WHEN k.free_space > ((100 - squeeze.get_heap_fillfactor(c.oid)) + t.free_space_extra) THEN 'ready' ELSE 'processed' END FROM squeeze.tables t, pg_catalog.pg_class c, pg_catalog.pg_namespace n WHERE k.state = 'new' AND k.free_space NOTNULL AND k.table_id = t.id AND t.tabname = c.relname AND c.relnamespace = n.oid AND t.tabschema = n.nspname; $$; CREATE FUNCTION finalize_task(a_task_id int) RETURNS void LANGUAGE sql AS $$ WITH updated(table_id) AS ( UPDATE squeeze.tasks t SET state = 'processed' WHERE id = a_task_id RETURNING table_id ) UPDATE squeeze.tables_internal t SET last_task_finished = now() FROM updated u WHERE u.table_id = t.table_id; $$; CREATE FUNCTION cancel_task(a_task_id int) RETURNS void LANGUAGE sql AS $$ UPDATE squeeze.tasks t SET state = 'processed' WHERE id = a_task_id; $$; CREATE FUNCTION process_next_task() RETURNS void LANGUAGE plpgsql AS $$ DECLARE v_tabschema name; v_tabname name; v_cl_index name; v_rel_tbsp name; v_ind_tbsps name[][]; v_task_id int; v_tried int; v_last_try bool; v_skip_analyze bool; v_stmt text; v_start timestamptz; -- Error info to be logged. v_sql_state text; v_err_msg text; v_err_detail text; BEGIN -- Retrieve the table corresponding to the least recently created task -- in the 'ready' state. SELECT tb.tabschema, tb.tabname, tb.clustering_index, tb.rel_tablespace, tb.ind_tablespaces, t.id, t.tried, t.tried >= tb.max_retry, tb.skip_analyze INTO v_tabschema, v_tabname, v_cl_index, v_rel_tbsp, v_ind_tbsps, v_task_id, v_tried, v_last_try, v_skip_analyze FROM squeeze.tasks t, squeeze.tables tb WHERE t.table_id = tb.id AND t.state = 'ready' ORDER BY t.created LIMIT 1; IF NOT FOUND THEN -- No task currently available? RETURN; END IF; -- Do the actual work. BEGIN v_start := clock_timestamp(); -- Do the actual processing. -- -- If someone dropped the table in between, the exception -- handler below should log the error and cleanup the task. PERFORM squeeze.squeeze_table(v_tabschema, v_tabname, v_cl_index, v_rel_tbsp, v_ind_tbsps); INSERT INTO squeeze.log(tabschema, tabname, started, finished) VALUES (v_tabschema, v_tabname, v_start, clock_timestamp()); PERFORM squeeze.finalize_task(v_task_id); IF NOT v_skip_analyze THEN -- Analyze the new table, unless user rejects it -- explicitly. -- -- XXX Besides updating planner statistics in general, -- this sets pg_class(relallvisible) to 0, so that -- planner is not too optimistic about this -- figure. The preferrable solution would be to run -- (lazy) VACUUM (with the ANALYZE option) to -- initialize visibility map. However, to make the -- effort worthwile, we shouldn't do it until all -- transactions can see all the changes done by -- squeeze_table() function. What's the most suitable -- way to wait? Asynchronous execution of the VACUUM -- is probably needed in any case. v_stmt := 'ANALYZE "' || v_tabschema || '"."' || v_tabname || '"'; EXECUTE v_stmt; END IF; EXCEPTION WHEN OTHERS THEN GET STACKED DIAGNOSTICS v_sql_state := RETURNED_SQLSTATE; GET STACKED DIAGNOSTICS v_err_msg := MESSAGE_TEXT; GET STACKED DIAGNOSTICS v_err_detail := PG_EXCEPTION_DETAIL; INSERT INTO squeeze.errors(tabschema, tabname, sql_state, err_msg, err_detail) VALUES (v_tabschema, v_tabname, v_sql_state, v_err_msg, v_err_detail); -- If the active task failed too many times, cancel -- it. IF v_last_try THEN PERFORM squeeze.cancel_task(v_task_id); RETURN; ELSE -- Account for the current attempt. 
UPDATE squeeze.tasks SET tried = tried + 1 WHERE id = v_task_id; END IF; END; END; $$; CREATE FUNCTION stop_worker() RETURNS SETOF record LANGUAGE sql AS $$ -- When looking for the PID we rely on the fact that the worker holds -- lock on the extension. If the worker is not running, we could (in -- theory) kill a regular backend trying to ALTER or DROP the -- extension right now. It's not worth taking a different approach -- just to avoid this extremely unlikely case (which shouldn't cause -- data corruption). SELECT pid, pg_terminate_backend(pid) FROM pg_catalog.pg_locks l, pg_catalog.pg_extension e WHERE e.extname = 'pg_squeeze' AND (l.classid, l.objid) = (3079, e.oid); $$; pg_squeeze-REL1_8_0/pg_squeeze--1.2.sql000066400000000000000000000376531474466764000176650ustar00rootroot00000000000000/* pg_squeeze--1.2.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_squeeze" to load this file. \quit CREATE TABLE tables ( id serial NOT NULL PRIMARY KEY, tabschema name NOT NULL, tabname name NOT NULL, UNIQUE(tabschema, tabname), -- Clustering index. clustering_index name, -- Tablespace the table should be put into. rel_tablespace name, -- Index-to-tablespace mappings. Each row of the array is expected to -- consist of 2 columns: index name and target tablespace. ind_tablespaces name[][], -- Times to check whether a new task should be created for the table. schedule timetz[] NOT NULL, -- The minimum percentage of free space that triggers processing, in -- addition to the percentage determined by fillfactor. -- -- TODO Tune the default value. free_space_extra int NOT NULL DEFAULT 50, CHECK (free_space_extra >= 0 AND free_space_extra < 100), -- The minimum size of the table (in megabytes) needed to trigger -- processing. -- -- TODO Tune the default value. min_size real NOT NULL DEFAULT 8, CHECK (min_size > 0.0), -- If at most this interval elapsed since the last VACUUM, try to use -- FSM to estimate free space. Otherwise (or if there's no FSM), use -- squeeze.pgstattuple_approx() function. -- -- TODO Tune the default value. vacuum_max_age interval NOT NULL DEFAULT '1 hour', max_retry int NOT NULL DEFAULT 0, -- No ANALYZE after the processing has completed.
skip_analyze bool NOT NULL DEFAULT false ); COMMENT ON TABLE tables IS 'List of tables registered for regular squeeze.'; COMMENT ON COLUMN tables.id IS 'Identifier of the registered table (generated column).'; COMMENT ON COLUMN tables.tabschema IS 'Database schema of the registered table.'; COMMENT ON COLUMN tables.tabname IS 'Table registered for regular squeeze.'; COMMENT ON COLUMN tables.clustering_index IS 'Index to control ordering of table rows.'; COMMENT ON COLUMN tables.rel_tablespace IS 'Tablespace into which the registered table should be moved.'; COMMENT ON COLUMN tables.ind_tablespaces IS 'Index-to-tablespace mappings to be applied.'; COMMENT ON COLUMN tables.schedule IS 'Array of scheduled times to check and possibly process the table.'; COMMENT ON COLUMN tables.free_space_extra IS 'In addition to free space derived from fillfactor, this extra ' 'percentage of free space is needed to schedule processing of the ' 'table.'; COMMENT ON COLUMN tables.min_size IS 'Besides meeting the free_space_extra criterion, the table size must ' 'be at least this many MBs to be scheduled for squeezing.'; COMMENT ON COLUMN tables.vacuum_max_age IS 'If less than this elapsed since the last VACUUM, try to use FSM to ' 'estimate the amount of free space.'; COMMENT ON COLUMN tables.max_retry IS 'The maximum number of times failed processing is retried.'; COMMENT ON COLUMN tables.skip_analyze IS 'Only squeeze the table, without running ANALYZE afterwards.'; -- Fields that would normally fit into "tables" but require no attention of -- the user are separate. Thus "tables" can be considered a user interface. CREATE TABLE tables_internal ( table_id int NOT NULL PRIMARY KEY REFERENCES tables ON DELETE CASCADE, -- pg_class(oid) of the table. It's an auxiliary field that lets us -- avoid repeated retrieval of the field in add_new_tasks(). class_id oid, -- Likewise. class_id_toast oid, -- The latest known free space. Note that it includes dead tuples, -- since these are eliminated by squeeze_table() function. free_space double precision, -- When was the most recent task created? last_task_created timestamptz, -- When was the most recent task finished? last_task_finished timestamptz ); -- Trigger to keep "tables_internal" in-sync with "tables". -- -- (Deletion is handled by foreign key.) CREATE FUNCTION tables_internal_trig_func() RETURNS trigger LANGUAGE plpgsql AS $$ BEGIN INSERT INTO squeeze.tables_internal(table_id) VALUES (NEW.id); RETURN NEW; END; $$; CREATE TRIGGER tables_internal_trig AFTER INSERT ON squeeze.tables FOR EACH ROW EXECUTE PROCEDURE squeeze.tables_internal_trig_func(); -- Task queue. If completed with success, the task is moved into "log" table. -- -- If task fails and tables(max_retry) is greater than zero, processing will -- be retried automatically as long as tasks(tried) < tables(max_retry) + -- 1. Then the task will be removed from the queue. CREATE TABLE tasks ( id serial NOT NULL PRIMARY KEY, table_id int NOT NULL REFERENCES tables ON DELETE CASCADE, -- Is this the task the next call of process() function will pick? active bool NOT NULL DEFAULT false, -- How many times did we try to process the task? The common use case -- is that a concurrent DDL broke the processing. tried int NOT NULL DEFAULT 0 ); -- Make sure there is at most one active task anytime. CREATE UNIQUE INDEX ON tasks(active) WHERE active; -- Each successfully completed processing of a table is recorded here.
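-- For example, recent activity can be inspected with a query like this
-- (illustrative only):
--
--   SELECT tabschema, tabname, finished - started AS duration
--   FROM squeeze.log
--   ORDER BY started DESC
--   LIMIT 10;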
CREATE TABLE log ( tabschema name NOT NULL, tabname name NOT NULL, started timestamptz NOT NULL, finished timestamptz NOT NULL ); -- XXX Some other indexes might be useful. Analyze the typical use later. CREATE INDEX ON log(started); COMMENT ON TABLE log IS 'Successfully completed squeeze operations.'; COMMENT ON COLUMN log.tabschema IS 'Database schema of the table processed.'; COMMENT ON COLUMN log.tabname IS 'Name of the table squeezed.'; COMMENT ON COLUMN log.started IS 'When the processing started.'; COMMENT ON COLUMN log.finished IS 'When the processing finished.'; CREATE TABLE errors ( id bigserial NOT NULL PRIMARY KEY, occurred timestamptz NOT NULL DEFAULT now(), tabschema name NOT NULL, tabname name NOT NULL, sql_state text NOT NULL, err_msg text NOT NULL, err_detail text ); COMMENT ON TABLE errors IS 'Failed attempts to squeeze table.'; COMMENT ON COLUMN errors.id IS 'Identifier of the failure (generated column).'; COMMENT ON COLUMN errors.occurred IS 'Time the error occurred.'; COMMENT ON COLUMN errors.tabschema IS 'Database schema of the table not squeezed.'; COMMENT ON COLUMN errors.tabname IS 'Name of the table not squeezed.'; COMMENT ON COLUMN errors.sql_state IS '"SQL state" encountered.'; COMMENT ON COLUMN errors.err_msg IS 'Error message caught.'; COMMENT ON COLUMN errors.err_detail IS 'Detailed error message, if available.'; CREATE FUNCTION get_heap_fillfactor(a_relid oid) RETURNS int AS 'MODULE_PATHNAME', 'get_heap_fillfactor' VOLATILE LANGUAGE C; CREATE FUNCTION get_heap_freespace(a_relid oid) RETURNS double precision AS 'MODULE_PATHNAME', 'get_heap_freespace' VOLATILE LANGUAGE C; CREATE FUNCTION pgstattuple_approx(IN reloid regclass, OUT table_len BIGINT, -- physical table length in bytes OUT scanned_percent FLOAT8, -- what percentage of the table's pages was scanned OUT approx_tuple_count BIGINT, -- estimated number of live tuples OUT approx_tuple_len BIGINT, -- estimated total length in bytes of live tuples OUT approx_tuple_percent FLOAT8, -- live tuples in % (based on estimate) OUT dead_tuple_count BIGINT, -- exact number of dead tuples OUT dead_tuple_len BIGINT, -- exact total length in bytes of dead tuples OUT dead_tuple_percent FLOAT8, -- dead tuples in % (based on estimate) OUT approx_free_space BIGINT, -- estimated free space in bytes OUT approx_free_percent FLOAT8) -- free space in % (based on estimate) AS 'MODULE_PATHNAME', 'squeeze_pgstattuple_approx' LANGUAGE C STRICT PARALLEL SAFE; -- Unregister dropped tables. (CASCADE behaviour ensures deletion of the -- related records in "tables_internal" and "tasks" tables.) CREATE FUNCTION cleanup_tables() RETURNS void LANGUAGE sql AS $$ DELETE FROM squeeze.tables t WHERE (t.tabschema, t.tabname) NOT IN ( SELECT n.nspname, c.relname FROM pg_class c JOIN pg_namespace n ON c.relnamespace = n.oid); $$; -- Update the information on free space for table that has valid -- tables_internal(class_id). CREATE FUNCTION update_free_space_info() RETURNS void LANGUAGE sql AS $$ UPDATE squeeze.tables_internal SET free_space = NULL WHERE class_id NOTNULL; -- If VACUUM completed recently enough, we consider the percentage of -- dead tuples negligible and so retrieve the free space from FSM.
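-- (Note: squeeze.get_heap_freespace() evidently returns a fraction, hence
-- the multiplication by 100 below to obtain a percentage comparable with
-- fillfactor and free_space_extra. For a one-off manual check:
--   SELECT squeeze.get_heap_freespace('some_table'::regclass);
-- where some_table is a made-up example name.)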
UPDATE squeeze.tables_internal i SET free_space = 100 * squeeze.get_heap_freespace(i.class_id) FROM pg_catalog.pg_stat_user_tables s, squeeze.tables t WHERE i.class_id NOTNULL AND i.table_id = t.id AND (t.tabschema, t.tabname) = (s.schemaname, s.relname) AND ( (s.last_vacuum >= now() - t.vacuum_max_age) OR (s.last_autovacuum >= now() - t.vacuum_max_age) ) AND -- Each processing makes the previous VACUUM unimportant. ( i.last_task_finished ISNULL OR i.last_task_finished < s.last_vacuum OR i.last_task_finished < s.last_autovacuum ); -- If VACUUM didn't run recently or there's no FSM, take the more -- expensive approach. (Use WITH as LATERAL doesn't work for UPDATE.) WITH t_approx(table_id, free_space) AS ( SELECT i.table_id, a.approx_free_percent + a.dead_tuple_percent FROM squeeze.tables_internal i, squeeze.pgstattuple_approx(i.class_id) AS a WHERE i.class_id NOTNULL AND i.free_space ISNULL) UPDATE squeeze.tables_internal i SET free_space = a.free_space FROM t_approx a WHERE i.table_id = a.table_id; $$; -- Create tasks for newly qualifying tables. CREATE FUNCTION add_new_tasks() RETURNS void LANGUAGE sql AS $$ -- The previous estimates are obsolete now. UPDATE squeeze.tables_internal SET free_space = NULL, class_id = NULL, class_id_toast = NULL; -- Mark tables that we're interested in. UPDATE squeeze.tables_internal i SET class_id = c.oid, class_id_toast = c.reltoastrelid FROM pg_catalog.pg_stat_user_tables s, squeeze.tables t, pg_class c, pg_namespace n WHERE (t.tabschema, t.tabname) = (s.schemaname, s.relname) AND i.table_id = t.id AND n.nspname = t.tabschema AND c.relnamespace = n.oid AND c.relname = t.tabname AND -- Is there a matching schedule? EXISTS ( SELECT u.s FROM squeeze.tables t_sub, UNNEST(t_sub.schedule) u(s) WHERE t_sub.id = t.id AND -- The schedule must have passed ... u.s <= now()::timetz AND -- ... and it should be one for which no -- task was created yet. (u.s > i.last_task_created::timetz OR i.last_task_created ISNULL OR -- The next schedule can be in front of the -- last task if a new day started. i.last_task_created::date < current_date) ) -- Ignore tables for which a task currently exists. AND NOT t.id IN (SELECT table_id FROM squeeze.tasks); SELECT squeeze.update_free_space_info(); -- Create a new task for each table having more free space than -- needed. UPDATE squeeze.tables_internal i SET last_task_created = now() FROM squeeze.tables t WHERE i.class_id NOTNULL AND t.id = i.table_id AND i.free_space > ((100 - squeeze.get_heap_fillfactor(i.class_id)) + t.free_space_extra) AND pg_catalog.pg_relation_size(i.class_id, 'main') > t.min_size * 1048576; -- now() is supposed to return the same value as it did in the previous -- query. INSERT INTO squeeze.tasks(table_id) SELECT table_id FROM squeeze.tables_internal i WHERE i.last_task_created = now(); $$; -- Mark the next task as active. CREATE FUNCTION start_next_task() RETURNS void LANGUAGE plpgsql AS $$ DECLARE v_tabschema name; v_tabname name; BEGIN PERFORM FROM squeeze.tasks WHERE active; IF FOUND THEN RETURN; END IF; UPDATE squeeze.tasks t INTO v_tabschema, v_tabname SET active = true FROM squeeze.tables tb WHERE tb.id = t.table_id AND t.id = (SELECT id FROM squeeze.tasks ORDER BY id LIMIT 1) RETURNING tb.tabschema, tb.tabname; IF NOT FOUND THEN RETURN; END IF; END; $$; -- Delete task and make the table available for task creation again. -- -- By adjusting last_task_finished we make a new VACUUM necessary before the -- next task can be created for the table.
CREATE FUNCTION cleanup_task(a_task_id int) RETURNS void LANGUAGE sql AS $$ WITH deleted(table_id) AS ( DELETE FROM squeeze.tasks t WHERE id = a_task_id RETURNING table_id ) UPDATE squeeze.tables_internal t SET last_task_finished = now() FROM deleted d WHERE d.table_id = t.table_id; $$; -- Process the currently active task. CREATE FUNCTION process_current_task() RETURNS void LANGUAGE plpgsql AS $$ DECLARE v_tabschema name; v_tabname name; v_cl_index name; v_rel_tbsp name; v_ind_tbsps name[][]; v_task_id int; v_tried int; v_last_try bool; v_skip_analyze bool; v_stmt text; v_start timestamptz; -- Error info to be logged. v_sql_state text; v_err_msg text; v_err_detail text; BEGIN SELECT tb.tabschema, tb.tabname, tb.clustering_index, tb.rel_tablespace, tb.ind_tablespaces, t.id, t.tried, t.tried >= tb.max_retry, tb.skip_analyze INTO v_tabschema, v_tabname, v_cl_index, v_rel_tbsp, v_ind_tbsps, v_task_id, v_tried, v_last_try, v_skip_analyze FROM squeeze.tasks t, squeeze.tables tb WHERE t.table_id = tb.id AND t.active; IF NOT FOUND THEN -- Unexpected deletion by someone else? RETURN; END IF; -- Do the actual work. BEGIN v_start := clock_timestamp(); -- Do the actual processing. -- -- If someone dropped the table in between, the exception -- handler below should log the error and cleanup the task. PERFORM squeeze.squeeze_table(v_tabschema, v_tabname, v_cl_index, v_rel_tbsp, v_ind_tbsps); INSERT INTO squeeze.log(tabschema, tabname, started, finished) VALUES (v_tabschema, v_tabname, v_start, clock_timestamp()); PERFORM squeeze.cleanup_task(v_task_id); IF NOT v_skip_analyze THEN -- Analyze the new table, unless user rejects it -- explicitly. -- -- XXX Besides updating planner statistics in general, -- this sets pg_class(relallvisible) to 0, so that -- planner is not too optimistic about this -- figure. The preferable solution would be to run -- (lazy) VACUUM (with the ANALYZE option) to -- initialize visibility map. However, to make the -- effort worthwhile, we shouldn't do it until all -- transactions can see all the changes done by -- squeeze_table() function. What's the most suitable -- way to wait? Asynchronous execution of the VACUUM -- is probably needed in any case. v_stmt := 'ANALYZE "' || v_tabschema || '"."' || v_tabname || '"'; EXECUTE v_stmt; END IF; EXCEPTION WHEN OTHERS THEN GET STACKED DIAGNOSTICS v_sql_state := RETURNED_SQLSTATE; GET STACKED DIAGNOSTICS v_err_msg := MESSAGE_TEXT; GET STACKED DIAGNOSTICS v_err_detail := PG_EXCEPTION_DETAIL; INSERT INTO squeeze.errors(tabschema, tabname, sql_state, err_msg, err_detail) VALUES (v_tabschema, v_tabname, v_sql_state, v_err_msg, v_err_detail); -- If the active task failed too many times, delete -- it. start_next_task() will prepare the next one. IF v_last_try THEN PERFORM squeeze.cleanup_task(v_task_id); RETURN; ELSE -- Account for the current attempt. UPDATE squeeze.tasks SET tried = tried + 1 WHERE id = v_task_id; END IF; END; END; $$; CREATE FUNCTION squeeze_table( tabschema name, tabname name, clustering_index name, rel_tablespace name, ind_tablespaces name[][]) RETURNS void AS 'MODULE_PATHNAME', 'squeeze_table' LANGUAGE C; CREATE FUNCTION start_worker() RETURNS int AS 'MODULE_PATHNAME', 'squeeze_start_worker' LANGUAGE C; -- Stop "squeeze worker" if it's currently running. CREATE FUNCTION stop_worker() RETURNS boolean LANGUAGE sql AS $$ -- When looking for the PID we rely on the fact that the worker holds -- lock on the extension.
If the worker is not running, we could (in -- theory) kill a regular backend trying to ALTER or DROP the -- extension right now. It's not worth taking a different approach -- just to avoid this extremely unlikely case (which shouldn't cause -- data corruption). SELECT pg_terminate_backend(pid) FROM pg_catalog.pg_locks l, pg_catalog.pg_extension e WHERE e.extname = 'pg_squeeze' AND (l.classid, l.objid) = (3079, e.oid); $$; pg_squeeze-REL1_8_0/pg_squeeze--1.3--1.4.sql000066400000000000000000000000441474466764000202350ustar00rootroot00000000000000-- No SQL changes for this upgrade. pg_squeeze-REL1_8_0/pg_squeeze--1.4--1.5.sql000066400000000000000000000000441474466764000202370ustar00rootroot00000000000000-- No SQL changes for this upgrade. pg_squeeze-REL1_8_0/pg_squeeze--1.5--1.6.sql000066400000000000000000000023021474466764000202400ustar00rootroot00000000000000/* pg_squeeze--1.5--1.6.sql */ -- complain if script is sourced in psql, rather than via ALTER EXTENSION \echo Use "ALTER EXTENSION pg_squeeze UPDATE TO '1.6'" to load this file. \quit DROP FUNCTION process_next_task(); DROP FUNCTION squeeze_table(name, name, name, name, name[][]); DROP FUNCTION start_worker(); CREATE FUNCTION start_worker() RETURNS void AS 'MODULE_PATHNAME', 'squeeze_start_worker' LANGUAGE C; DROP FUNCTION stop_worker(); CREATE FUNCTION stop_worker() RETURNS void AS 'MODULE_PATHNAME', 'squeeze_stop_worker' LANGUAGE C; CREATE FUNCTION squeeze_table( tabschema name, tabname name, clustering_index name DEFAULT NULL, rel_tablespace name DEFAULT NULL, ind_tablespaces name[][] DEFAULT NULL) RETURNS void AS 'MODULE_PATHNAME', 'squeeze_table_new' LANGUAGE C; CREATE FUNCTION get_active_workers() RETURNS TABLE ( pid int4, tabschema name, tabname name, ins_initial bigint, ins bigint, upd bigint, del bigint) AS 'MODULE_PATHNAME', 'squeeze_get_active_workers' LANGUAGE C; ALTER TABLE log ADD COLUMN ins_initial bigint; ALTER TABLE log ADD COLUMN ins bigint; ALTER TABLE log ADD COLUMN upd bigint; ALTER TABLE log ADD COLUMN del bigint; pg_squeeze-REL1_8_0/pg_squeeze--1.6--1.7.sql000066400000000000000000000000441474466764000202430ustar00rootroot00000000000000-- No SQL changes for this upgrade. pg_squeeze-REL1_8_0/pg_squeeze--1.7--1.8.sql000066400000000000000000000000441474466764000202450ustar00rootroot00000000000000-- No SQL changes for this upgrade. pg_squeeze-REL1_8_0/pg_squeeze.c000066400000000000000000003106531474466764000167410ustar00rootroot00000000000000/*----------------------------------------------------- * * pg_squeeze.c * A tool to eliminate table bloat. 
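 *
 * (Overview, for orientation: the heavy lifting happens in
 * squeeze_table_internal(), reached either via the squeeze_table() SQL
 * function or from a squeeze background worker; the table contents are
 * copied into a transient table while logical decoding captures concurrent
 * changes, and the relation files are swapped at the end.)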
* * Copyright (c) 2016-2024, CYBERTEC PostgreSQL International GmbH * *----------------------------------------------------- */ #include "pg_squeeze.h" #if PG_VERSION_NUM >= 130000 #include "access/heaptoast.h" #endif #include "access/multixact.h" #include "access/sysattr.h" #if PG_VERSION_NUM >= 130000 #include "access/toast_internals.h" #include "access/xlogutils.h" #endif #if PG_VERSION_NUM >= 150000 #include "access/xloginsert.h" #endif #include "catalog/catalog.h" #include "catalog/dependency.h" #include "catalog/heap.h" #include "catalog/index.h" #include "catalog/indexing.h" #include "catalog/namespace.h" #include "catalog/objectaddress.h" #include "catalog/objectaccess.h" #include "catalog/pg_am.h" #include "catalog/pg_control.h" #include "catalog/pg_type.h" #include "catalog/pg_tablespace.h" #include "catalog/toasting.h" #include "commands/cluster.h" #include "commands/tablecmds.h" #include "commands/tablespace.h" #include "executor/executor.h" #include "lib/stringinfo.h" #include "nodes/primnodes.h" #include "nodes/makefuncs.h" #include "optimizer/optimizer.h" #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/lmgr.h" #include "storage/smgr.h" #include "storage/standbydefs.h" #include "tcop/tcopprot.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/fmgroids.h" #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" #include "utils/syscache.h" #if PG_VERSION_NUM < 150000 extern PGDLLIMPORT int wal_segment_size; extern PGDLLIMPORT bool FirstSnapshotSet; #endif #if PG_VERSION_NUM < 120000 #error "PostgreSQL version 12 or higher is required" #endif PG_MODULE_MAGIC; static void squeeze_table_internal(Name relschema, Name relname, Name indname, Name tbspname, ArrayType *ind_tbsp); static int index_cat_info_compare(const void *arg1, const void *arg2); /* Index-to-tablespace mapping. */ typedef struct IndexTablespace { Oid index; Oid tablespace; } IndexTablespace; /* Where should the new table and its indexes be located? */ typedef struct TablespaceInfo { Oid table; int nindexes; IndexTablespace *indexes; } TablespaceInfo; /* The WAL segment being decoded. 
*/ XLogSegNo squeeze_current_segment = 0; static void check_prerequisites(Relation rel); static LogicalDecodingContext *setup_decoding(Oid relid, TupleDesc tup_desc, Snapshot *snap_hist); static void decoding_cleanup(LogicalDecodingContext *ctx); static CatalogState *get_catalog_state(Oid relid); static void get_pg_class_info(Oid relid, TransactionId *xmin, Form_pg_class *form_p, TupleDesc *desc_p); static void get_attribute_info(Oid relid, int relnatts, TransactionId **xmins_p, CatalogState *cat_state); static void cache_composite_type_info(CatalogState *cat_state, Oid typid); static void get_composite_type_info(TypeCatInfo *tinfo); static IndexCatInfo *get_index_info(Oid relid, int *relninds, bool *found_invalid, bool invalid_check_only, bool *found_pk); static void check_attribute_changes(CatalogState *cat_state); static void check_index_changes(CatalogState *state); static void check_composite_type_changes(CatalogState *cat_state); static void free_catalog_state(CatalogState *state); static void check_pg_class_changes(CatalogState *state); static void free_tablespace_info(TablespaceInfo *tbsp_info); static void resolve_index_tablepaces(TablespaceInfo *tbsp_info, CatalogState *cat_state, ArrayType *ind_tbsp_a); static void perform_initial_load(Relation rel_src, RangeVar *cluster_idx_rv, Snapshot snap_hist, Relation rel_dst, LogicalDecodingContext *ctx); static bool has_dropped_attribute(Relation rel); static Oid create_transient_table(CatalogState *cat_state, TupleDesc tup_desc, Oid tablespace, Oid relowner); static Oid *build_transient_indexes(Relation rel_dst, Relation rel_src, Oid *indexes_src, int nindexes, TablespaceInfo *tbsp_info, CatalogState *cat_state, LogicalDecodingContext *ctx); static ScanKey build_identity_key(Oid ident_idx_oid, Relation rel_src, int *nentries); static bool perform_final_merge(Oid relid_src, Oid *indexes_src, int nindexes, Relation rel_dst, ScanKey ident_key, int ident_key_nentries, IndexInsertState *iistate, CatalogState *cat_state, LogicalDecodingContext *ctx); static void swap_relation_files(Oid r1, Oid r2); static void swap_toast_names(Oid relid1, Oid toastrelid1, Oid relid2, Oid toastrelid2); #if PG_VERSION_NUM < 130000 static Oid get_toast_index(Oid toastrelid); #endif /* * The maximum time to hold AccessExclusiveLock during the final * processing. Note that only process_concurrent_changes() execution time * is included here. The very last steps like swap_relation_files() and * swap_toast_names() shouldn't get blocked and it'd be wrong to consider them * a reason to abort otherwise completed processing. */ int squeeze_max_xlock_time = 0; /* * List of database names for which the background worker should be started * during cluster startup. (We require OIDs because there seems to be no good * way to pass a list of database names w/o adding restrictions on character * set characters.) */ char *squeeze_worker_autostart = NULL; /* * Role on behalf of which automatically-started worker connects to * database(s). */ char *squeeze_worker_role = NULL; /* The number of squeeze workers per database.
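 *
 * A minimal configuration sketch (postgresql.conf; the database and role
 * names below are made-up examples, not defaults):
 *
 *   shared_preload_libraries = 'pg_squeeze'
 *   wal_level = logical
 *   squeeze.worker_autostart = 'mydb'
 *   squeeze.worker_role = 'postgres'
 *   squeeze.workers_per_database = 2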
*/ int squeeze_workers_per_database = 1; void _PG_init(void) { if (!process_shared_preload_libraries_in_progress) ereport(ERROR, (errmsg("pg_squeeze must be loaded via shared_preload_libraries"))); #if PG_VERSION_NUM >= 150000 squeeze_save_prev_shmem_request_hook(); shmem_request_hook = squeeze_worker_shmem_request; #else squeeze_worker_shmem_request(); #endif squeeze_save_prev_shmem_startup_hook(); shmem_startup_hook = squeeze_worker_shmem_startup; DefineCustomStringVariable( "squeeze.worker_autostart", "Names of databases for which background workers start automatically.", "Comma-separated list of databases for which a squeeze worker starts as soon as " "the cluster startup has completed.", &squeeze_worker_autostart, NULL, PGC_POSTMASTER, 0, NULL, NULL, NULL); DefineCustomStringVariable( "squeeze.worker_role", "Role that background workers use to connect to database.", "If background worker was launched automatically on cluster startup, " "it uses this role to initiate database connection(s).", &squeeze_worker_role, NULL, PGC_POSTMASTER, 0, NULL, NULL, NULL); DefineCustomIntVariable( "squeeze.workers_per_database", "Maximum number of squeeze worker processes launched for each database.", NULL, &squeeze_workers_per_database, 1, 1, max_worker_processes, PGC_POSTMASTER, 0, /* * Assume that the in-core GUC max_worker_processes should already be * assigned and checked before the loading of the modules starts. Since * the context of both this GUC and the max_worker_processes is * PGC_POSTMASTER, no future check should be needed. (Some in-core GUCs * that reference other ones have the hooks despite being PGC_POSTMASTER, * but the reason seems to be that those cannot assume anything about the * order of checking.) */ NULL, NULL, NULL); if (squeeze_worker_autostart) { List *dbnames = NIL; char *dbname, *c; int len; ListCell *lc; if (squeeze_worker_role == NULL) ereport(ERROR, (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING), (errmsg("\"squeeze.worker_role\" parameter is invalid or not set")))); c = squeeze_worker_autostart; len = 0; dbname = NULL; while (true) { bool done; done = *c == '\0'; if (done || isspace(*c)) { if (dbname != NULL) { /* The current item ends here. */ Assert(len > 0); dbnames = lappend(dbnames, pnstrdup(dbname, len)); dbname = NULL; len = 0; } if (done) break; } else { /* * Start a new item or add the character to the current one. */ if (dbname == NULL) { dbname = c; len = 1; } else len++; } c++; } if (list_length(dbnames) == 0) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errmsg("\"squeeze.worker_autostart\" parameter is empty")))); foreach(lc, dbnames) { WorkerConInit *con; BackgroundWorker worker; dbname = lfirst(lc); con = allocate_worker_con_info(dbname, squeeze_worker_role); squeeze_initialize_bgworker(&worker, con, NULL, 0); RegisterBackgroundWorker(&worker); } list_free_deep(dbnames); } DefineCustomIntVariable( "squeeze.max_xlock_time", "The maximum time the processed table may be locked exclusively.", "The source table is locked exclusively during the final stage of " "processing. If the lock time should exceed this value, the lock is " "released and the final stage is retried a few more times.", &squeeze_max_xlock_time, 0, 0, INT_MAX, PGC_USERSET, GUC_UNIT_MS, NULL, NULL, NULL); } /* * The original implementation would certainly fail on PG 16 and higher, due * to the commit 240e0dbacd (in the master branch). It's not worth supporting * lower versions of pg_squeeze on lower versions of PG server.
*/ extern Datum squeeze_table(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(squeeze_table); Datum squeeze_table(PG_FUNCTION_ARGS) { ereport(ERROR, (errmsg("the old implementation of the function is no longer supported"), errhint("please run \"ALTER EXTENSION pg_squeeze UPDATE\""))); PG_RETURN_VOID(); } /* * A substitute for CHECK_FOR_INTERRUPTS. * * procsignal_sigusr1_handler does not support signaling from a backend to a * non-parallel worker (see the values of ProcSignalReason), and an extension * has no other way to set the flags checked by ProcessInterrupts(), so the * worker cannot use CHECK_FOR_INTERRUPTS. Let's use shared memory to tell the * worker that it should exit. (SIGTERM would terminate the worker easily, * but due to race conditions we could terminate another backend / worker * which already managed to reuse this worker's PID.) */ void exit_if_requested(void) { bool exit_requested; SpinLockAcquire(&MyWorkerTask->mutex); exit_requested = MyWorkerTask->exit_requested; SpinLockRelease(&MyWorkerTask->mutex); if (!exit_requested) return; /* * Message similar to that in ProcessInterrupts(), but ERROR is * sufficient here. squeeze_table_impl() should catch it. */ ereport(ERROR, (errcode(ERRCODE_ADMIN_SHUTDOWN), errmsg("terminating pg_squeeze background worker due to administrator command"))); } /* * Introduced in pg_squeeze 1.6, to be called directly as opposed to calling * via the postgres executor. * * Return true if succeeded. If failed, copy useful information into *edata_p * and return false. */ bool squeeze_table_impl(Name relschema, Name relname, Name indname, Name tbspname, ArrayType *ind_tbsp, ErrorData **edata_p, MemoryContext edata_cxt) { bool result; PG_TRY(); { squeeze_table_internal(relschema, relname, indname, tbspname, ind_tbsp); result = true; } PG_CATCH(); { squeeze_handle_error_db(edata_p, edata_cxt); result = false; } PG_END_TRY(); return result; } static void squeeze_table_internal(Name relschema, Name relname, Name indname, Name tbspname, ArrayType *ind_tbsp) { RangeVar *relrv_src; RangeVar *relrv_cl_idx = NULL; Relation rel_src, rel_dst; Oid rel_src_owner; Oid ident_idx_src, ident_idx_dst; Oid relid_src, relid_dst; Oid toastrelid_src, toastrelid_dst; char replident; ScanKey ident_key; int i, ident_key_nentries; IndexInsertState *iistate; LogicalDecodingContext *ctx; ReplicationSlot *slot; Snapshot snap_hist; TupleDesc tup_desc; CatalogState *cat_state; XLogRecPtr end_of_wal; XLogRecPtr xlog_insert_ptr; int nindexes; Oid *indexes_src = NULL, *indexes_dst = NULL; bool invalid_index = false; IndexCatInfo *ind_info; TablespaceInfo *tbsp_info; ObjectAddress object; bool source_finalized; bool xmin_valid; relrv_src = makeRangeVar(NameStr(*relschema), NameStr(*relname), -1); rel_src = table_openrv(relrv_src, AccessShareLock); check_prerequisites(rel_src); /* * Retrieve the useful info while holding lock on the relation. */ ident_idx_src = RelationGetReplicaIndex(rel_src); replident = rel_src->rd_rel->relreplident; /* The table can have PK although the replica identity is FULL. */ if (ident_idx_src == InvalidOid && rel_src->rd_pkindex != InvalidOid) ident_idx_src = rel_src->rd_pkindex; relid_src = RelationGetRelid(rel_src); rel_src_owner = RelationGetForm(rel_src)->relowner; toastrelid_src = rel_src->rd_rel->reltoastrelid; /* * Info to create transient table and to initialize tuplestore we'll use * during logical decoding. */ tup_desc = CreateTupleDescCopy(RelationGetDescr(rel_src)); /* * Get ready for the subsequent calls of check_catalog_changes().
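 *
 * (For instance, a concurrent ALTER TABLE ... ADD COLUMN modifies catalog
 * rows whose xmin values these checks compare, so it would be detected;
 * see check_pg_class_changes() and check_attribute_changes().)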
* * Not all index changes conflict with the AccessShareLock - see * get_index_info() for explanation. * * XXX It'd still be correct to start the check a bit later, i.e. just * before CreateInitDecodingContext(), but the gain is not worth making * the code less readable. */ cat_state = get_catalog_state(relid_src); /* Give up if it's clear enough to do so. */ if (cat_state->invalid_index) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), (errmsg("At least one index is invalid")))); /* * The relation shouldn't be locked during the call of setup_decoding(), * otherwise another transaction could write XLOG records before the * slots' data.restart_lsn and we'd have to wait for it to finish. If such * a transaction requested exclusive lock on our relation (e.g. ALTER * TABLE), it'd result in a deadlock. * * We can't keep the lock till the end of transaction anyway - that's why * check_catalog_changes() exists. * * XXX Now that the squeeze worker launched by the scheduler worker no * longer needs to call DecodingContextFindStartpoint(), it should not see * running transactions that started before the restart_lsn, so it's * probably no longer necessary to close the relation here. (The worker * launched by the squeeze_table() function does call * DecodingContextFindStartpoint(), however it does so before the current * transaction is started.) Reconsider. */ table_close(rel_src, AccessShareLock); /* * Check if we're ready to capture changes that possibly take place during * the initial load. * * Concurrent DDL causes ERROR in any case, so don't worry about validity * of this test during the next steps. * * Note: we let the plugin do this check on a per-change basis, and allow * processing of tables with no identity if only INSERT changes are * decoded. However it seems inconsistent. * * XXX Although ERRCODE_UNIQUE_VIOLATION is not an actual "unique * violation", this error code seems to be the best match. * (ERRCODE_TRIGGERED_ACTION_EXCEPTION might be worth consideration as * well.) */ if (replident == REPLICA_IDENTITY_NOTHING || (replident == REPLICA_IDENTITY_DEFAULT && !OidIsValid(ident_idx_src))) ereport(ERROR, (errcode(ERRCODE_UNIQUE_VIOLATION), (errmsg("Table \"%s\".\"%s\" has no identity index", NameStr(*relschema), NameStr(*relname))))); /* * Change processing w/o PK index is not a good idea. * * Note that some users need the "full identity" although the table does * have PK. ("full identity" + UNIQUE constraint is also a valid setup, * but it's harder to check). */ if (replident == REPLICA_IDENTITY_FULL && !cat_state->have_pk_index) ereport(ERROR, (errcode(ERRCODE_UNIQUE_VIOLATION), (errmsg("Replica identity \"full\" not supported")))); /* * Clustering index, if any. * * Do not lock the index so far, e.g. just to retrieve OID and to keep it * valid. Neither the relation can be locked continuously, so by keeping * the index locked alone we'd introduce incorrect order of locking. * Although we use only share locks in most cases (so I'm not aware of * particular deadlock scenario), it doesn't seem wise. The worst * consequence of not locking is that perform_initial_load() will error * out. */ if (indname) { ereport(DEBUG1, (errmsg("clustering index: %s", NameStr(*indname)))); relrv_cl_idx = makeRangeVar(NameStr(*relschema), NameStr(*indname), -1); } /* * Process tablespace arguments, if provided. * * XXX Currently we consider tablespace DDLs rather infrequent, so we let * such a DDL break transient table or index creation.
As we can't * keep the source table locked all the time, it's possible for a tablespace * to disappear even if it contains the source table. Is it worth locking * the tablespaces here? Since concurrent renaming of a tablespace is * disruptive too, we'd probably need AccessExclusiveLock. Or are such * changes worth making check_catalog_changes() more expensive? */ tbsp_info = (TablespaceInfo *) palloc0(sizeof(TablespaceInfo)); if (tbspname) tbsp_info->table = get_tablespace_oid(pstrdup(NameStr(*tbspname)), false); else tbsp_info->table = cat_state->form_class->reltablespace; /* Index-to-tablespace mappings. */ if (ind_tbsp) resolve_index_tablepaces(tbsp_info, cat_state, ind_tbsp); nindexes = cat_state->relninds; /* * Existence of identity index was checked above, so number of indexes and * attributes are both non-zero. */ Assert(cat_state->form_class->relnatts >= 1); Assert(nindexes > 0); /* Copy the OIDs into a separate array, for convenient use later. */ indexes_src = (Oid *) palloc(nindexes * sizeof(Oid)); for (i = 0; i < nindexes; i++) indexes_src[i] = cat_state->indexes[i].oid; ctx = setup_decoding(relid_src, tup_desc, &snap_hist); relid_dst = create_transient_table(cat_state, tup_desc, tbsp_info->table, rel_src_owner); /* The source relation will be needed for the initial load. */ rel_src = table_open(relid_src, AccessShareLock); /* * The new relation should not be visible to other transactions until we * commit, but exclusive lock just makes sense. */ rel_dst = table_open(relid_dst, AccessExclusiveLock); toastrelid_dst = rel_dst->rd_rel->reltoastrelid; /* * We need to make sure that no DDL took place that would allow for data * inconsistency. The relation was unlocked for some time since last * check, so pass NoLock. */ check_catalog_changes(cat_state, NoLock); /* * This is to satisfy the check introduced by the commit 2776922201f in PG * core. (Per HeapTupleSatisfiesToast() the snapshot shouldn't actually be * used for visibility checks of the TOAST values.) */ PushActiveSnapshot(snap_hist); /* * The historic snapshot is used to retrieve data w/o concurrent changes. */ perform_initial_load(rel_src, relrv_cl_idx, snap_hist, rel_dst, ctx); /* * We no longer need to preserve the rows processed during the initial * load from VACUUM. (User should not run VACUUM on a table that we * currently process, but our stale effective_xmin would also restrict * VACUUM on other tables.) */ slot = ctx->slot; SpinLockAcquire(&slot->mutex); xmin_valid = TransactionIdIsValid(slot->effective_xmin); slot->effective_xmin = InvalidTransactionId; SpinLockRelease(&slot->mutex); /* * This should not happen, but it's critical, therefore use ereport() * rather than Assert(). If the value got lost somehow due to releasing * and acquiring the slot, VACUUM could have removed some rows from the * source table that the historic snapshot was still supposed to see. */ if (!xmin_valid) ereport(ERROR, (errmsg("effective_xmin of the replication slot \"%s\" is invalid", NameStr(slot->data.name)))); /* * The historic snapshot won't be needed anymore. */ PopActiveSnapshot(); /* * This is more paranoia than anything else --- perform_initial_load() * uses each snapshot to access a different table, and it does not cause * catalog changes. */ InvalidateSystemCaches(); /* * Check for concurrent changes that would make us stop working later. * Index build can take quite some effort and we don't want to waste it.
* * Note: By still holding the share lock we only ensure that the source * relation is not altered underneath index build, but we'll have to * release the lock for a short time at some point. So while we can't * prevent anyone from forcing us to cancel our work, such cancellation * must happen at well-defined moment. */ check_catalog_changes(cat_state, AccessShareLock); /* * Make sure the contents of the transient table are visible for the * scan(s) during index build. */ CommandCounterIncrement(); /* * Create indexes on the temporary table - that might take a while. * (Unlike the concurrent changes, which we insert into existing indexes.) */ PushActiveSnapshot(GetTransactionSnapshot()); indexes_dst = build_transient_indexes(rel_dst, rel_src, indexes_src, nindexes, tbsp_info, cat_state, ctx); PopActiveSnapshot(); /* * Make the identity index of the transient table visible, for the sake of * concurrent UPDATEs and DELETEs. */ CommandCounterIncrement(); /* Tablespace info is no longer needed. */ free_tablespace_info(tbsp_info); /* * Build scan key that we'll use to look for rows to be updated / deleted * during logical decoding. */ ident_key = build_identity_key(ident_idx_src, rel_src, &ident_key_nentries); /* * As we'll need to take exclusive lock later, release the shared one. * * Note: PG core code shouldn't actually participate in such a deadlock, * as it (supposedly) does not raise lock level. Nor should concurrent * call of the squeeze_table() function participate in the deadlock, * because it should have failed much earlier when creating an existing * logical replication slot again. Nevertheless, these circumstances still * don't justify generally bad practice. * * (As we haven't changed the catalog entry yet, there's no need to send * invalidation messages.) */ table_close(rel_src, AccessShareLock); /* * Valid identity index should exist now, see the identity checks above. */ Assert(OidIsValid(ident_idx_src)); /* Find "identity index" of the transient relation. */ ident_idx_dst = InvalidOid; for (i = 0; i < nindexes; i++) { if (ident_idx_src == indexes_src[i]) { ident_idx_dst = indexes_dst[i]; break; } } if (!OidIsValid(ident_idx_dst)) /* * Should not happen, concurrent DDLs should have been noticed a short * time ago. */ elog(ERROR, "Identity index missing on the transient relation"); /* Executor state to update indexes. */ iistate = get_index_insert_state(rel_dst, ident_idx_dst); /* * Flush all WAL records inserted so far (possibly except for the last * incomplete page, see GetInsertRecPtr), to minimize the amount of data * we need to flush while holding exclusive lock on the source table. */ xlog_insert_ptr = GetInsertRecPtr(); XLogFlush(xlog_insert_ptr); /* * Since we'll do some more changes, all the WAL records flushed so far * need to be decoded for sure. */ #if PG_VERSION_NUM >= 150000 end_of_wal = GetFlushRecPtr(NULL); #else end_of_wal = GetFlushRecPtr(); #endif /* * Decode and apply the data changes that occurred while the initial load * was in progress. The XLOG reader should continue where setup_decoding() * has left it. * * Even if the amount of concurrent changes of our source table might not * be significant, both initial load and index build could have produced * many XLOG records that we need to read. Do so before requesting * exclusive lock on the source relation.
*/ process_concurrent_changes(ctx, end_of_wal, cat_state, rel_dst, ident_key, ident_key_nentries, iistate, NoLock, NULL); /* * This (supposedly cheap) special check should avoid one particular * deadlock scenario: another transaction, performing index DDL * concurrently (e.g. DROP INDEX CONCURRENTLY) committed change of * indisvalid, indisready, ... and called WaitForLockers() before we * unlocked both source table and its indexes above. WaitForLockers() * waits till the end of the holding (our) transaction as opposed to the * end of our locks, and the other transaction holds (non-exclusive) lock * on both relation and index. In this situation we'd cause deadlock by * requesting exclusive lock. We should recognize this scenario by * checking pg_index alone. */ ind_info = get_index_info(relid_src, NULL, &invalid_index, true, NULL); if (invalid_index) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), errmsg("Concurrent change of index detected"))); else pfree(ind_info); /* * Try a few times to perform the stage that requires exclusive lock on * the source relation. * * XXX Not sure the number of attempts should be configurable. If it fails * several times, admin should either increase squeeze_max_xlock_time or * disable it. */ source_finalized = false; for (i = 0; i < 4; i++) { if (perform_final_merge(relid_src, indexes_src, nindexes, rel_dst, ident_key, ident_key_nentries, iistate, cat_state, ctx)) { source_finalized = true; break; } else elog(DEBUG1, "pg_squeeze: exclusive lock on table %u had to be released.", relid_src); } if (!source_finalized) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), errmsg("\"squeeze_max_xlock_time\" prevented squeeze from completion"))); /* * Done with decoding. * * XXX decoding_cleanup() frees tup_desc, although we've used it not only * for the decoding. */ decoding_cleanup(ctx); ReplicationSlotRelease(); pfree(ident_key); free_index_insert_state(iistate); /* The destination table is no longer necessary, so close it. */ /* * XXX (Should have been closed right after process_concurrent_changes()?) */ table_close(rel_dst, AccessExclusiveLock); /* * Exchange storage (including TOAST) and indexes between the source and * destination tables. */ swap_relation_files(relid_src, relid_dst); CommandCounterIncrement(); /* * As swap_relation_files() already changed pg_class(reltoastrelid), we * pass toastrelid_dst for relid_src and vice versa. */ swap_toast_names(relid_src, toastrelid_dst, relid_dst, toastrelid_src); for (i = 0; i < nindexes; i++) swap_relation_files(indexes_src[i], indexes_dst[i]); CommandCounterIncrement(); if (nindexes > 0) { pfree(indexes_src); pfree(indexes_dst); } /* State not needed anymore. */ free_catalog_state(cat_state); /* * Drop the transient table including indexes (and possibly constraints on * those indexes). */ object.classId = RelationRelationId; object.objectSubId = 0; object.objectId = relid_dst; performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL); } static int index_cat_info_compare(const void *arg1, const void *arg2) { IndexCatInfo *i1 = (IndexCatInfo *) arg1; IndexCatInfo *i2 = (IndexCatInfo *) arg2; if (i1->oid > i2->oid) return 1; else if (i1->oid < i2->oid) return -1; else return 0; } /* * Raise error if the relation is not eligible for squeezing or any adverse * conditions exist. * * Some of the checks may be redundant (e.g. heap_open() checks relkind) but * it's safer to have them all listed here.
*/ static void check_prerequisites(Relation rel) { Form_pg_class form = RelationGetForm(rel); /* * The extension is not generic enough to handle AMs other than "heap". */ if (form->relam != HEAP_TABLE_AM_OID) ereport(ERROR, (errmsg("pg_squeeze only supports the \"heap\" access method"))); /* Check the relation first. */ if (form->relkind == RELKIND_PARTITIONED_TABLE) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot squeeze partitioned table"))); if (form->relkind != RELKIND_RELATION) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not a table", RelationGetRelationName(rel)))); if (form->relpersistence != RELPERSISTENCE_PERMANENT) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not a regular table", RelationGetRelationName(rel)))); if (form->relisshared) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is shared relation", RelationGetRelationName(rel)))); if (IsCatalogRelation(rel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is a catalog relation", RelationGetRelationName(rel)))); /* * We cannot simply replace the storage of a mapped relation. * * The previous check should have caught them, but let's try hard to be * safe. */ if (RelationIsMapped(rel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is mapped relation", RelationGetRelationName(rel)))); /* * There's no urgent need to process catalog tables. * * Should this limitation be relaxed someday, consider if we need to write * xl_heap_rewrite_mapping records. (Probably not because the whole * "decoding session" takes place within a call of squeeze_table() and our * catalog checks should not allow for a concurrent rewrite that could * make snapmgr.c:tuplecid_data obsolete. Furthermore, such a rewrite * would have to take place before perform_initial_load(), but this is * called before any transactions could have been decoded, so tuplecid * should still be empty anyway.) */ if (RelationGetRelid(rel) < FirstNormalObjectId) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not user relation", RelationGetRelationName(rel)))); /* * While an AFTER trigger should not be an issue (a transaction that * generates an event must have got an XID assigned, causing * setup_decoding() to fail later), open cursor might be. See comments of * the function for details. */ CheckTableNotInUse(rel, "squeeze_table()"); } /* * Acquire the logical replication slot which was created either by the * scheduler worker or by a backend executing the squeeze_table() function. */ static LogicalDecodingContext * setup_decoding(Oid relid, TupleDesc tup_desc, Snapshot *snap_hist) { ReplSlotStatus *repl_slot = &MyWorkerTask->repl_slot; DecodingOutputState *dstate; MemoryContext oldcontext; LogicalDecodingContext *ctx; XLogRecPtr restart_lsn; dsm_segment *seg = NULL; char *snap_src; /* * Use the slot initialized by the scheduler worker (or by the backend * running the squeeze_table() function). */ ReplicationSlotAcquire(NameStr(repl_slot->name), true); /* * This should not really happen, but if it did, the initial load could * miss some data. */ if (!TransactionIdIsValid(MyReplicationSlot->effective_xmin)) ereport(ERROR, (errmsg("replication slot \"%s\" has invalid effective_xmin", NameStr(repl_slot->name)))); /* * It's pretty unlikely for some client to have consumed data changes * (accidentally?) before this worker could acquire the slot, but it's * easy enough to check.
*/ if (MyReplicationSlot->data.confirmed_flush != repl_slot->confirmed_flush) ereport(ERROR, (errmsg("replication slot \"%s\" has incorrect confirm position", NameStr(repl_slot->name)))); /* * Wasn't effective_xmin lost due to releasing and re-acquiring the slot? * (ReplicationSlotRelease() does clear it in some cases. We try to avoid * that, but checking makes sense as this slot field is critical.) */ if (!TransactionIdIsValid(MyReplicationSlot->effective_xmin)) ereport(ERROR, (errmsg("replication slot \"%s\" has invalid effective_xmin", NameStr(MyReplicationSlot->data.name)))); restart_lsn = MyReplicationSlot->data.restart_lsn; /* Restart the decoding context at slot's confirmed_flush */ ctx = CreateDecodingContext(InvalidXLogRecPtr, NIL, false, #if PG_VERSION_NUM >= 130000 XL_ROUTINE(.page_read = read_local_xlog_page, .segment_open = wal_segment_open, .segment_close = wal_segment_close), #else logical_read_local_xlog_page, #endif NULL, NULL, NULL); #if PG_VERSION_NUM >= 130000 /* decode_concurrent_changes() handles the older versions. */ XLogBeginRead(ctx->reader, MyReplicationSlot->data.restart_lsn); #endif XLByteToSeg(restart_lsn, squeeze_current_segment, wal_segment_size); /* * Set up structures to store decoded changes. */ oldcontext = MemoryContextSwitchTo(TopTransactionContext); dstate = palloc0(sizeof(DecodingOutputState)); dstate->relid = relid; dstate->tstore = tuplestore_begin_heap(false, false, maintenance_work_mem); dstate->tupdesc = tup_desc; /* Initialize the descriptor to store the changes ... */ dstate->tupdesc_change = CreateTemplateTupleDesc(1); TupleDescInitEntry(dstate->tupdesc_change, 1, NULL, BYTEAOID, -1, 0); /* ... as well as the corresponding slot. */ dstate->tsslot = MakeSingleTupleTableSlot(dstate->tupdesc_change, &TTSOpsMinimalTuple); dstate->resowner = ResourceOwnerCreate(CurrentResourceOwner, "logical decoding"); MemoryContextSwitchTo(oldcontext); ctx->output_writer_private = dstate; /* Retrieve the historic snapshot. */ if (repl_slot->snap_handle != DSM_HANDLE_INVALID) { seg = dsm_attach(repl_slot->snap_handle); snap_src = (char *) dsm_segment_address(seg); } else snap_src = repl_slot->snap_private; *snap_hist = RestoreSnapshot(snap_src); if (seg) dsm_detach(seg); return ctx; } static void decoding_cleanup(LogicalDecodingContext *ctx) { DecodingOutputState *dstate; dstate = (DecodingOutputState *) ctx->output_writer_private; ExecDropSingleTupleTableSlot(dstate->tsslot); FreeTupleDesc(dstate->tupdesc_change); FreeTupleDesc(dstate->tupdesc); tuplestore_end(dstate->tstore); FreeDecodingContext(ctx); } /* * Retrieve the catalog state to be passed later to check_catalog_changes. * * Caller is supposed to hold (at least) AccessShareLock on the relation. */ static CatalogState * get_catalog_state(Oid relid) { CatalogState *result; result = (CatalogState *) palloc0(sizeof(CatalogState)); result->rel.relid = relid; /* * pg_class(xmin) helps to ensure that the "user_catalog_option" wasn't * turned off and on. On the other hand it might restrict some concurrent * DDLs that would be safe as such. */ get_pg_class_info(relid, &result->rel.xmin, &result->form_class, &result->desc_class); result->rel.relnatts = result->form_class->relnatts; /* * We might want to avoid the check if relhasindex is false, but * index_update_stats() updates this field in-place. (Currently it should * not change from "true" to "false", but let's be cautious anyway.)
*/ result->indexes = get_index_info(relid, &result->relninds, &result->invalid_index, false, &result->have_pk_index); /* If any index is "invalid", no more catalog information is needed. */ if (result->invalid_index) return result; if (result->form_class->relnatts > 0) get_attribute_info(relid, result->form_class->relnatts, &result->rel.attr_xmins, result); return result; } /* * For given relid retrieve pg_class(xmin). Also set *form and *desc if valid * pointers are passed. */ static void get_pg_class_info(Oid relid, TransactionId *xmin, Form_pg_class *form_p, TupleDesc *desc_p) { HeapTuple tuple; Form_pg_class form_class; Relation rel; SysScanDesc scan; ScanKeyData key[1]; /* * ScanPgRelation() would do most of the work below, but relcache.c does * not export it. */ rel = table_open(RelationRelationId, AccessShareLock); ScanKeyInit(&key[0], Anum_pg_class_oid, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(relid)); scan = systable_beginscan(rel, ClassOidIndexId, true, NULL, 1, key); tuple = systable_getnext(scan); /* * As the relation might not be locked by some callers, it could have * disappeared. */ if (!HeapTupleIsValid(tuple)) { ereport(ERROR, (errcode(ERRCODE_UNDEFINED_TABLE), (errmsg("Table no longer exists")))); } /* Invalid relfilenode indicates mapped relation. */ form_class = (Form_pg_class) GETSTRUCT(tuple); if (form_class->relfilenode == InvalidOid) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), (errmsg("Mapped relation cannot be squeezed")))); *xmin = HeapTupleHeaderGetXmin(tuple->t_data); if (form_p) { *form_p = (Form_pg_class) palloc(CLASS_TUPLE_SIZE); memcpy(*form_p, form_class, CLASS_TUPLE_SIZE); } if (desc_p) *desc_p = CreateTupleDescCopy(RelationGetDescr(rel)); systable_endscan(scan); table_close(rel, AccessShareLock); } /* * Retrieve array of pg_attribute(xmin) values for given relation, ordered by * attnum. (The ordering is not essential but lets us do some extra sanity * checks.) * * If cat_state is passed and the attribute is of a composite type, make sure * it's cached in ->comptypes. */ static void get_attribute_info(Oid relid, int relnatts, TransactionId **xmins_p, CatalogState *cat_state) { Relation rel; ScanKeyData key[2]; SysScanDesc scan; HeapTuple tuple; TransactionId *result; int n = 0; rel = table_open(AttributeRelationId, AccessShareLock); ScanKeyInit(&key[0], Anum_pg_attribute_attrelid, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(relid)); /* System columns should not be ALTERed. */ ScanKeyInit(&key[1], Anum_pg_attribute_attnum, BTGreaterStrategyNumber, F_INT2GT, Int16GetDatum(0)); scan = systable_beginscan(rel, AttributeRelidNumIndexId, true, NULL, 2, key); result = (TransactionId *) palloc(relnatts * sizeof(TransactionId)); while ((tuple = systable_getnext(scan)) != NULL) { Form_pg_attribute form; int i; Assert(HeapTupleIsValid(tuple)); form = (Form_pg_attribute) GETSTRUCT(tuple); Assert(form->attnum > 0); /* AttributeRelidNumIndexId index ensures ordering. */ i = form->attnum - 1; Assert(i == n); /* * Caller should hold at least AccessShareLock on the owning relation, * supposedly no need for repalloc(). (elog() rather than Assert() as * it's not difficult to break this assumption during future coding.) */ if (n++ > relnatts) elog(ERROR, "Relation %u has too many attributes", relid); result[i] = HeapTupleHeaderGetXmin(tuple->t_data); /* * Gather composite type info if needed.
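 *
 * (E.g. a column declared with CREATE TYPE complex AS (r float8, i float8)
 * -- a made-up example -- can change under ALTER TYPE, which is why
 * check_composite_type_changes() later compares the cached state.)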
*/ if (cat_state != NULL && get_typtype(form->atttypid) == TYPTYPE_COMPOSITE) cache_composite_type_info(cat_state, form->atttypid); } Assert(relnatts == n); systable_endscan(scan); table_close(rel, AccessShareLock); *xmins_p = result; } /* * Make sure that information on a type that caller has recognized as * composite type is cached in cat_state. */ static void cache_composite_type_info(CatalogState *cat_state, Oid typid) { int i; bool found = false; TypeCatInfo *tinfo; /* Check if we already have this type. */ for (i = 0; i < cat_state->ncomptypes; i++) { tinfo = &cat_state->comptypes[i]; if (tinfo->oid == typid) { found = true; break; } } if (found) return; /* Extend the comptypes array if necessary. */ if (cat_state->ncomptypes == cat_state->ncomptypes_max) { if (cat_state->ncomptypes_max == 0) { Assert(cat_state->comptypes == NULL); cat_state->ncomptypes_max = 2; cat_state->comptypes = (TypeCatInfo *) palloc(cat_state->ncomptypes_max * sizeof(TypeCatInfo)); } else { cat_state->ncomptypes_max *= 2; cat_state->comptypes = (TypeCatInfo *) repalloc(cat_state->comptypes, cat_state->ncomptypes_max * sizeof(TypeCatInfo)); } } tinfo = &cat_state->comptypes[cat_state->ncomptypes]; tinfo->oid = typid; get_composite_type_info(tinfo); cat_state->ncomptypes++; } /* * Retrieve information on a type that caller has recognized as composite * type. tinfo->oid must be initialized. */ static void get_composite_type_info(TypeCatInfo *tinfo) { Relation rel; ScanKeyData key[1]; SysScanDesc scan; HeapTuple tuple; Form_pg_type form_type; Form_pg_class form_class; Assert(tinfo->oid != InvalidOid); /* Find the pg_type tuple. */ rel = table_open(TypeRelationId, AccessShareLock); ScanKeyInit(&key[0], Anum_pg_type_oid, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(tinfo->oid)); scan = systable_beginscan(rel, TypeOidIndexId, true, NULL, 1, key); tuple = systable_getnext(scan); if (!HeapTupleIsValid(tuple)) elog(ERROR, "composite type %u not found", tinfo->oid); form_type = (Form_pg_type) GETSTRUCT(tuple); Assert(form_type->typtype == TYPTYPE_COMPOSITE); /* Initialize the structure. */ tinfo->xmin = HeapTupleHeaderGetXmin(tuple->t_data); /* * Retrieve the pg_class tuple that represents the composite type, as well * as the corresponding pg_attribute tuples. */ tinfo->rel.relid = form_type->typrelid; get_pg_class_info(form_type->typrelid, &tinfo->rel.xmin, &form_class, NULL); if (form_class->relnatts > 0) get_attribute_info(form_type->typrelid, form_class->relnatts, &tinfo->rel.attr_xmins, NULL); else tinfo->rel.attr_xmins = NULL; tinfo->rel.relnatts = form_class->relnatts; pfree(form_class); systable_endscan(scan); table_close(rel, AccessShareLock); } /* * Retrieve pg_class(oid) and pg_class(xmin) for each index of given * relation. * * If at least one index appears to be problematic in terms of concurrency, * *found_invalid receives true and retrieval of index information ends * immediately. * * If invalid_check_only is true, return after having verified that all * indexes are valid. * * Note that some index DDLs can commit while this function is called from * get_catalog_state(). If we manage to see these changes, our result includes * them and they'll affect the transient table. If any such change gets * committed later and we miss it, it'll be identified as disruptive by * check_catalog_changes(). After all, there should be no dangerous race * conditions. 
 */
static IndexCatInfo *
get_index_info(Oid relid, int *relninds, bool *found_invalid,
			   bool invalid_check_only, bool *found_pk)
{
	Relation	rel,
				rel_idx;
	ScanKeyData key[1];
	SysScanDesc scan;
	HeapTuple	tuple;
	IndexCatInfo *result;
	int			i,
				n = 0;
	int			relninds_max = 4;
	Datum	   *oids_d;
	int16		oidlen;
	bool		oidbyval;
	char		oidalign;
	ArrayType  *oids_a;
	bool		mismatch;

	*found_invalid = false;
	if (found_pk)
		*found_pk = false;

	/*
	 * Open both pg_class and pg_index catalogs at once, so that we have a
	 * consistent view in terms of invalidation. Otherwise we might get a
	 * different snapshot for each. Thus, in-progress index changes that do
	 * not conflict with AccessShareLock on the parent table could trigger
	 * false alarms later in check_catalog_changes().
	 */
	rel = table_open(RelationRelationId, AccessShareLock);
	rel_idx = table_open(IndexRelationId, AccessShareLock);

	ScanKeyInit(&key[0], Anum_pg_index_indrelid, BTEqualStrategyNumber,
				F_OIDEQ, ObjectIdGetDatum(relid));
	scan = systable_beginscan(rel_idx, IndexIndrelidIndexId, true, NULL, 1,
							  key);

	result = (IndexCatInfo *) palloc(relninds_max * sizeof(IndexCatInfo));
	while ((tuple = systable_getnext(scan)) != NULL)
	{
		Form_pg_index form;
		IndexCatInfo *res_entry;

		form = (Form_pg_index) GETSTRUCT(tuple);

		/*
		 * First, perform the simple checks that can make the next work
		 * unnecessary.
		 */
		if (!form->indisvalid || !form->indisready || !form->indislive)
		{
			*found_invalid = true;
			break;
		}

		res_entry = (IndexCatInfo *) &result[n++];
		res_entry->oid = form->indexrelid;
		res_entry->xmin = HeapTupleHeaderGetXmin(tuple->t_data);

		if (found_pk && form->indisprimary)
			*found_pk = true;

		/*
		 * Unlike get_attribute_info(), we can't receive the expected number
		 * of entries from the caller.
		 */
		if (n == relninds_max)
		{
			relninds_max *= 2;
			result = (IndexCatInfo *)
				repalloc(result, relninds_max * sizeof(IndexCatInfo));
		}
	}
	systable_endscan(scan);
	table_close(rel_idx, AccessShareLock);

	/* Return if an invalid index was found or ... */
	if (*found_invalid)
	{
		table_close(rel, AccessShareLock);
		return result;
	}
	/* ... the caller is not interested in anything else. */
	if (invalid_check_only)
	{
		table_close(rel, AccessShareLock);
		return result;
	}

	/*
	 * Enforce sorting by OID, so that the entries match the result of the
	 * following scan using the OID index.
	 */
	qsort(result, n, sizeof(IndexCatInfo), index_cat_info_compare);

	if (relninds)
		*relninds = n;
	if (n == 0)
	{
		table_close(rel, AccessShareLock);
		return result;
	}

	/*
	 * Now retrieve the corresponding pg_class(xmin) values.
	 *
	 * Here it seems reasonable to construct an array of OIDs of the pg_class
	 * entries of the indexes and use the amsearcharray capability of the OID
	 * index.
	 */
	oids_d = (Datum *) palloc(n * sizeof(Datum));
	for (i = 0; i < n; i++)
		oids_d[i] = ObjectIdGetDatum(result[i].oid);
	get_typlenbyvalalign(OIDOID, &oidlen, &oidbyval, &oidalign);
	oids_a = construct_array(oids_d, n, OIDOID, oidlen, oidbyval, oidalign);
	pfree(oids_d);

	ScanKeyInit(&key[0], Anum_pg_class_oid, BTEqualStrategyNumber, F_OIDEQ,
				PointerGetDatum(oids_a));
	key[0].sk_flags |= SK_SEARCHARRAY;
	scan = systable_beginscan(rel, ClassOidIndexId, true, NULL, 1, key);
	i = 0;
	mismatch = false;
	while ((tuple = systable_getnext(scan)) != NULL)
	{
		IndexCatInfo *res_item;
		Form_pg_class form_class;
		char	   *namestr;

		if (i == n)
		{
			/*
			 * Index added concurrently?
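			 * (That is, the scan returned more pg_class tuples than the
			 * number of pg_index entries we collected above.)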
*/ mismatch = true; break; } res_item = &result[i++]; res_item->pg_class_xmin = HeapTupleHeaderGetXmin(tuple->t_data); form_class = (Form_pg_class) GETSTRUCT(tuple); namestr = NameStr(form_class->relname); Assert(strlen(namestr) < NAMEDATALEN); strcpy(NameStr(res_item->relname), namestr); res_item->reltablespace = form_class->reltablespace; } if (i < n) mismatch = true; if (mismatch) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), errmsg("Concurrent change of index detected"))); systable_endscan(scan); table_close(rel, AccessShareLock); pfree(oids_a); return result; } /* * Compare the passed catalog information to the info retrieved using the most * recent catalog snapshot. Perform the cheapest checks first, the trickier * ones later. * * lock_held is the *least* mode of the lock held by caller on stat->relid * relation since the last check. This information helps to avoid unnecessary * checks. * * We check neither constraint nor trigger related DDLs. Since all the * concurrent changes we receive from replication slot must have been subject * to those constraints / triggers, the transient relation does not need them, * and therefore no incompatibility can arise. We only need to make sure that * the storage is "compatible", i.e. no column and no index was added / * altered / dropped, and no heap rewriting took place. * * Unlike get_catalog_state(), fresh catalog snapshot is used for each catalog * scan. That might increase the chance a little bit that concurrent change * will be detected in the current call, instead of the following one. * * (As long as we use xmin columns of the catalog tables to detect changes, we * can't use syscache here.) * * XXX It's worth checking AlterTableGetLockLevel() each time we adopt a new * version of PG core. */ void check_catalog_changes(CatalogState *state, LOCKMODE lock_held) { /* * No DDL should be compatible with this lock mode. (Not sure if this * condition will ever fire.) */ if (lock_held == AccessExclusiveLock) return; /* * First the source relation itself. * * Only AccessExclusiveLock guarantees that the pg_class entry hasn't * changed. By lowering this threshold we'd perhaps skip unnecessary check * sometimes (e.g. change of pg_class(relhastriggers) is unimportant), but * we could also miss the check when necessary. It's simply too fragile to * deduce the kind of DDL from lock level, so do this check * unconditionally. */ check_pg_class_changes(state); /* * Index change does not necessarily require lock of the parent relation, * so check indexes unconditionally. */ check_index_changes(state); /* * XXX If any lock level lower than AccessExclusiveLock conflicts with all * commands that change pg_attribute catalog, skip this check if lock_held * is at least that level. */ check_attribute_changes(state); /* * Finally check if any composite type used by the source relation has * changed. */ if (state->ncomptypes > 0) check_composite_type_changes(state); } static void check_pg_class_changes(CatalogState *cat_state) { TransactionId xmin_current; get_pg_class_info(cat_state->rel.relid, &xmin_current, NULL, NULL); /* * Check if pg_class(xmin) has changed. * * The changes caught here include change of pg_class(relfilenode), which * indicates heap rewriting or TRUNCATE command (or concurrent call of * squeeze_table(), but that should fail to allocate new replication * slot). (Invalid relfilenode does not change, but mapped relations are * excluded from processing by get_catalog_state().) 
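	 * (For instance, VACUUM FULL and CLUSTER assign a new relfilenode, and
	 * ALTER TABLE ... SET ... updates the pg_class row; in either case xmin
	 * changes and the squeeze is aborted.)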
	 */
	if (!TransactionIdEquals(xmin_current, cat_state->rel.xmin))
		/* XXX Does a more suitable error code exist? */
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_IN_USE),
				 errmsg("Incompatible DDL or heap rewrite performed concurrently")));
}

/*
 * Check if any tuple of pg_attribute of the given relation has changed. In
 * addition, if the attribute type is composite, check for its changes too.
 */
static void
check_attribute_changes(CatalogState *cat_state)
{
	TransactionId *attrs_new;
	int			i;

	/*
	 * Since pg_class should have been checked by now, relnatts can only be
	 * zero if it was zero originally, so there's no info to be compared to
	 * the current state.
	 */
	if (cat_state->rel.relnatts == 0)
	{
		Assert(cat_state->rel.attr_xmins == NULL);
		return;
	}

	/*
	 * Check if any row of pg_attribute changed.
	 *
	 * If the underlying type is composite, pg_attribute(xmin) will not
	 * reflect its change, so pass NULL for cat_state to indicate that we're
	 * not interested in type info at the moment. We'll do that later if all
	 * the cheaper tests pass.
	 */
	get_attribute_info(cat_state->rel.relid, cat_state->rel.relnatts,
					   &attrs_new, NULL);
	for (i = 0; i < cat_state->rel.relnatts; i++)
	{
		if (!TransactionIdEquals(cat_state->rel.attr_xmins[i], attrs_new[i]))
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_IN_USE),
					 errmsg("Table definition changed concurrently")));
	}
	pfree(attrs_new);
}

static void
check_index_changes(CatalogState *cat_state)
{
	IndexCatInfo *inds_new;
	int			relninds_new;
	bool		failed = false;
	bool		invalid_index;
	bool		have_pk_index;

	if (cat_state->relninds == 0)
	{
		Assert(cat_state->indexes == NULL);
		return;
	}

	inds_new = get_index_info(cat_state->rel.relid, &relninds_new,
							  &invalid_index, false, &have_pk_index);

	/*
	 * If this field was set to true, no attention was paid to the other
	 * fields during catalog scans.
	 */
	if (invalid_index)
		failed = true;

	if (!failed && relninds_new != cat_state->relninds)
		failed = true;

	/*
	 * It might be o.k. for the PK index to disappear if the table still has
	 * a unique constraint, but this is too hard to check.
	 */
	if (!failed && cat_state->have_pk_index != have_pk_index)
		failed = true;

	if (!failed)
	{
		int			i;

		for (i = 0; i < cat_state->relninds; i++)
		{
			IndexCatInfo *ind,
					   *ind_new;

			ind = &cat_state->indexes[i];
			ind_new = &inds_new[i];
			if (ind->oid != ind_new->oid ||
				!TransactionIdEquals(ind->xmin, ind_new->xmin) ||
				!TransactionIdEquals(ind->pg_class_xmin,
									 ind_new->pg_class_xmin))
			{
				failed = true;
				break;
			}
		}
	}
	if (failed)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_IN_USE),
				 errmsg("Concurrent change of index detected")));
	pfree(inds_new);
}

static void
check_composite_type_changes(CatalogState *cat_state)
{
	int			i;
	TypeCatInfo *changed = NULL;

	for (i = 0; i < cat_state->ncomptypes; i++)
	{
		TypeCatInfo *tinfo = &cat_state->comptypes[i];
		TypeCatInfo tinfo_new;
		int			j;

		tinfo_new.oid = tinfo->oid;
		get_composite_type_info(&tinfo_new);

		if (!TransactionIdEquals(tinfo->xmin, tinfo_new.xmin) ||
			!TransactionIdEquals(tinfo->rel.xmin, tinfo_new.rel.xmin) ||
			(tinfo->rel.relnatts != tinfo_new.rel.relnatts))
		{
			changed = tinfo;
			break;
		}

		/*
		 * Check the individual attributes of the type relation.
		 *
		 * This should catch the ALTER TYPE ... ALTER ATTRIBUTE ... change of
		 * attribute data type, which is currently not allowed if the type is
		 * referenced by any table. Nevertheless, do the check in this
		 * generic way so that we don't have to care whether any PG
		 * restrictions are relaxed in the future.
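		 * (A change of the attribute count, i.e. ALTER TYPE ... ADD / DROP
		 * ATTRIBUTE, was already caught by the relnatts comparison above.)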
		 */
		for (j = 0; j < tinfo->rel.relnatts; j++)
		{
			if (!TransactionIdEquals(tinfo->rel.attr_xmins[j],
									 tinfo_new.rel.attr_xmins[j]))
			{
				changed = tinfo;
				break;
			}
		}

		if (tinfo_new.rel.relnatts > 0)
		{
			Assert(tinfo_new.rel.attr_xmins != NULL);
			pfree(tinfo_new.rel.attr_xmins);
		}

		if (changed != NULL)
			break;
	}

	if (changed != NULL)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_IN_USE),
				 errmsg("Concurrent change of composite type %u detected",
						changed->oid)));
}

static void
free_catalog_state(CatalogState *state)
{
	if (state->form_class)
		pfree(state->form_class);

	if (state->desc_class)
		pfree(state->desc_class);

	if (state->rel.attr_xmins)
		pfree(state->rel.attr_xmins);

	if (state->indexes)
		pfree(state->indexes);

	if (state->comptypes)
	{
		int			i;

		for (i = 0; i < state->ncomptypes; i++)
		{
			TypeCatInfo *tinfo = &state->comptypes[i];

			if (tinfo->rel.attr_xmins)
				pfree(tinfo->rel.attr_xmins);
		}
		pfree(state->comptypes);
	}
	pfree(state);
}

static void
resolve_index_tablepaces(TablespaceInfo *tbsp_info, CatalogState *cat_state,
						 ArrayType *ind_tbsp_a)
{
	int		   *dims,
			   *lb;
	int			i,
				ndim;
	int16		elmlen;
	bool		elmbyval;
	char		elmalign;
	Datum	   *elements;
	bool	   *nulls;
	int			nelems,
				nentries;

	/* The CREATE FUNCTION statement should ensure this. */
	Assert(ARR_ELEMTYPE(ind_tbsp_a) == NAMEOID);

	if ((ndim = ARR_NDIM(ind_tbsp_a)) != 2)
		ereport(ERROR,
				(errcode(ERRCODE_DATATYPE_MISMATCH),
				 errmsg("Index-to-tablespace mappings must be a text[][] array")));

	dims = ARR_DIMS(ind_tbsp_a);
	if (dims[1] != 2)
		ereport(ERROR,
				(errcode(ERRCODE_DATATYPE_MISMATCH),
				 errmsg("The index-to-tablespace mappings must have 2 columns")));

	lb = ARR_LBOUND(ind_tbsp_a);
	for (i = 0; i < ndim; i++)
		if (lb[i] != 1)
			ereport(ERROR,
					(errcode(ERRCODE_DATATYPE_MISMATCH),
					 errmsg("Each dimension of the index-to-tablespace mappings must start at 1")));

	get_typlenbyvalalign(NAMEOID, &elmlen, &elmbyval, &elmalign);
	deconstruct_array(ind_tbsp_a, NAMEOID, elmlen, elmbyval, elmalign,
					  &elements, &nulls, &nelems);
	Assert(nelems % 2 == 0);

	for (i = 0; i < nelems; i++)
		if (nulls[i])
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("The index-to-tablespace array must not contain NULLs")));

	/* Do the actual processing. */
	nentries = nelems / 2;
	tbsp_info->indexes = (IndexTablespace *)
		palloc(nentries * sizeof(IndexTablespace));
	Assert(tbsp_info->nindexes == 0);

	for (i = 0; i < nentries; i++)
	{
		char	   *indname,
				   *tbspname;
		int			j;
		Oid			ind_oid,
					tbsp_oid;
		IndexTablespace *ind_ts;

		/* Find the OID of the index. */
		indname = NameStr(*DatumGetName(elements[2 * i]));
		ind_oid = InvalidOid;
		for (j = 0; j < cat_state->relninds; j++)
		{
			IndexCatInfo *ind_cat;

			ind_cat = &cat_state->indexes[j];
			if (strcmp(NameStr(ind_cat->relname), indname) == 0)
			{
				ind_oid = ind_cat->oid;
				break;
			}
		}
		if (!OidIsValid(ind_oid))
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("Table has no index \"%s\"", indname)));

		/* Duplicate entries are not expected in the input array. */
		for (j = 0; j < tbsp_info->nindexes; j++)
		{
			ind_ts = &tbsp_info->indexes[j];
			if (ind_ts->index == ind_oid)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("Duplicate tablespace mapping for index \"%s\"",
								indname)));
		}

		/* Look up the tablespace. Fail if it does not exist. */
		tbspname = NameStr(*DatumGetName(elements[2 * i + 1]));
		tbsp_oid = get_tablespace_oid(tbspname, false);

		/*
		 * Add the new mapping entry to the array.
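		 * For illustration: given a mapping like {{"idx1", "tbsp1"}}
		 * (hypothetical names), this entry later makes
		 * build_transient_indexes() create the transient counterpart of
		 * "idx1" in the tablespace "tbsp1".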
		 */
		ind_ts = &tbsp_info->indexes[tbsp_info->nindexes++];
		ind_ts->index = ind_oid;
		ind_ts->tablespace = tbsp_oid;
	}
	pfree(elements);
	pfree(nulls);
}

static void
free_tablespace_info(TablespaceInfo *tbsp_info)
{
	if (tbsp_info->indexes != NULL)
		pfree(tbsp_info->indexes);
	pfree(tbsp_info);
}

/*
 * Use the snap_hist snapshot to get the relevant data from rel_src and
 * insert it into rel_dst.
 *
 * Caller is responsible for opening and locking both relations.
 */
static void
perform_initial_load(Relation rel_src, RangeVar *cluster_idx_rv,
					 Snapshot snap_hist, Relation rel_dst,
					 LogicalDecodingContext *ctx)
{
	bool		use_sort;
	int			batch_size,
				batch_max_size;
	Size		tuple_array_size;
	bool		tuple_array_can_expand = true;
	Tuplesortstate *tuplesort = NULL;
	Relation	cluster_idx = NULL;
	TableScanDesc heap_scan = NULL;
	TupleTableSlot *slot;
	IndexScanDesc index_scan = NULL;
	HeapTuple  *tuples = NULL;
	ResourceOwner res_owner_old,
				res_owner_plan;
	BulkInsertState bistate;
	MemoryContext load_cxt,
				old_cxt;
	XLogRecPtr	end_of_wal_prev = InvalidXLogRecPtr;
	DecodingOutputState *dstate;
	bool		has_dropped_attr;
	Datum		values[MaxTupleAttributeNumber];
	bool		isnull[MaxTupleAttributeNumber];

	/*
	 * Also remember that the WAL records created during the load should not
	 * be decoded later.
	 */
	dstate = (DecodingOutputState *) ctx->output_writer_private;
	dstate->rorigin = replorigin_session_origin;

	if (cluster_idx_rv != NULL)
	{
		cluster_idx = relation_openrv(cluster_idx_rv, AccessShareLock);

		/*
		 * Use the cluster.c API to check if the index can be used for
		 * clustering.
		 */
#if PG_VERSION_NUM >= 150000
		check_index_is_clusterable(rel_src, RelationGetRelid(cluster_idx),
								   NoLock);
#else
		check_index_is_clusterable(rel_src, RelationGetRelid(cluster_idx),
								   false, NoLock);
#endif

		/*
		 * Decide whether index scan or explicit sort should be used.
		 *
		 * Caller does not expect to see any additional locks, so use a
		 * separate resource owner to keep track of them.
		 */
		res_owner_old = CurrentResourceOwner;
		res_owner_plan = ResourceOwnerCreate(res_owner_old,
											 "use_sort owner");
		CurrentResourceOwner = res_owner_plan;
		use_sort = plan_cluster_use_sort(rel_src->rd_id, cluster_idx->rd_id);

		/*
		 * Now use the special resource owner to release those planner locks.
		 * In fact this owner may also hold other resources that the planner
		 * has allocated. Release them all, to avoid a leak.
		 */
		ResourceOwnerRelease(CurrentResourceOwner,
							 RESOURCE_RELEASE_BEFORE_LOCKS, false, false);
		ResourceOwnerRelease(CurrentResourceOwner,
							 RESOURCE_RELEASE_LOCKS, false, false);
		ResourceOwnerRelease(CurrentResourceOwner,
							 RESOURCE_RELEASE_AFTER_LOCKS, false, false);

		/* Cleanup. */
		CurrentResourceOwner = res_owner_old;
		ResourceOwnerDelete(res_owner_plan);
	}
	else
		use_sort = false;

	if (use_sort || cluster_idx == NULL)
		heap_scan = table_beginscan(rel_src, snap_hist, 0, (ScanKey) NULL);
	else
	{
		index_scan = index_beginscan(rel_src, cluster_idx, snap_hist, 0, 0);
		index_rescan(index_scan, NULL, 0, NULL, 0);
	}
	slot = table_slot_create(rel_src, NULL);

	if (use_sort)
		tuplesort = tuplesort_begin_cluster(RelationGetDescr(rel_src),
											cluster_idx,
											maintenance_work_mem, NULL,
											false);

	/*
	 * If tuplesort is not applicable, we store as much data as we can in
	 * memory. The more memory is available, the fewer iterations.
	 */
	if (!use_sort)
	{
		batch_max_size = 1024;
		tuple_array_size = batch_max_size * sizeof(HeapTuple);
		/* The minimum value of maintenance_work_mem is 1024 kB. */
		Assert(tuple_array_size / 1024 < maintenance_work_mem);
		tuples = (HeapTuple *) palloc(tuple_array_size);
	}

	/*
	 * Expect many insertions.
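	 * GetBulkInsertState() gives us a BAS_BULKWRITE buffer access strategy
	 * (a private ring of buffers) and keeps the current target page pinned
	 * across heap_insert() calls, so the load should not evict most of the
	 * shared buffers content.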
	 */
	bistate = GetBulkInsertState();

	/* Does the relation have at least one dropped attribute? */
	has_dropped_attr = has_dropped_attribute(rel_src);

	/*
	 * The processing can take many iterations. In case any data manipulation
	 * below leaks memory, try to defend against out-of-memory conditions by
	 * using a separate memory context.
	 */
	load_cxt = AllocSetContextCreate(CurrentMemoryContext,
									 "pg_squeeze initial load cxt",
									 ALLOCSET_DEFAULT_SIZES);
	old_cxt = MemoryContextSwitchTo(load_cxt);

	while (true)
	{
		HeapTuple	tup_in = NULL;
		int			i;
		Size		data_size = 0;
		XLogRecPtr	end_of_wal;

		/* Sorting cannot be split into batches. */
		for (i = 0;; i++)
		{
			bool		have_tup_copy = false;

			/*
			 * While tuplesort is responsible for not exceeding
			 * maintenance_work_mem itself, we must check if the tuple array
			 * does.
			 *
			 * Since the tuple cannot be put back to the scan, it'd make
			 * things tricky if we involved the current tuple in the
			 * computation. Since the unit of maintenance_work_mem is kB, one
			 * extra tuple shouldn't hurt too much.
			 */
			if (!use_sort &&
				((data_size + tuple_array_size) / 1024) >=
				maintenance_work_mem)
			{
				/*
				 * data_size should still be zero if tup_in is the first item
				 * of the current batch and the array itself should never
				 * exceed maintenance_work_mem. XXX If the condition above is
				 * changed to include the current tuple (i.e. we put the
				 * current tuple aside for the next batch), make sure the
				 * first tuple of a batch is inserted regardless of its size.
				 * We cannot shrink the array in favor of actual data in the
				 * generic case (i.e. tuple size can in general be bigger
				 * than maintenance_work_mem).
				 */
				Assert(i > 0);
				break;
			}

			/*
			 * Perform the tuple retrieval in the original context so that no
			 * scan data is freed during the cleanup between batches.
			 */
			MemoryContextSwitchTo(old_cxt);
			{
				bool		res;

				if (use_sort || cluster_idx == NULL)
					res = table_scan_getnextslot(heap_scan,
												 ForwardScanDirection, slot);
				else
					res = index_getnext_slot(index_scan,
											 ForwardScanDirection, slot);

				if (res)
				{
					bool		shouldFree;

					tup_in = ExecFetchSlotHeapTuple(slot, false,
													&shouldFree);
					/* TTSOpsBufferHeapTuple has .get_heap_tuple != NULL. */
					Assert(!shouldFree);
				}
				else
					tup_in = NULL;
			}
			MemoryContextSwitchTo(load_cxt);

			/*
			 * Ran out of input data?
			 */
			if (tup_in == NULL)
				break;

			/* Flatten the tuple if needed. */
			if (HeapTupleHasExternal(tup_in))
			{
				tup_in = toast_flatten_tuple(tup_in,
											 RelationGetDescr(rel_src));
				have_tup_copy = true;
			}

			/*
			 * If at least one attribute has been dropped, we need to deform
			 * / form the tuple to make sure that the values of the dropped
			 * attribute(s) are NULL. (Unfortunately we don't know if the
			 * table was already squeezed since the last ALTER TABLE ... DROP
			 * COLUMN ... command.)
			 */
			if (has_dropped_attr)
			{
				HeapTuple	tup_orig = tup_in;
				TupleDesc	tup_desc = RelationGetDescr(rel_src);

				heap_deform_tuple(tup_in, tup_desc, values, isnull);

				for (int j = 0; j < tup_desc->natts; j++)
				{
					if (TupleDescAttr(tup_desc, j)->attisdropped)
						isnull[j] = true;
				}

				tup_in = heap_form_tuple(tup_desc, values, isnull);
				if (have_tup_copy)
					/* tup_in is a flat copy. We do not want two copies. */
					heap_freetuple(tup_orig);
				have_tup_copy = true;
			}

			if (use_sort)
			{
				tuplesort_putheaptuple(tuplesort, tup_in);
				/* tuplesort should have copied the tuple. */
				if (have_tup_copy)
					heap_freetuple(tup_in);
			}
			else
			{
				exit_if_requested();

				/*
				 * Check for a free slot early enough so that the current
				 * tuple can be stored even if the array cannot be
				 * reallocated. Do not try again and again if the tuple array
				 * has reached its maximum size.
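				 * (If expansion fails, tuple_array_can_expand is cleared
				 * below and the batch simply ends when the array is full.)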
				 */
				if (i == (batch_max_size - 1) && tuple_array_can_expand)
				{
					int			batch_max_size_new;
					Size		tuple_array_size_new;

					batch_max_size_new = 2 * batch_max_size;
					tuple_array_size_new = batch_max_size_new *
						sizeof(HeapTuple);

					/*
					 * Besides being of valid size, the new array should
					 * allow for storing some data without exceeding
					 * maintenance_work_mem. Check also batch_max_size_new
					 * for overflow although AllocSizeIsValid() probably
					 * should detect a problem much earlier. XXX Consider
					 * tuning the portion of maintenance_work_mem that the
					 * array can use.
					 */
					if (!AllocSizeIsValid(tuple_array_size_new) ||
						batch_max_size_new < 0 ||
						tuple_array_size_new / 1024 >=
						maintenance_work_mem / 16)
						tuple_array_can_expand = false;

					/*
					 * Only expand the array if the current iteration does
					 * not violate maintenance_work_mem.
					 */
					if (tuple_array_can_expand)
					{
						tuples = (HeapTuple *)
							repalloc(tuples, tuple_array_size_new);

						batch_max_size = batch_max_size_new;
						tuple_array_size = tuple_array_size_new;
					}
				}

				if (!have_tup_copy)
					tup_in = heap_copytuple(tup_in);

				/*
				 * Store the tuple and account for its size.
				 */
				tuples[i] = tup_in;
				data_size += HEAPTUPLESIZE + tup_in->t_len;

				/*
				 * If the tuple array could not be expanded, stop reading for
				 * the current batch.
				 */
				if (i == (batch_max_size - 1))
				{
					/* The current tuple belongs to the current batch. */
					i++;
					break;
				}
			}
		}

		/*
		 * Insert the tuples into the target table.
		 *
		 * check_catalog_changes() shouldn't be necessary as long as the
		 * AccessShareLock we hold on the source relation does not allow
		 * change of table type. (Should ALTER INDEX take place concurrently,
		 * it does not break the heap insertions. In such a case we'll find
		 * out later that we need to terminate processing of the current
		 * table, but it's probably not worth checking each batch.)
		 */
		if (use_sort)
			tuplesort_performsort(tuplesort);
		else
		{
			/*
			 * Has the previous batch processed all the remaining tuples?
			 *
			 * In theory, the counter might end up zero as a result of
			 * overflow. However in practice 'i' should not overflow because
			 * its upper limit is controlled by 'batch_max_size' which is
			 * also of the int data type, and which in turn should not
			 * overflow because a value much lower than INT_MAX will make
			 * AllocSizeIsValid(tuple_array_size_new) return false.
			 */
			if (i == 0)
				break;
		}

		batch_size = i;
		i = 0;
		while (true)
		{
			HeapTuple	tup_out;

			exit_if_requested();

			if (use_sort)
				tup_out = tuplesort_getheaptuple(tuplesort, true);
			else
			{
				if (i == batch_size)
					tup_out = NULL;
				else
					tup_out = tuples[i++];
			}

			if (tup_out == NULL)
				break;

			/*
			 * Insert the tuple into the new table.
			 *
			 * XXX Should this happen outside load_cxt? Currently "bistate"
			 * is a flat object (i.e. it does not point to any memory chunk
			 * that the previous call of heap_insert() might have allocated)
			 * and thus the cleanup between batches should not damage it, but
			 * can't it get more complex in future PG versions?
			 */
			heap_insert(rel_dst, tup_out, GetCurrentCommandId(true), 0,
						bistate);

			/* Update the progress information. */
			SpinLockAcquire(&MyWorkerSlot->mutex);
			MyWorkerSlot->progress.ins_initial += 1;
			SpinLockRelease(&MyWorkerSlot->mutex);

			if (!use_sort)
				pfree(tup_out);
		}

		/*
		 * Reached the end of scan when retrieving data from heap or index?
		 */
		if (tup_in == NULL)
			break;

		/*
		 * Free possibly-leaked memory.
		 */
		MemoryContextReset(load_cxt);

		/*
		 * Decode the WAL produced by the load, as well as by other
		 * transactions, so that the replication slot can advance and WAL
		 * does not pile up. Of course we must not apply the changes until
		 * the initial load has completed.
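		 * (Without this, a long initial load would hold back restart_lsn of
		 * the replication slot, which in turn could force the server to
		 * retain a large amount of WAL.)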
* * Note that the insertions into the new table shouldn't actually be * decoded, they should be filtered out by their origin. */ #if PG_VERSION_NUM >= 150000 end_of_wal = GetFlushRecPtr(NULL); #else end_of_wal = GetFlushRecPtr(); #endif if (end_of_wal > end_of_wal_prev) { MemoryContextSwitchTo(old_cxt); decode_concurrent_changes(ctx, end_of_wal, NULL); MemoryContextSwitchTo(load_cxt); } end_of_wal_prev = end_of_wal; } /* * At whichever stage the loop broke, the historic snapshot should no * longer be active. */ /* Cleanup. */ FreeBulkInsertState(bistate); if (use_sort) tuplesort_end(tuplesort); else pfree(tuples); if (heap_scan != NULL) table_endscan(heap_scan); if (index_scan != NULL) index_endscan(index_scan); ExecDropSingleTupleTableSlot(slot); /* * Unlock the index, but not the relation yet - caller will do so when * appropriate. */ if (cluster_idx != NULL) relation_close(cluster_idx, AccessShareLock); MemoryContextSwitchTo(old_cxt); MemoryContextDelete(load_cxt); elog(DEBUG1, "pg_squeeze: the initial load completed"); } /* * Check if relation has at least one dropped attribute. */ static bool has_dropped_attribute(Relation rel) { TupleDesc tup_desc = RelationGetDescr(rel); for (int i = 0; i < tup_desc->natts; i++) { Form_pg_attribute attr = &tup_desc->attrs[i]; if (attr->attisdropped) return true; } return false; } /* * Create a table into which we'll copy the contents of the source table, as * well as changes of the source table that happened during the copying. At * the end of processing we'll just swap storage of the transient and the * source relation and drop the transient one. * * Return oid of the new relation, which is neither locked nor open. */ static Oid create_transient_table(CatalogState *cat_state, TupleDesc tup_desc, Oid tablespace, Oid relowner) { StringInfo relname; Form_pg_class form_class; HeapTuple tuple; Datum reloptions; bool isnull; Oid toastrelid; Oid result; /* As elsewhere in PG core. */ if (OidIsValid(tablespace) && tablespace != MyDatabaseTableSpace) { AclResult aclresult; /* * squeeze_table() must be executed by superuser because it creates * and drops the replication slot. However it should not be a way to * do things that the table owner is not allowed to. (For indexes we * assume they all have the same owner as the table.) */ #if PG_VERSION_NUM >= 160000 aclresult = object_aclcheck(TableSpaceRelationId, tablespace, relowner, ACL_CREATE); #else aclresult = pg_tablespace_aclcheck(tablespace, relowner, ACL_CREATE); #endif if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, OBJECT_TABLESPACE, get_tablespace_name(tablespace)); } if (tablespace == GLOBALTABLESPACE_OID) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("only shared relations can be placed in pg_global tablespace"))); relname = makeStringInfo(); appendStringInfo(relname, "tmp_%u", cat_state->rel.relid); /* * Constraints are not created because each data change must be committed * in the source table before we see it during initial load or via logical * decoding. * * Values of some arguments (e.g. oidislocal, oidinhcount) are unimportant * since the transient table and its catalog entries will eventually get * dropped. On the other hand, we do not change catalog regarding the * source relation. */ form_class = cat_state->form_class; /* * reloptions must be preserved, so fetch them from the catalog. 
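	 * (These include e.g. fillfactor or the autovacuum_* parameters set via
	 * ALTER TABLE ... SET (...), which the new storage should keep.)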
*/ tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(cat_state->rel.relid)); if (!HeapTupleIsValid(tuple)) elog(ERROR, "cache lookup failed for relation %u", cat_state->rel.relid); reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions, &isnull); Assert(!isnull || reloptions == (Datum) 0); result = heap_create_with_catalog(relname->data, form_class->relnamespace, tablespace, InvalidOid, InvalidOid, InvalidOid, form_class->relowner, form_class->relam, tup_desc, NIL, form_class->relkind, form_class->relpersistence, false, false, ONCOMMIT_NOOP, reloptions, false, false, false, InvalidOid, /* relrewrite */ NULL); Assert(OidIsValid(result)); ReleaseSysCache(tuple); elog(DEBUG1, "pg_squeeze: transient relation created: %u", result); /* Make sure the transient relation is visible. */ CommandCounterIncrement(); /* * See cluster.c:make_new_heap() for details about the supposed * (non)existence of TOAST relation on both source and the transient * relations. */ toastrelid = form_class->reltoastrelid; if (OidIsValid(toastrelid)) { /* keep the existing toast table's reloptions, if any */ tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastrelid)); if (!HeapTupleIsValid(tuple)) elog(ERROR, "cache lookup failed for relation %u", toastrelid); reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions, &isnull); Assert(!isnull || reloptions == (Datum) 0); /* * No lock is needed on the target relation since no other transaction * should be able to see it until our transaction commits. However, * table_open() is eventually called and that would cause assertion * failure if we passed NoLock. We can pass any other lock mode. * * On PG versions: the OIDOldToast argument was added to * NewHeapCreateToastTable() in v14, but the change was back-patched * to the minor releases down to v11. Regarding the value of * OIDOldToast, it does not matter here because the auxiliary table * will eventually be dropped. */ #if (PG_VERSION_NUM >= 140000) || \ (PG_VERSION_NUM < 140000 && PG_VERSION_NUM > 130004) || \ (PG_VERSION_NUM < 130000 && PG_VERSION_NUM > 120008) NewHeapCreateToastTable(result, reloptions, AccessExclusiveLock, InvalidOid); #else NewHeapCreateToastTable(result, reloptions, AccessExclusiveLock); #endif ReleaseSysCache(tuple); /* Make sure the TOAST relation is visible. */ CommandCounterIncrement(); } return result; } /* * Make sure "dst" relation has the same indexes as "src". * * indexes_src is array of existing indexes on the source relation and * nindexes the number of its entries. * * An array of oids of corresponding indexes created on the destination * relation is returned. The order of items does match, so we can use these * arrays to swap index storage. 
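 * (result[i] is the transient counterpart of indexes_src[i].)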
 */
static Oid *
build_transient_indexes(Relation rel_dst, Relation rel_src,
						Oid *indexes_src, int nindexes,
						TablespaceInfo *tbsp_info, CatalogState *cat_state,
						LogicalDecodingContext *ctx)
{
	StringInfo	ind_name;
	int			i;
	Oid		   *result;
	XLogRecPtr	end_of_wal_prev = InvalidXLogRecPtr;

	Assert(nindexes > 0);

	ind_name = makeStringInfo();
	result = (Oid *) palloc(nindexes * sizeof(Oid));

	for (i = 0; i < nindexes; i++)
	{
		Oid			ind_oid,
					ind_oid_new,
					tbsp_oid;
		Relation	ind;
		IndexInfo  *ind_info;
		int			j,
					heap_col_id;
		List	   *colnames;
		int16		indnatts;
		Oid		   *collations,
				   *opclasses;
		HeapTuple	tup;
		bool		isnull;
		Datum		d;
		oidvector  *oidvec;
		int2vector *int2vec;
		size_t		oid_arr_size;
		size_t		int2_arr_size;
		int16	   *indoptions;
		text	   *reloptions = NULL;
		bits16		flags;
		XLogRecPtr	end_of_wal;
#if PG_VERSION_NUM >= 170000
		Datum	   *opclassOptions;
#endif

		ind_oid = indexes_src[i];
		ind = index_open(ind_oid, AccessShareLock);
		ind_info = BuildIndexInfo(ind);

		/*
		 * Tablespace defaults to the original one, but can be overridden by
		 * tbsp_info.
		 */
		tbsp_oid = InvalidOid;
		for (j = 0; j < tbsp_info->nindexes; j++)
		{
			IndexTablespace *ind_tbsp;

			ind_tbsp = &tbsp_info->indexes[j];
			if (ind_tbsp->index == ind_oid)
			{
				tbsp_oid = ind_tbsp->tablespace;
				break;
			}
		}

		if (tbsp_oid == GLOBALTABLESPACE_OID)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("only shared relations can be placed in pg_global tablespace")));

		if (!OidIsValid(tbsp_oid))
		{
			bool		found = false;

			for (j = 0; j < cat_state->relninds; j++)
			{
				IndexCatInfo *ind_cat;

				ind_cat = &cat_state->indexes[j];
				if (ind_cat->oid == ind_oid)
				{
					tbsp_oid = ind_cat->reltablespace;
					found = true;
					break;
				}
			}

			/*
			 * It's o.k. for tbsp_oid to end up invalid (if the default
			 * tablespace of the database should be used), but the index
			 * shouldn't have disappeared (caller should hold a share lock
			 * on the relation).
			 */
			if (!found)
				elog(ERROR, "Failed to retrieve index tablespace");
		}

		/*
		 * Index names don't really matter, we'll eventually use only their
		 * storage. Just make them unique within the table.
		 */
		resetStringInfo(ind_name);
		appendStringInfo(ind_name, "ind_%d", i);

		flags = 0;
		if (ind->rd_index->indisprimary)
			flags |= INDEX_CREATE_IS_PRIMARY;

		colnames = NIL;
		indnatts = ind->rd_index->indnatts;
		oid_arr_size = sizeof(Oid) * indnatts;
		int2_arr_size = sizeof(int16) * indnatts;

		collations = (Oid *) palloc(oid_arr_size);
		for (j = 0; j < indnatts; j++)
		{
			char	   *colname;

			heap_col_id = ind->rd_index->indkey.values[j];
			if (heap_col_id > 0)
			{
				Form_pg_attribute att;

				/* Normal attribute. */
				att = TupleDescAttr(rel_src->rd_att, heap_col_id - 1);
				colname = pstrdup(NameStr(att->attname));
				collations[j] = att->attcollation;
			}
			else if (heap_col_id == 0)
			{
				HeapTuple	tuple;
				Form_pg_attribute att;

				/*
				 * Expression column is not present in the relcache. What we
				 * need here is an attribute of the *index* relation.
				 */
				tuple = SearchSysCache2(ATTNUM, ObjectIdGetDatum(ind_oid),
										Int16GetDatum(j + 1));
				if (!HeapTupleIsValid(tuple))
					elog(ERROR,
						 "cache lookup failed for attribute %d of relation %u",
						 j + 1, ind_oid);
				att = (Form_pg_attribute) GETSTRUCT(tuple);
				colname = pstrdup(NameStr(att->attname));
				collations[j] = att->attcollation;
				ReleaseSysCache(tuple);
			}
			else
				elog(ERROR, "Unexpected column number: %d", heap_col_id);

			colnames = lappend(colnames, colname);
		}

		/*
		 * Special effort is needed for the variable-length attributes of
		 * Form_pg_index.
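		 * (indclass and indoption are stored past the fixed-size part of
		 * the pg_index tuple, so they cannot be read via the struct
		 * pointer; fetch them with SysCacheGetAttr() instead.)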
*/ tup = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(ind_oid)); if (!HeapTupleIsValid(tup)) elog(ERROR, "cache lookup failed for index %u", ind_oid); d = SysCacheGetAttr(INDEXRELID, tup, Anum_pg_index_indclass, &isnull); Assert(!isnull); oidvec = (oidvector *) DatumGetPointer(d); opclasses = (Oid *) palloc(oid_arr_size); memcpy(opclasses, oidvec->values, oid_arr_size); d = SysCacheGetAttr(INDEXRELID, tup, Anum_pg_index_indoption, &isnull); Assert(!isnull); int2vec = (int2vector *) DatumGetPointer(d); indoptions = (int16 *) palloc(int2_arr_size); memcpy(indoptions, int2vec->values, int2_arr_size); ReleaseSysCache(tup); tup = SearchSysCache1(RELOID, ObjectIdGetDatum(ind_oid)); if (!HeapTupleIsValid(tup)) elog(ERROR, "cache lookup failed for index relation %u", ind_oid); d = SysCacheGetAttr(RELOID, tup, Anum_pg_class_reloptions, &isnull); reloptions = !isnull ? DatumGetTextPCopy(d) : NULL; ReleaseSysCache(tup); #if PG_VERSION_NUM >= 170000 opclassOptions = palloc0(sizeof(Datum) * ind_info->ii_NumIndexAttrs); for (j = 0; j < ind_info->ii_NumIndexAttrs; j++) opclassOptions[j] = get_attoptions(ind_oid, j + 1); #endif /* * Publish information on what we're going to do. This is especially * important if parallel workers are used to build the index. */ debug_query_string = "pg_squeeze index build"; /* * Neither parentIndexRelid nor parentConstraintId needs to be passed * since the new catalog entries (pg_constraint, pg_inherits) will * eventually be dropped. Therefore there's no need to record valid * dependency on parents. */ ind_oid_new = index_create(rel_dst, ind_name->data, InvalidOid, InvalidOid, /* parentIndexRelid */ InvalidOid, /* parentConstraintId */ InvalidOid, ind_info, colnames, ind->rd_rel->relam, tbsp_oid, collations, opclasses, #if PG_VERSION_NUM >= 170000 opclassOptions, #endif indoptions, #if PG_VERSION_NUM >= 170000 /* * stattargets not needed for the transient * index, the value of the source index * will remain (we only swap the storage). */ NULL, #endif PointerGetDatum(reloptions), flags, /* flags */ 0, /* constr_flags */ false, /* allow_system_table_mods */ false, /* is_internal */ NULL /* constraintId */ ); result[i] = ind_oid_new; debug_query_string = NULL; index_close(ind, AccessShareLock); list_free_deep(colnames); pfree(collations); pfree(opclasses); pfree(indoptions); if (reloptions) pfree(reloptions); /* * Like in perform_initial_load(), process some WAL so that the * segment files can be recycled. Unlike the initial load, do not set * replorigin_session_origin because index changes are not decoded * anyway. */ #if PG_VERSION_NUM >= 150000 end_of_wal = GetFlushRecPtr(NULL); #else end_of_wal = GetFlushRecPtr(); #endif if (end_of_wal > end_of_wal_prev) decode_concurrent_changes(ctx, end_of_wal, NULL); end_of_wal_prev = end_of_wal; } return result; } /* * Build scan key to process logical changes. * * Caller must hold at least AccessShareLock on rel_src. 
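 * (The key covers the columns of the identity index; the caller then fills
 * in the values from each decoded UPDATE / DELETE to locate the matching
 * row in the transient table.)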
*/ static ScanKey build_identity_key(Oid ident_idx_oid, Relation rel_src, int *nentries) { Relation ident_idx_rel; Form_pg_index ident_idx; int n, i; ScanKey result; Assert(OidIsValid(ident_idx_oid)); ident_idx_rel = index_open(ident_idx_oid, AccessShareLock); ident_idx = ident_idx_rel->rd_index; n = ident_idx->indnatts; result = (ScanKey) palloc(sizeof(ScanKeyData) * n); for (i = 0; i < n; i++) { ScanKey entry; int16 relattno; Form_pg_attribute att; Oid opfamily, opcintype, opno, opcode; entry = &result[i]; relattno = ident_idx->indkey.values[i]; if (relattno >= 1) { TupleDesc desc; desc = rel_src->rd_att; att = TupleDescAttr(desc, relattno - 1); } else elog(ERROR, "Unexpected attribute number %d in index", relattno); opfamily = ident_idx_rel->rd_opfamily[i]; opcintype = ident_idx_rel->rd_opcintype[i]; opno = get_opfamily_member(opfamily, opcintype, opcintype, BTEqualStrategyNumber); if (!OidIsValid(opno)) elog(ERROR, "Failed to find = operator for type %u", opcintype); opcode = get_opcode(opno); if (!OidIsValid(opcode)) elog(ERROR, "Failed to find = operator for operator %u", opno); /* Initialize everything but argument. */ ScanKeyInit(entry, i + 1, BTEqualStrategyNumber, opcode, (Datum) NULL); entry->sk_collation = att->attcollation; } index_close(ident_idx_rel, AccessShareLock); *nentries = n; return result; } /* * Try to perform the final processing of concurrent data changes of the * source table, which requires an exclusive lock. The return value tells * whether this step succeeded. (If not, caller might want to retry.) */ static bool perform_final_merge(Oid relid_src, Oid *indexes_src, int nindexes, Relation rel_dst, ScanKey ident_key, int ident_key_nentries, IndexInsertState *iistate, CatalogState *cat_state, LogicalDecodingContext *ctx) { bool success; XLogRecPtr xlog_insert_ptr, end_of_wal; int i; struct timeval t_end; struct timeval *t_end_ptr = NULL; char dummy_rec_data = '\0'; /* * Lock the source table exclusively last time, to finalize the work. * * On pg_repack: before taking the exclusive lock, pg_repack extension is * more restrictive in waiting for other transactions to complete. That * might reduce the likelihood of MVCC-unsafe behavior that PG core admits * in some cases * (https://www.postgresql.org/docs/9.6/static/mvcc-caveats.html) but * can't completely avoid it anyway. On the other hand, pg_squeeze only * waits for completion of transactions which performed write (i.e. do * have XID assigned) - this is a side effect of bringing our replication * slot into consistent state. * * As pg_repack shows, extra effort makes little sense here, because some * other transactions still can start before the exclusive lock on the * source relation is acquired. In particular, if transaction A starts in * this period and commits a change, transaction B can miss it if the next * steps are as follows: 1. transaction B took a snapshot (e.g. it has * REPEATABLE READ isolation level), 2. pg_repack took the exclusive * relation lock and finished its work, 3. transaction B acquired shared * lock and performed its scan. (And of course, waiting for transactions * A, B, ... to complete while holding the exclusive lock can cause * deadlocks.) */ LockRelationOid(relid_src, AccessExclusiveLock); /* * Lock the indexes too, as ALTER INDEX does not need table lock. * * The locking will succeed even if the index is no longer there. In that * case, ERROR will be raised during the catalog check below. 
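	 * (LockRelationOid() operates on the bare OID and does not itself
	 * verify that the relation still exists.)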
	 */
	for (i = 0; i < nindexes; i++)
		LockRelationOid(indexes_src[i], AccessExclusiveLock);

	if (squeeze_max_xlock_time > 0)
	{
		int64		usec;
		struct timeval t_start;

		gettimeofday(&t_start, NULL);
		/* Add the whole seconds. */
		t_end.tv_sec = t_start.tv_sec + squeeze_max_xlock_time / 1000;
		/* Add the rest, expressed in microseconds. */
		usec = t_start.tv_usec + 1000 * (squeeze_max_xlock_time % 1000);
		/* The number of microseconds could have overflown. */
		t_end.tv_sec += usec / USECS_PER_SEC;
		t_end.tv_usec = usec % USECS_PER_SEC;
		t_end_ptr = &t_end;

		elog(DEBUG1,
			 "pg_squeeze: completion required by %lu.%lu, current time is %lu.%lu.",
			 t_end_ptr->tv_sec, t_end_ptr->tv_usec, t_start.tv_sec,
			 t_start.tv_usec);
	}

	/*
	 * Check the source relation for DDLs once again. If this check passes,
	 * no DDL can break the process anymore. NoLock must be passed because
	 * the relation was really unlocked for some period since the last check.
	 *
	 * It makes sense to do this immediately after having acquired the
	 * exclusive lock(s), so we don't waste any effort if the source table is
	 * no longer compatible.
	 */
	check_catalog_changes(cat_state, NoLock);

	/*
	 * Flush anything we see in WAL, to make sure that all changes committed
	 * while we were creating indexes and waiting for the exclusive lock are
	 * available for decoding. This should not be necessary if all backends
	 * had synchronous_commit set, but we can't rely on this setting.
	 *
	 * Unfortunately, GetInsertRecPtr() may lag behind the actual insert
	 * position, and GetLastImportantRecPtr() points at the start of the last
	 * record rather than at the end. Thus the simplest way to determine the
	 * insert position is to insert a dummy record and use its LSN.
	 *
	 * XXX Consider using GetLastImportantRecPtr() and adding the size of the
	 * last record (plus the total size of all the page headers the record
	 * spans)?
	 */
	XLogBeginInsert();
	XLogRegisterData(&dummy_rec_data, 1);
	xlog_insert_ptr = XLogInsert(RM_XLOG_ID, XLOG_NOOP);
	XLogFlush(xlog_insert_ptr);
#if PG_VERSION_NUM >= 150000
	end_of_wal = GetFlushRecPtr(NULL);
#else
	end_of_wal = GetFlushRecPtr();
#endif

	/*
	 * Process the changes that might have taken place while we were waiting
	 * for the lock.
	 *
	 * AccessExclusiveLock effectively disables catalog checks - we've
	 * already performed them above.
	 */
	success = process_concurrent_changes(ctx, end_of_wal, cat_state,
										 rel_dst, ident_key,
										 ident_key_nentries, iistate,
										 AccessExclusiveLock, t_end_ptr);

	if (t_end_ptr)
	{
		struct timeval t_now;

		gettimeofday(&t_now, NULL);
		elog(DEBUG1,
			 "pg_squeeze: concurrent changes processed at %lu.%lu, result: %u",
			 t_now.tv_sec, t_now.tv_usec, success);
	}

	if (!success)
	{
		/* Unlock the relations and indexes. */
		for (i = 0; i < nindexes; i++)
			UnlockRelationOid(indexes_src[i], AccessExclusiveLock);

		UnlockRelationOid(relid_src, AccessExclusiveLock);

		/*
		 * Take the time we need to process the changes up to end_of_wal.
		 *
		 * XXX DecodingOutputState may contain some changes. The corner case
		 * where data_size has already reached maintenance_work_mem, so that
		 * the first change we decode now will make it spill to disk, is too
		 * unlikely to justify calling apply_concurrent_changes() separately.
		 */
		process_concurrent_changes(ctx, end_of_wal, cat_state,
								   rel_dst, ident_key, ident_key_nentries,
								   iistate, AccessExclusiveLock, NULL);

		/* No time constraint, all changes must have been processed. */
		Assert(((DecodingOutputState *)
				ctx->output_writer_private)->nchanges == 0);
	}

	return success;
}

/*
 * Derived from swap_relation_files() in PG core, but removed anything we
 * don't need.
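 * (The core function is static in cluster.c, so we cannot call it directly
 * anyway.)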
 * Also incorporated the relevant parts of finish_heap_swap().
 *
 * Caution: r1 is the relation to remain, r2 is the one to be dropped.
 *
 * XXX Unlike PG core, we currently receive neither frozenXid nor cutoffMulti
 * arguments. Instead we only copy these fields from r2 to r1. This should
 * change if we perform a regular rewrite instead of INSERT INTO ... SELECT
 * ...
 */
static void
swap_relation_files(Oid r1, Oid r2)
{
	Relation	relRelation;
	HeapTuple	reltup1,
				reltup2;
	Form_pg_class relform1,
				relform2;
	Oid			relfilenode1,
				relfilenode2;
	Oid			swaptemp;
	CatalogIndexState indstate;

	/* We need writable copies of both pg_class tuples. */
	relRelation = table_open(RelationRelationId, RowExclusiveLock);

	reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
	if (!HeapTupleIsValid(reltup1))
		elog(ERROR, "cache lookup failed for relation %u", r1);
	relform1 = (Form_pg_class) GETSTRUCT(reltup1);

	reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
	if (!HeapTupleIsValid(reltup2))
		elog(ERROR, "cache lookup failed for relation %u", r2);
	relform2 = (Form_pg_class) GETSTRUCT(reltup2);

	relfilenode1 = relform1->relfilenode;
	relfilenode2 = relform2->relfilenode;

	if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
	{
		swaptemp = relform1->relfilenode;
		relform1->relfilenode = relform2->relfilenode;
		relform2->relfilenode = swaptemp;

		swaptemp = relform1->reltablespace;
		relform1->reltablespace = relform2->reltablespace;
		relform2->reltablespace = swaptemp;

		Assert(relform1->relpersistence == relform2->relpersistence);
		Assert(relform1->relam == relform2->relam);

		swaptemp = relform1->reltoastrelid;
		relform1->reltoastrelid = relform2->reltoastrelid;
		relform2->reltoastrelid = swaptemp;
	}
	else
		elog(ERROR, "cannot swap mapped relations");

	/*
	 * Set rel1's frozen Xid and minimum MultiXid.
	 */
	if (relform1->relkind != RELKIND_INDEX)
	{
		TransactionId frozenXid;
		MultiXactId cutoffMulti;

		frozenXid = RecentXmin;
		Assert(TransactionIdIsNormal(frozenXid));

		/*
		 * Unlike the CLUSTER command (see copy_heap_data()), we don't derive
		 * the new value from any freeze-related configuration parameters, so
		 * there should be no way to see the value go backwards.
		 */
		Assert(!TransactionIdPrecedes(frozenXid, relform2->relfrozenxid));
		relform1->relfrozenxid = frozenXid;

		cutoffMulti = GetOldestMultiXactId();
		Assert(MultiXactIdIsValid(cutoffMulti));
		Assert(!MultiXactIdPrecedes(cutoffMulti, relform2->relminmxid));
		relform1->relminmxid = cutoffMulti;
	}

	/*
	 * Adjust pg_class fields of the relation (relform2 can be ignored as the
	 * transient relation will get dropped.)
	 *
	 * There's no reason to expect relallvisible to be non-zero. The next
	 * VACUUM should fix it.
	 *
	 * As for relpages and reltuples, neither includes concurrent changes
	 * (are those worth any calculation?), so leave the original values. The
	 * next ANALYZE will fix them.
	 */
	relform1->relallvisible = 0;

	indstate = CatalogOpenIndexes(relRelation);
	CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1,
							   indstate);
	CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2,
							   indstate);
	CatalogCloseIndexes(indstate);

	InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0, InvalidOid,
								 true);
	InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0, InvalidOid,
								 true);

	if (relform1->reltoastrelid || relform2->reltoastrelid)
	{
		ObjectAddress baseobject,
					toastobject;
		long		count;

		if (IsSystemClass(r1, relform1))
			elog(ERROR, "cannot swap toast files by links for system catalogs");

		if (relform1->reltoastrelid)
		{
			count = deleteDependencyRecordsFor(RelationRelationId,
											   relform1->reltoastrelid,
											   false);
			if (count != 1)
				elog(ERROR, "expected one dependency record for TOAST table, found %ld",
					 count);
		}
		if (relform2->reltoastrelid)
		{
			count = deleteDependencyRecordsFor(RelationRelationId,
											   relform2->reltoastrelid,
											   false);
			if (count != 1)
				elog(ERROR, "expected one dependency record for TOAST table, found %ld",
					 count);
		}

		baseobject.classId = RelationRelationId;
		baseobject.objectSubId = 0;
		toastobject.classId = RelationRelationId;
		toastobject.objectSubId = 0;

		if (relform1->reltoastrelid)
		{
			baseobject.objectId = r1;
			toastobject.objectId = relform1->reltoastrelid;
			recordDependencyOn(&toastobject, &baseobject,
							   DEPENDENCY_INTERNAL);
		}

		if (relform2->reltoastrelid)
		{
			baseobject.objectId = r2;
			toastobject.objectId = relform2->reltoastrelid;
			recordDependencyOn(&toastobject, &baseobject,
							   DEPENDENCY_INTERNAL);
		}
	}

	heap_freetuple(reltup1);
	heap_freetuple(reltup2);

	table_close(relRelation, RowExclusiveLock);

#if PG_VERSION_NUM < 170000
	RelationCloseSmgrByOid(r1);
	RelationCloseSmgrByOid(r2);
#endif
}

/*
 * Swap TOAST relation names if needed.
 *
 * relid1 represents the relation that stays and toastrelid1 its TOAST
 * relation. relid2 and toastrelid2 refer to the transient relation in the
 * same manner.
 *
 * The storage of the TOAST tables and their indexes has already been
 * swapped.
 *
 * On exit we hold AccessExclusiveLock on the TOAST relations and their
 * indexes.
 */
static void
swap_toast_names(Oid relid1, Oid toastrelid1, Oid relid2, Oid toastrelid2)
{
	char		name[NAMEDATALEN];
	Oid			toastidxid;

	/*
	 * If relid1 no longer needs TOAST, we don't even rename that of relid2.
	 */
	if (!OidIsValid(toastrelid1))
		return;

	if (OidIsValid(toastrelid2))
	{
		/*
		 * The added underscore should be enough to keep names unique (at
		 * least within the pg_toast namespace). This assumption makes name
		 * retrieval unnecessary.
		 */
		snprintf(name, NAMEDATALEN, "pg_toast_%u_", relid1);
		RenameRelationInternal(toastrelid2, name, true, false);

		snprintf(name, NAMEDATALEN, "pg_toast_%u_index_", relid1);
#if PG_VERSION_NUM < 130000
		/* NoLock as RenameRelationInternal() did not release its lock. */
		toastidxid = get_toast_index(toastrelid2);
#else
		/* The TOAST relation is locked, but not its indexes. */
		toastidxid = toast_get_valid_index(toastrelid2, AccessExclusiveLock);
#endif

		/*
		 * Pass is_index=false so that even the index is locked in
		 * AccessExclusiveLock mode. ShareUpdateExclusiveLock mode (allowing
		 * concurrent read / write access to the index or even its renaming)
		 * should not be a problem at this stage of table squeezing, but it'd
		 * also bring little benefit (the table is locked exclusively, so no
		 * one should need read / write access to the TOAST indexes).
		 */
		RenameRelationInternal(toastidxid, name, true, false);
		CommandCounterIncrement();
	}

	/*
	 * Now set the desired names on the TOAST stuff of relid1.
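	 * (The transient names with the trailing underscore have just freed the
	 * canonical names, so these renames should not collide.)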
*/ snprintf(name, NAMEDATALEN, "pg_toast_%u", relid1); RenameRelationInternal(toastrelid1, name, true, false); #if PG_VERSION_NUM < 130000 /* NoLock as RenameRelationInternal() did not release its lock. */ toastidxid = get_toast_index(toastrelid1); #else /* TOAST relation is locked, but not its indexes. */ toastidxid = toast_get_valid_index(toastrelid1, AccessExclusiveLock); #endif snprintf(name, NAMEDATALEN, "pg_toast_%u_index", relid1); RenameRelationInternal(toastidxid, name, true, false); CommandCounterIncrement(); } #if PG_VERSION_NUM < 130000 /* * The function is called after RenameRelationInternal() which does not * release the lock it acquired. */ static Oid get_toast_index(Oid toastrelid) { Relation toastrel; List *toastidxs; Oid result; toastrel = table_open(toastrelid, NoLock); toastidxs = RelationGetIndexList(toastrel); if (toastidxs == NIL || list_length(toastidxs) != 1) elog(ERROR, "Unexpected number of TOAST indexes"); result = linitial_oid(toastidxs); table_close(toastrel, NoLock); return result; } #endif /* * Retrieve the "fillfactor" storage option in a convenient way, so we don't * have to parse pg_class(reloptions) value at SQL level. */ extern Datum get_heap_fillfactor(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(get_heap_fillfactor); Datum get_heap_fillfactor(PG_FUNCTION_ARGS) { Oid relid; Relation rel; int fillfactor; relid = PG_GETARG_OID(0); /* * XXX Not sure we need stronger lock - there are still occasions for * others to change the fillfactor (or even drop the relation) after this * function has returned. */ rel = table_open(relid, AccessShareLock); fillfactor = RelationGetFillFactor(rel, HEAP_DEFAULT_FILLFACTOR); table_close(rel, AccessShareLock); PG_RETURN_INT32(fillfactor); } /* * Return fraction of free space in a relation, as indicated by FSM. */ extern Datum get_heap_freespace(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(get_heap_freespace); Datum get_heap_freespace(PG_FUNCTION_ARGS) { Oid relid; Relation rel; BlockNumber blkno, nblocks; Size free, total; float8 result; bool fsm_exists = true; relid = PG_GETARG_OID(0); rel = table_open(relid, AccessShareLock); nblocks = RelationGetNumberOfBlocks(rel); /* NULL makes more sense than zero free space. */ if (nblocks == 0) { table_close(rel, AccessShareLock); PG_RETURN_NULL(); } free = 0; total = 0; for (blkno = 0; blkno < nblocks; blkno++) { free += GetRecordedFreeSpace(rel, blkno); total += BLCKSZ; } /* * If the relation seems to be full, verify that missing FSM is not the * reason. */ if (free == 0) { #if PG_VERSION_NUM >= 150000 if (!smgrexists(RelationGetSmgr(rel), FSM_FORKNUM)) fsm_exists = false; #else RelationOpenSmgr(rel); if (!smgrexists(rel->rd_smgr, FSM_FORKNUM)) fsm_exists = false; #endif RelationCloseSmgr(rel); } table_close(rel, AccessShareLock); if (!fsm_exists) PG_RETURN_NULL(); result = (float8) free / total; PG_RETURN_FLOAT8(result); } /* * Handle an error from the perspective of postgres */ void squeeze_handle_error_db(ErrorData **edata_p, MemoryContext edata_cxt) { MemoryContext old_context = CurrentMemoryContext; HOLD_INTERRUPTS(); /* Save error info in caller's context */ MemoryContextSwitchTo(edata_cxt); *edata_p = CopyErrorData(); MemoryContextSwitchTo(old_context); /* * Send the message to the process that assigned the task. */ strlcpy(MyWorkerTask->error_msg, (*edata_p)->message, ERROR_MESSAGE_MAX_SIZE); /* * Abort the transaction as we do not call PG_RE_THROW() below in this * case. 
*/ if (IsTransactionState()) AbortOutOfAnyTransaction(); /* * Now that the transaction is aborted, we can run a new one to drop the * origin. */ if (replorigin_session_origin != InvalidRepOriginId) manage_session_origin(InvalidOid); /* * Special effort is needed to release the replication slot because, * unlike other resources, AbortTransaction() does not release it. */ if (MyReplicationSlot != NULL) ReplicationSlotRelease(); RESUME_INTERRUPTS(); } /* * If 'relid' is valid, create replication origin and set the * replorigin_session_origin variable. If 'relid' is InvalidOid, drop the * origin created earlier and clear replorigin_session_origin. * * (The origin is used here to mark WAL records produced by the extension, * rather than for real replication.) */ void manage_session_origin(Oid relid) { static Oid my_relid = InvalidOid; char origin_name[NAMEDATALEN]; Oid origin; snprintf(origin_name, sizeof(origin_name), REPLORIGIN_NAME_PATTERN, MyDatabaseId, OidIsValid(relid) ? relid : my_relid); StartTransactionCommand(); if (OidIsValid(relid)) { origin = replorigin_create(origin_name); /* * As long as we set replorigin_session_origin below, we should setup * the session state because both RecordTransactionCommit() and * RecordTransactionAbort() do expect that. */ #if PG_VERSION_NUM >= 160000 replorigin_session_setup(origin, 0); #else replorigin_session_setup(origin); #endif Assert(replorigin_session_origin == InvalidRepOriginId); replorigin_session_origin = origin; Assert(!OidIsValid(my_relid)); my_relid = relid; } else { replorigin_session_reset(); #if PG_VERSION_NUM >= 140000 replorigin_drop_by_name(origin_name, false, true); #else replorigin_drop(replorigin_session_origin, false); #endif replorigin_session_origin = InvalidRepOriginId; Assert(OidIsValid(my_relid)); my_relid = InvalidOid; } CommitTransactionCommand(); } pg_squeeze-REL1_8_0/pg_squeeze.control000066400000000000000000000002701474466764000201660ustar00rootroot00000000000000# pg_squeeze extension comment = 'A tool to remove unused space from a relation.' default_version = '1.8' module_pathname = '$libdir/pg_squeeze' relocatable = false schema = 'squeeze' pg_squeeze-REL1_8_0/pg_squeeze.h000066400000000000000000000316271474466764000167470ustar00rootroot00000000000000/*----------------------------------------------------- * * pg_squeeze.h * A tool to eliminate table bloat. * * Copyright (c) 2016-2024, CYBERTEC PostgreSQL International GmbH * *----------------------------------------------------- */ #include #include "c.h" #include "postgres.h" #include "fmgr.h" #include "miscadmin.h" #include "access/genam.h" #include "access/heapam.h" #include "access/relscan.h" #if PG_VERSION_NUM < 130000 #include "access/tuptoaster.h" #endif #include "access/xact.h" #if PG_VERSION_NUM >= 130000 #include "access/xlogutils.h" #endif #include "access/xlog_internal.h" #include "catalog/pg_class.h" #include "nodes/execnodes.h" #include "postmaster/bgworker.h" #include "replication/logical.h" #if PG_VERSION_NUM < 130000 #include "replication/logicalfuncs.h" #endif #include "replication/origin.h" #include "storage/ipc.h" #include "utils/array.h" #include "utils/inval.h" #include "utils/resowner.h" #include "utils/snapmgr.h" /* * No underscore, names starting with "pg_" are reserved. See * pg_replication_origin_create(). 
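 *
 * The resulting names look like "pgsqueeze_<dboid>_<reloid>", e.g.
 * "pgsqueeze_16384_24576" (hypothetical OIDs).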
*/ #define REPLORIGIN_NAME_PREFIX "pgsqueeze_" #define REPLORIGIN_NAME_PATTERN REPLORIGIN_NAME_PREFIX "%u_%u" extern int squeeze_max_xlock_time; typedef enum { PG_SQUEEZE_CHANGE_INSERT, PG_SQUEEZE_CHANGE_UPDATE_OLD, PG_SQUEEZE_CHANGE_UPDATE_NEW, PG_SQUEEZE_CHANGE_DELETE } ConcurrentChangeKind; typedef struct ConcurrentChange { /* See the enum above. */ ConcurrentChangeKind kind; /* * The actual tuple. * * The tuple data follows the ConcurrentChange structure. Before use make * sure the tuple is correctly aligned (ConcurrentChange can be stored as * bytea) and that tuple->t_data is fixed. */ HeapTupleData tup_data; } ConcurrentChange; typedef struct DecodingOutputState { /* The relation whose changes we're decoding. */ Oid relid; /* * Decoded changes are stored here. Although we try to avoid excessive * batches, it can happen that the changes need to be stored to disk. The * tuplestore does this transparently. */ Tuplestorestate *tstore; /* The current number of changes in tstore. */ double nchanges; /* * Descriptor to store the ConcurrentChange structure serialized (bytea). * We can't store the tuple directly because tuplestore only supports * minimum tuple and we may need to transfer OID system column from the * output plugin. Also we need to transfer the change kind, so it's better * to put everything in the structure than to use 2 tuplestores "in * parallel". */ TupleDesc tupdesc_change; /* Tuple descriptor needed to update indexes. */ TupleDesc tupdesc; /* Slot to retrieve data from tstore. */ TupleTableSlot *tsslot; /* * WAL records having this origin have been created by the initial load * and should not be decoded. */ RepOriginId rorigin; ResourceOwner resowner; } DecodingOutputState; /* The WAL segment being decoded. */ extern XLogSegNo squeeze_current_segment; extern void _PG_init(void); /* Everything we need to call ExecInsertIndexTuples(). */ typedef struct IndexInsertState { ResultRelInfo *rri; EState *estate; ExprContext *econtext; Relation ident_index; } IndexInsertState; /* * Subset of fields of pg_class, plus the necessary info on attributes. It * represents either the source relation or a composite type of the source * relation's attribute. */ typedef struct PgClassCatInfo { /* pg_class(oid) */ Oid relid; /* * pg_class(xmin) */ TransactionId xmin; /* Array of pg_attribute(xmin). (Dropped columns are here too.) */ TransactionId *attr_xmins; int16 relnatts; } PgClassCatInfo; /* * Information on source relation index, used to build the index on the * transient relation. To avoid repeated retrieval of the pg_index fields we * also add pg_class(xmin) and pass the same structure to * check_catalog_changes(). */ typedef struct IndexCatInfo { Oid oid; /* pg_index(indexrelid) */ NameData relname; /* pg_class(relname) */ Oid reltablespace; /* pg_class(reltablespace) */ TransactionId xmin; /* pg_index(xmin) */ TransactionId pg_class_xmin; /* pg_class(xmin) of the index (not the * parent relation) */ } IndexCatInfo; /* * If the source relation has attribute(s) of composite type, we need to check * for changes of those types. */ typedef struct TypeCatInfo { Oid oid; /* pg_type(oid) */ TransactionId xmin; /* pg_type(xmin) */ /* * The pg_class entry whose oid == pg_type(typrelid) of this type. */ PgClassCatInfo rel; } TypeCatInfo; /* * Information to check whether an "incompatible" catalog change took * place. Such a change prevents us from completing processing of the current * table. */ typedef struct CatalogState { /* The relation whose changes we'll check for. 
*/ PgClassCatInfo rel; /* Copy of pg_class tuple of the source relation. */ Form_pg_class form_class; /* Copy of pg_class tuple descriptor of the source relation. */ TupleDesc desc_class; /* Per-index info. */ int relninds; IndexCatInfo *indexes; /* Composite types used by the source rel attributes. */ TypeCatInfo *comptypes; /* Size of the array. */ int ncomptypes_max; /* Used elements of the array. */ int ncomptypes; /* * Does at least one index have wrong value of indisvalid, indisready or * indislive? */ bool invalid_index; /* Does the table have primary key index? */ bool have_pk_index; } CatalogState; extern void check_catalog_changes(CatalogState *state, LOCKMODE lock_held); extern IndexInsertState *get_index_insert_state(Relation relation, Oid ident_index_id); extern void free_index_insert_state(IndexInsertState *iistate); extern bool process_concurrent_changes(LogicalDecodingContext *ctx, XLogRecPtr end_of_wal, CatalogState *cat_state, Relation rel_dst, ScanKey ident_key, int ident_key_nentries, IndexInsertState *iistate, LOCKMODE lock_held, struct timeval *must_complete); extern bool decode_concurrent_changes(LogicalDecodingContext *ctx, XLogRecPtr end_of_wal, struct timeval *must_complete); extern void _PG_output_plugin_init(OutputPluginCallbacks *cb); extern int squeeze_workers_per_database; /* * Connection information the squeeze worker needs to connect to database if * starting automatically. Strings are more convenient for admin than OIDs and * we have no chance to lookup OIDs in the catalog when registering worker * during postmaster startup. That's why we pass strings. * * The structure is allocated in TopMemoryContext during postmaster startup, * so the worker should access it correctly if it receives pointer from the * bgw_main_arg field of BackgroundWorker. * * Unlike WorkerConInteractive, this is currently used only for the scheduler * worker. */ typedef struct WorkerConInit { char *dbname; char *rolename; } WorkerConInit; /* * The same for interactive start of the worker. In this case we can no longer * add anything to the TopMemoryContext of postmaster, so * BackgroundWorker.bgw_extra is the only way to pass the information. As we * have OIDs at this stage, the structure is small enough to fit bgw_extra * field of BackgroundWorker. */ typedef struct WorkerConInteractive { Oid dbid; Oid roleid; bool scheduler; int task_idx; } WorkerConInteractive; /* Progress tracking. */ typedef struct WorkerProgress { /* Tuples inserted during the initial load. */ int64 ins_initial; /* * Tuples inserted, updated and deleted after the initial load (i.e. * during the catch-up phase). */ int64 ins; int64 upd; int64 del; } WorkerProgress; /* * Shared memory structures to keep track of the status of squeeze workers. */ typedef struct WorkerSlot { Oid dbid; /* database the worker is connected to */ Oid relid; /* relation the worker is working on */ int pid; /* the PID */ bool scheduler; /* true if scheduler, false if the "squeeze * worker" */ WorkerProgress progress; /* progress tracking information */ /* * Use this when setting / clearing the fields above. * * Note that, when setting, workerData->lock in exclusive mode must be * held in addition. This is to ensure the maximum number of workers per * database is not exceeded when multiple workers search for a slot * concurrently. On the other hand, the spinlock is sufficient to clear * the fields. * * Note that we use MemSet() to reset 'progress', which is hopefully * o.k. to do under spinlock. 
XXX Consider using atomics for the * 'progress' counters rather than the spinlock. In theory, the absence of * spinlock could allow the new worker to see the values not yet cleared * by the old worker (or cleared after the new worker already had * increased the counters), but not sure if this a serious issue. * (Likewise: is it a problem if the monitoring functions get an * inconsistent view of the counters?) */ slock_t mutex; } WorkerSlot; /* * Information on a replication slot that we pass to squeeze workers. */ typedef struct ReplSlotStatus { /* Slot name */ NameData name; /* A copy of the same field of ReplicationSlotPersistentData. */ XLogRecPtr confirmed_flush; /* * Shared memory to pass the initial snapshot to the worker. Only needed * by the scheduler. */ dsm_handle snap_handle; dsm_segment *snap_seg; /* The snapshot in the squeeze worker private memory. */ char *snap_private; } ReplSlotStatus; /* Life cycle of the task from the perspective of the worker. */ typedef enum { WTS_UNUSED, /* processing not yet requested by backend or * scheduler worker */ WTS_INIT, /* processing requested but task not yet picked by a * worker */ WTS_IN_PROGRESS, /* worker is working on the task */ } WorkerTaskState; /* * This structure represents a task assigned to the worker via shared memory. */ typedef struct WorkerTask { /* * State of the task. * * The "requester" (i.e. regular backend or squeeze scheduler) sets the * state to WTS_INIT, the scheduler worker then sets it to WTS_IN_PROGRESS * and eventually to WTS_UNUSED. However, in order to avoid leak, the * requester can set it to WTS_UNUSED too, if it's sure that the worker * failed to start. */ WorkerTaskState worker_state; /* See the comments of exit_if_requested(). */ bool exit_requested; /* * Use this when setting / clearing the fields above. * * Note that, when setting "worker_state" to WTS_INIT, workerData->lock * must be held in exclusive mode in addition. This is because, when * "allocating" the task using this status, we need to check if no other * task exists for the same database and relation. */ slock_t mutex; /* * Details of the task. * * Only the requester should change these fields, after he has "allocated" * the task by setting the state to WTS_INIT. Therefore, no locking is * required, except for "dbid", "relschema" and "relname" - those require * workerData->lock in exclusive mode because they are used to check task * uniqueness (i.e. no more than one worker per table). */ Oid dbid; NameData relschema; NameData relname; NameData indname; /* clustering index */ NameData tbspname; /* destination tablespace */ int max_xlock_time; /* * Fields of the squeeze.tasks table. * * task_id can be -1 if there is no corresponding record in the * squeeze.table. In any case, it must be set as soon as we set * ->assigned, before ->mutex is released. */ int task_id; bool last_try; bool skip_analyze; /* * Index destination tablespaces. * * text[][] array is stored here. The space should only be used by the * interactive squeeze_table() function, which is only there for testing * and troubleshooting purposes. If the array doesn't fit here, the user * needs to use the regular UI (ie register the table for squeezing and * insert a record into the "tasks" table). 
*/ #define IND_TABLESPACES_ARRAY_SIZE 1024 char ind_tbsps[IND_TABLESPACES_ARRAY_SIZE]; ReplSlotStatus repl_slot; #define ERROR_MESSAGE_MAX_SIZE 1024 char error_msg[ERROR_MESSAGE_MAX_SIZE]; } WorkerTask; extern WorkerSlot *MyWorkerSlot; extern WorkerTask *MyWorkerTask; extern WorkerConInit *allocate_worker_con_info(char *dbname, char *rolename); extern void squeeze_initialize_bgworker(BackgroundWorker *worker, WorkerConInit *con_init, WorkerConInteractive *con_interactive, pid_t notify_pid); #if PG_VERSION_NUM >= 150000 extern void squeeze_save_prev_shmem_request_hook(void); #endif extern void squeeze_worker_shmem_request(void); extern void squeeze_save_prev_shmem_startup_hook(void); extern void squeeze_worker_shmem_startup(void); extern PGDLLEXPORT void squeeze_worker_main(Datum main_arg); extern void exit_if_requested(void); extern bool squeeze_table_impl(Name relschema, Name relname, Name indname, Name tbspname, ArrayType *ind_tbsp, ErrorData **edata_p, MemoryContext edata_cxt); extern void squeeze_handle_error_db(ErrorData **edata_p, MemoryContext edata_cxt); extern void manage_session_origin(Oid relid); pg_squeeze-REL1_8_0/pg_squeeze.md000077700000000000000000000000001474466764000203612README.mdustar00rootroot00000000000000pg_squeeze-REL1_8_0/pgstatapprox.c000066400000000000000000000213501474466764000173170ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * pgstatapprox.c * Bloat estimation functions * * Copyright (c) 2014-2023, PostgreSQL Global Development Group * * Copyright (c) 2016-2024, CYBERTEC PostgreSQL International GmbH * * IDENTIFICATION * contrib/pgstattuple/pgstatapprox.c * *------------------------------------------------------------------------- */ /* * The initial version has been copied from contrib/pgstattuple in PostgreSQL * source tree. We could ask users to install the "pgstattuple" extension, but * then we'd have to handle dependencies between various versions of * "pgstattuple" and our extension. On the other hand, the copy gives us the * possibility to do changes. */ #include "postgres.h" #include "pg_squeeze.h" #include "access/heapam.h" #include "access/visibilitymap.h" #include "access/transam.h" #include "access/xact.h" #include "access/multixact.h" #include "access/htup_details.h" #include "catalog/pg_am.h" #include "catalog/namespace.h" #include "funcapi.h" #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/procarray.h" #include "storage/lmgr.h" #include "utils/builtins.h" #include "commands/vacuum.h" PG_FUNCTION_INFO_V1(squeeze_pgstattuple_approx); typedef struct output_type { uint64 table_len; double scanned_percent; uint64 tuple_count; uint64 tuple_len; double tuple_percent; uint64 dead_tuple_count; uint64 dead_tuple_len; double dead_tuple_percent; uint64 free_space; double free_percent; } output_type; #define NUM_OUTPUT_COLUMNS 10 /* * This function takes an already open relation and scans its pages, * skipping those that have the corresponding visibility map bit set. * For pages we skip, we find the free space from the free space map * and approximate tuple_len on that basis. For the others, we count * the exact number of dead tuples etc. * * This scan is loosely based on vacuumlazy.c:lazy_scan_heap(), but * we do not try to avoid skipping single pages. 
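 *
 * For an all-visible page we only consult the free space map: e.g. with
 * BLCKSZ = 8192 and 1024 bytes of recorded free space, the page
 * contributes 7168 bytes to tuple_len and 1024 bytes to free_space
 * without being read at all.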
*/ static void statapprox_heap(Relation rel, output_type *stat) { BlockNumber scanned, nblocks, blkno; Buffer vmbuffer = InvalidBuffer; BufferAccessStrategy bstrategy; TransactionId OldestXmin; #if PG_VERSION_NUM >= 140000 OldestXmin = GetOldestNonRemovableTransactionId(rel); #else OldestXmin = GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM); #endif bstrategy = GetAccessStrategy(BAS_BULKREAD); nblocks = RelationGetNumberOfBlocks(rel); scanned = 0; for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; Size freespace; CHECK_FOR_INTERRUPTS(); /* * If the page has only visible tuples, then we can find out the free * space from the FSM and move on. */ if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer)) { freespace = GetRecordedFreeSpace(rel, blkno); stat->tuple_len += BLCKSZ - freespace; stat->free_space += freespace; continue; } buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buf, BUFFER_LOCK_SHARE); page = BufferGetPage(buf); /* * It's not safe to call PageGetHeapFreeSpace() on new pages, so we * treat them as being free space for our purposes. */ if (!PageIsNew(page)) stat->free_space += PageGetHeapFreeSpace(page); else stat->free_space += BLCKSZ - SizeOfPageHeaderData; /* We may count the page as scanned even if it's new/empty */ scanned++; if (PageIsNew(page) || PageIsEmpty(page)) { UnlockReleaseBuffer(buf); continue; } /* * Look at each tuple on the page and decide whether it's live or * dead, then count it and its size. Unlike lazy_scan_heap, we can * afford to ignore problems and special cases. */ maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; HeapTupleData tuple; itemid = PageGetItemId(page, offnum); if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid) || ItemIdIsDead(itemid)) { continue; } Assert(ItemIdIsNormal(itemid)); ItemPointerSet(&(tuple.t_self), blkno, offnum); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tuple.t_tableOid = RelationGetRelid(rel); /* * We follow VACUUM's lead in counting INSERT_IN_PROGRESS tuples * as "dead" while DELETE_IN_PROGRESS tuples are "live". We don't * bother distinguishing tuples inserted/deleted by our own * transaction. */ switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) { case HEAPTUPLE_LIVE: case HEAPTUPLE_DELETE_IN_PROGRESS: stat->tuple_len += tuple.t_len; stat->tuple_count++; break; case HEAPTUPLE_DEAD: case HEAPTUPLE_RECENTLY_DEAD: case HEAPTUPLE_INSERT_IN_PROGRESS: stat->dead_tuple_len += tuple.t_len; stat->dead_tuple_count++; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } } UnlockReleaseBuffer(buf); } stat->table_len = (uint64) nblocks * BLCKSZ; /* * We don't know how many tuples are in the pages we didn't scan, so * extrapolate the live-tuple count to the whole table in the same way * that VACUUM does. (Like VACUUM, we're not taking a random sample, so * just extrapolating linearly seems unsafe.) There should be no dead * tuples in all-visible pages, so no correction is needed for that, and * we already accounted for the space in those pages, too. */ stat->tuple_count = vac_estimate_reltuples(rel, nblocks, scanned, stat->tuple_count); /* It's not clear if we could get -1 here, but be safe. */ stat->tuple_count = Max(stat->tuple_count, 0); /* * Calculate percentages if the relation has one or more pages. 
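 * scanned_percent relates the scanned pages to nblocks, while the
 * byte-based percentages are divided by table_len = nblocks * BLCKSZ;
 * e.g. scanning 400 of 1000 pages yields scanned_percent = 40.0.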
*/ if (nblocks != 0) { stat->scanned_percent = 100.0 * scanned / nblocks; stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len; stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len; stat->free_percent = 100.0 * stat->free_space / stat->table_len; } if (BufferIsValid(vmbuffer)) { ReleaseBuffer(vmbuffer); vmbuffer = InvalidBuffer; } } /* * Returns estimated live/dead tuple statistics for the given relid. */ Datum squeeze_pgstattuple_approx(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); Relation rel; output_type stat = {0}; TupleDesc tupdesc; bool nulls[NUM_OUTPUT_COLUMNS]; Datum values[NUM_OUTPUT_COLUMNS]; HeapTuple ret; int i = 0; if (!superuser() && !has_rolreplication(GetUserId())) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser or replication role to run this function")))); if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); if (tupdesc->natts != NUM_OUTPUT_COLUMNS) elog(ERROR, "incorrect number of output arguments"); rel = relation_open(relid, AccessShareLock); /* * Reject attempts to read non-local temporary relations; we would be * likely to get wrong data since we have no visibility into the owning * session's local buffers. */ if (RELATION_IS_OTHER_TEMP(rel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary tables of other sessions"))); /* * We support only relation kinds with a visibility map and a free space * map. */ if (!(rel->rd_rel->relkind == RELKIND_RELATION || rel->rd_rel->relkind == RELKIND_MATVIEW || rel->rd_rel->relkind == RELKIND_TOASTVALUE)) ereport(ERROR, #if PG_VERSION_NUM >= 150000 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("relation \"%s\" is of wrong relation kind", RelationGetRelationName(rel)), errdetail_relkind_not_supported(rel->rd_rel->relkind))); #else (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("\"%s\" is not a table, materialized view, or TOAST table", RelationGetRelationName(rel)))); #endif if (rel->rd_rel->relam != HEAP_TABLE_AM_OID) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("only heap AM is supported"))); statapprox_heap(rel, &stat); relation_close(rel, AccessShareLock); memset(nulls, 0, sizeof(nulls)); values[i++] = Int64GetDatum(stat.table_len); values[i++] = Float8GetDatum(stat.scanned_percent); values[i++] = Int64GetDatum(stat.tuple_count); values[i++] = Int64GetDatum(stat.tuple_len); values[i++] = Float8GetDatum(stat.tuple_percent); values[i++] = Int64GetDatum(stat.dead_tuple_count); values[i++] = Int64GetDatum(stat.dead_tuple_len); values[i++] = Float8GetDatum(stat.dead_tuple_percent); values[i++] = Int64GetDatum(stat.free_space); values[i++] = Float8GetDatum(stat.free_percent); ret = heap_form_tuple(tupdesc, values, nulls); return HeapTupleGetDatum(ret); } pg_squeeze-REL1_8_0/sql/000077500000000000000000000000001474466764000152155ustar00rootroot00000000000000pg_squeeze-REL1_8_0/sql/squeeze.sql000066400000000000000000000015621474466764000174230ustar00rootroot00000000000000CREATE EXTENSION pg_squeeze; CREATE TABLE a(i int PRIMARY KEY, j int); INSERT INTO a(i, j) SELECT x, x FROM generate_series(1, 10) AS g(x); -- The trivial case. SELECT squeeze.squeeze_table('public', 'a', NULL); SELECT * FROM a; -- Clustering by index. CREATE INDEX a_i_idx_desc ON a(i DESC); SELECT squeeze.squeeze_table('public', 'a', 'a_i_idx_desc'); SELECT * FROM a; -- Involve TOAST. 
CREATE TABLE b(i int PRIMARY KEY, t text); INSERT INTO b(i, t) SELECT x, repeat(x::text, 1024) FROM generate_series(1, 10) AS g(x) GROUP BY x; SELECT reltoastrelid > 0 FROM pg_class WHERE relname='b'; -- Copy the data into another table so we can check later. CREATE TABLE b_copy (LIKE b INCLUDING ALL); INSERT INTO b_copy(i, t) SELECT i, t FROM b; -- Squeeze. SELECT squeeze.squeeze_table('public', 'b', NULL); -- Compare. SELECT b.t = b_copy.t FROM b, b_copy WHERE b.i = b_copy.i; pg_squeeze-REL1_8_0/test/000077500000000000000000000000001474466764000153755ustar00rootroot00000000000000pg_squeeze-REL1_8_0/test/concurrency000077500000000000000000000351631474466764000176650ustar00rootroot00000000000000#!/usr/bin/python3 # # -*- coding: utf-8 -*- # # Copyright (c) 2016-2023, CYBERTEC PostgreSQL International GmbH # # Test the squeeze_table() function while concurrent data changes. To verify # that all the concurrent changes have been captured and processed, run the # test once again w/o the squeeze_table() function and compare results. # # If the test should run for long time and if user suspects the tables can # grow beyond disk space available, he can pass reasonable value of # --test-duration and use --test-iterations option to ensure that the whole # test (from table creation to verification of results) runs many times. import argparse import psycopg import random import sys import threading import time from threading import Thread, Timer parser = argparse.ArgumentParser() parser.add_argument("--host", default="localhost", help="Database server host") parser.add_argument("--port", default="5432", help="Database server port") parser.add_argument("--database", default="postgres", help="The test database name") parser.add_argument("--user", default="postgres", help="The user that connects to the test database") parser.add_argument("--test-duration", type=int, default=5, help="Test duration in seconds") parser.add_argument("--test-iterations", type=int, default=1, help="How many times should the test be executed") parser.add_argument("--no-verification", action="store_true", help="Sikp verification of result, i.e. only test stability") args = parser.parse_args() test_succeeded = True def get_connection(): return psycopg.connect(host=args.host, port=args.port, dbname=args.database, user=args.user) def start_single_test(): d = globals() # All threads should check this variable and stop as soon as it becomes # True. d['test_done'] = False # Instead of letting the timer wait for the whole test time, we start it # many times for a 1-second interval so that user can interrupt the # application. d['timer_executions'] = 0 # Since the test interval is one second, the number of executions is the # test duration in seconds d['timer_max_executions'] = args.test_duration def stop_single_test(success): d = globals() d['test_done'] = True if not success: d['test_succeeded'] = False # Stop if the required number of executions elapsed. def maybe_stop_single_test(): d = globals() d['timer_executions'] = d['timer_executions'] + 1 if timer_executions >= timer_max_executions: stop_single_test(True) def check(cur): # Check the pg_squeeze extension. 
cur.execute("SELECT extversion FROM pg_extension WHERE extname='pg_squeeze'") if cur.rowcount == 0: raise Exception("pg_squeeze is not installed") ext_row = cur.fetchone() class CommandThread(Thread): def __init__(self, cmds, cmds_executed): super(CommandThread, self).__init__() self.cmds = cmds self.cmds_executed = cmds_executed def run(self): try: con = get_connection() # For debuging purposes we might need to print out XID, see below. con.autocommit = False cur = con.cursor() while not test_done: i = random.randint(0, len(self.cmds) - 1) next_cmd = self.cmds[i] cur.execute(next_cmd) con.commit() if self.cmds_executed != None: # Record the command so it can be replayed during # verification. self.cmds_executed.append(i) con.close() except Exception as e: print(e) stop_single_test(False) class SqueezeParams(object): def __init__(self, table, index): self.table = table self.index = index class SqueezeThread(Thread): # params_array is an array of SqueezeParams instances # # delay is the number of seconds to wait before the next squeeze should # start. def __init__(self, params_array, delay): super(SqueezeThread, self).__init__() self.params_array = params_array self.delay = delay self.done = False def run(self): self.con = get_connection() self.con.autocommit = True self.cur = self.con.cursor() while not test_done: timer = Timer(self.delay, self.squeeze) timer.start() timer.join() self.con.close() self.done = True # Each call processes the next item of params_array (round robin). def squeeze(self): params = random.choice(self.params_array) try: ind = "'%s'" % params.index if params.index else "NULL" self.cur.execute("SET maintenance_work_mem='1MB'") self.cur.execute( "SELECT squeeze.squeeze_table('public', '%s', %s)" % (params.table, ind,)) self.cur.execute("SELECT count(*) FROM squeeze.errors") row = self.cur.fetchone() if row[0] > 0: # XXX The failure could be caused by a concurrent call of # squeeze_table(). Nevertheless, it's a reason to stop. raise Exception("squeeze_table() failed") except Exception as e: print(e) stop_single_test(False) # cmds_setup is a list of commands to create the test table and any other # database objects needed. The first item must be CREATE TABLE command, and it # must contain two formatting strings: one to allow insertion of UNLOGGED # keyword and one to insert schema name. # # cmds is a list of commands to be executed in random order. The commands must # be such that new execution in the same order on an empty table produces the # same results as during the first execution. For example, only stable # functions (in terms of pg_proc(provolatile) may be used. # # check_query is an SQL query that compares the data produced by the stability # test to the data produced by replaying the same commands and returns zero if # the sets are identical. class Test(object): def __init__(self, table, cmds_setup, cmds, check_query): self.table = table self.cmds_setup = cmds_setup self.cmds = cmds self.check_query = check_query def setup(self): con = get_connection() con.autocommit = True cur = con.cursor() cur.execute("DROP TABLE IF EXISTS %s" % self.table) first = True for cmd in self.cmds_setup: if first: # CREATE TABLE w/o the UNLOGGED keyword and with "public" as # schema name. cmd = cmd % ("", "public",) first = False cur.execute(cmd) con.close() if args.no_verification: self.cmds_executed = None else: self.cmds_executed = [] def start(self): self.cmd_thread = CommandThread(self.cmds, self.cmds_executed) self.cmd_thread.start() # Execute the test suite once. 
def run_single_test(squeeze_thread): print('Running test...') start_single_test() for test in tests: test.setup() for test in tests: test.start() squeeze_thread.start() while True: if test_done: break timer = Timer(1, maybe_stop_single_test) timer.start() timer.join() verification_schema = "expected" # Run the same SQL statements on each table again, w/o the interference with # squeeze_table(). The resulting tables should be identical. def verify_single_test(squeeze_thread, con, con_vac): print("Verifying results...") cur = con.cursor() cur_vac = con_vac.cursor() # Make sure that the last call of squeeze_table() finished. while not squeeze_thread.done: time.sleep(1) cur.execute("CREATE SCHEMA IF NOT EXISTS %s" % verification_schema) con.commit() # Create tables to execute the queries again. for test in tests: cur.execute("DROP TABLE IF EXISTS expected.%s" % (test.table,)) cur.execute(test.cmds_setup[0] % ("UNLOGGED", verification_schema,)) # We try to run VACUUM FULL when it seems appropriate (see below), so do # not let autovacuum interfere with that effort. cur.execute("ALTER TABLE %s SET (autovacuum_enabled=false)" % test.table) cur.execute("ALTER TABLE %s SET (toast.autovacuum_enabled=false)" % test.table) con.commit() # Replay the commands. Use a separate transaction for each command and # change search_path only within the transaction so that we do not have to # remember the original value of search_path. rows = 0.0 rows_live = 0.0 for i in test.cmds_executed: cmd = test.cmds[i] cur.execute("SET LOCAL search_path TO expected") cur.execute(cmd) if cmd.lower().find('insert') >= 0: rows = rows + cur.rowcount rows_live = rows_live + cur.rowcount elif cmd.lower().find('update') >= 0: # UPDATE does not change the number of live rows, it just adds one # dead row per row updated. rows = rows + cur.rowcount elif cmd.lower().find('delete') >= 0: # DELETE removes a live row, but does not change the total number # of rows. rows_live = rows_live - cur.rowcount con.commit() # The check queries run much faster if the bloat is kept at reasonable # level. 
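    # (I.e. trigger VACUUM FULL once fewer than half of the rows written
    # so far are still live, which keeps the replayed table compact.)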
if rows > 0 and rows_live / rows < 0.5: cur_vac.execute("VACUUM FULL %s" % test.table) rows = rows_live # Compare the tables cur.execute(test.check_query) con.commit() if cur.rowcount == 0: print('Test passed for table "%s"' % test.table) else: print('Found difference for table "%s"' % test.table) con.close() con_vac.close() tests = [ Test(table = "a", cmds_setup = [ "CREATE %s TABLE %s.a(i serial NOT NULL PRIMARY KEY, j int)", ], cmds = [ "INSERT INTO a(j) SELECT g.i FROM generate_series(0, 255) AS g(i)", "UPDATE a SET j = j + 1 WHERE i IN (SELECT i FROM a WHERE i % 2 = 0 ORDER BY i LIMIT 256)", "UPDATE a SET j = j + 1 WHERE i IN (SELECT i FROM a WHERE i % 2 = 1 ORDER BY i LIMIT 256)", "DELETE FROM a WHERE i IN (SELECT i FROM a WHERE i % 2 = 0 ORDER BY i LIMIT 128)", "DELETE FROM a WHERE i IN (SELECT i FROM a WHERE i % 2 = 1 ORDER BY i LIMIT 128)" ], check_query = "SELECT * FROM public.a AS t1 FULL JOIN expected.a AS t2 ON (t1.i, t2.j) = (t2.i, t2.j) WHERE t1.i ISNULL OR t2.i ISNULL") , # TOAST Test(table = "b", cmds_setup = [ "CREATE %s TABLE %s.b(i serial NOT NULL PRIMARY KEY, j text)", "CREATE OR REPLACE FUNCTION public.long_string() RETURNS text LANGUAGE sql AS $$ SELECT string_agg(h.x::text, ' ') FROM generate_series(0, 4095) as h(x);$$" ], cmds = [ "INSERT INTO b(j) SELECT public.long_string() FROM generate_series(0, 16) AS g(i)", "UPDATE b SET j = public.long_string() WHERE i IN (SELECT i FROM b WHERE i % 2 = 0 ORDER BY i LIMIT 256)", "UPDATE b SET j = public.long_string() WHERE i IN (SELECT i FROM b WHERE i % 2 = 1 ORDER BY i LIMIT 256)", "DELETE FROM b WHERE i IN (SELECT i FROM b WHERE i % 2 = 0 ORDER BY i LIMIT 8)", "DELETE FROM b WHERE i IN (SELECT i FROM b WHERE i % 2 = 1 ORDER BY i LIMIT 8)" ], check_query = "SELECT * FROM public.b AS t1 FULL JOIN expected.b AS t2 ON (t1.i, t2.j) = (t2.i, t2.j) WHERE t1.i ISNULL OR t2.i ISNULL") , # Update identity key. Test(table = "c", cmds_setup = [ "CREATE %s TABLE %s.c(i serial NOT NULL PRIMARY KEY, j real)" ], cmds = [ # The values should be sparse so that the UPDATE can increment as # many values as possible. "WITH tmp(i) AS (SELECT max(i) FROM (SELECT i FROM c UNION VALUES (0)) AS s) INSERT INTO c SELECT g.i FROM tmp, generate_series(tmp.i + 1, tmp.i + 256) AS g(i)", # To avoid violation of the primary key, only update those rows # for which i + 1 does not exist. "UPDATE c SET i = i + 1 WHERE i IN (SELECT s1.x FROM (SELECT i, i + 1 FROM c) s1(x, y) LEFT JOIN (SELECT i FROM c) s2(x) ON s1.y = s2.x WHERE s2.x ISNULL)", "DELETE FROM c WHERE i IN (SELECT i FROM c WHERE i % 2 = 0 ORDER BY i LIMIT 128)", "DELETE FROM c WHERE i IN (SELECT i FROM c WHERE i % 2 = 1 ORDER BY i LIMIT 128)", ], check_query = "SELECT * FROM public.c AS t1 FULL JOIN expected.c AS t2 ON t1.i = t2.i WHERE t1.i ISNULL OR t2.i ISNULL") ] con = get_connection() con.autocommit = True cur = con.cursor() try: check(cur) check_ok = True # Prepare for error checking. cur.execute("TRUNCATE TABLE squeeze.errors") # Make sure that the squeeze worker is on. cur.execute("SELECT squeeze.start_worker()") except Exception as e: print(e) check_ok = False finally: con.close() if not check_ok: sys.exit(1) for i in range(args.test_iterations): con = get_connection() con.autocommit = False # An extra connection for VACUUM because it cannot run inside transaction # block. 
con_vac = get_connection() con_vac.autocommit = True squeeze_thread = SqueezeThread( [ SqueezeParams("a", None), SqueezeParams("a", "a_pkey"), SqueezeParams("b", None), SqueezeParams("b", "b_pkey"), SqueezeParams("c", None), SqueezeParams("c", "c_pkey") ], 1.0) try: run_single_test(squeeze_thread) except KeyboardInterrupt as e: # Pay special attention to KeyboardInterrupt because the join() method # of Timer can be interrupted, in which case maybe_stop_single_test # never gets called. print(e) stop_single_test(False) con.close() con_vac.close() if test_succeeded: if not args.no_verification: try: verify_single_test(squeeze_thread, con, con_vac) except Exception as e: print(e) con.close() con_vac.close() sys.exit(1) else: con.close() con_vac.close() sys.exit(1) con = get_connection() cur = con.cursor() cur.execute("SELECT squeeze.stop_worker()") pg_squeeze-REL1_8_0/worker.c000066400000000000000000001767651474466764000161210ustar00rootroot00000000000000/*--------------------------------------------------------- * * worker.c * Background worker to call functions of pg_squeeze.c * * Copyright (c) 2016-2024, CYBERTEC PostgreSQL International GmbH * *--------------------------------------------------------- */ #include "c.h" #include "postgres.h" #include "fmgr.h" #include "funcapi.h" #include "miscadmin.h" #include "pgstat.h" #include "access/xact.h" #include "catalog/pg_extension.h" #include "catalog/pg_type.h" #include "commands/dbcommands.h" #include "executor/spi.h" #include "nodes/makefuncs.h" #include "replication/slot.h" #include "replication/snapbuild.h" #include "storage/latch.h" #include "storage/lock.h" #include "storage/proc.h" #if PG_VERSION_NUM >= 160000 #include "utils/backend_status.h" #endif #include "utils/builtins.h" #include "utils/memutils.h" #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/snapmgr.h" #include "pg_squeeze.h" /* * There are 2 kinds of worker: 1) scheduler, which creates new tasks, 2) the * actual "squeeze worker" which calls the squeeze_table() function. It's * simpler to have a separate worker that checks the schedules every * minute. If there was a single worker that checks the schedules among the * calls of squeeze_table(), it'd be harder to handle the cases where the call * of squeeze_table() took too much time to complete (i.e. the worker could * miss some schedule(s)). */ static bool am_i_scheduler = false; /* * Indicates that the squeeze worker was launched by an user backend (using * the squeeze_table() function), as opposed to the scheduler worker. */ static bool am_i_standalone = false; /* * As long as the number of slots depends on the max_worker_processes GUC (it * just makes sense not to allocate more slots for our workers than this * value), we should not use this GUC before the other libraries have been * loaded: those libraries might, at least in theory, adjust * max_worker_processes. * * In PG >= 15, this function is called from squeeze_worker_shmem_request(), * after all the related GUCs have been set. In earlier versions (which do not * have the hook), the function is called while our library is being loaded, * and some other libraries might follow. Therefore we prefer a compile time * constant to a (possibly) not-yet-finalized GUC. 
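 *
 * (Example of the hazard on PG < 15: if a library loaded after pg_squeeze
 * raised max_worker_processes in its _PG_init(), the value read here
 * during our load would already be stale.)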
*/ static int max_squeeze_workers(void) { #if PG_VERSION_NUM >= 150000 return max_worker_processes; #else #define MAX_SQUEEZE_WORKERS 32 /* * If max_worker_processes appears to be greater than MAX_SQUEEZE_WORKERS, * postmaster can start new processes but squeeze_worker_main() will fail * to find a slot for them, and therefore those extra workers will exit * immediately. */ return MAX_SQUEEZE_WORKERS; #endif } /* * The maximum number of tasks submitted by the scheduler worker or by the * squeeze_table() user function that can be in progress at a time (as long as * there's enough workers). Note that this is cluster-wide constant. * * XXX Should be based on MAX_SQUEEZE_WORKERS? Not sure how to incorporate * scheduler workers in the computation. */ #define NUM_WORKER_TASKS 16 typedef struct WorkerData { WorkerTask tasks[NUM_WORKER_TASKS]; /* * Has cleanup after restart completed? The first worker launched after * server restart should set this flag. */ bool cleanup_done; /* * A lock to synchronize access to slots. Lock in exclusive mode to add / * remove workers, in shared mode to find information on them. * * It's also used to synchronize task creation, so that we don't have more * than one task per table. */ LWLock *lock; int nslots; /* size of the array */ WorkerSlot slots[FLEXIBLE_ARRAY_MEMBER]; } WorkerData; static WorkerData *workerData = NULL; /* Local pointer to the slot in the shared memory. */ WorkerSlot *MyWorkerSlot = NULL; /* Local pointer to the task in the shared memory. */ WorkerTask *MyWorkerTask = NULL; /* * The "squeeze worker" (i.e. one that performs the actual squeezing, as * opposed to the "scheduler worker"). The scheduler worker uses this * structure to keep track of squeeze workers it launched. */ typedef struct SqueezeWorker { BackgroundWorkerHandle *handle; WorkerTask *task; } SqueezeWorker; static SqueezeWorker *squeezeWorkers = NULL; static int squeezeWorkerCount = 0; /* * One slot per worker, but the count is stored separately because cleanup is * also done separately. 
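 *
 * The slots are created by create_replication_slots(), handed to the
 * squeeze workers via WorkerTask.repl_slot and eventually removed by
 * drop_replication_slots(); their names start with the REPL_SLOT_PREFIX
 * defined below.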
*/ static ReplSlotStatus *squeezeWorkerSlots = NULL; static int squeezeWorkerSlotCount = 0; #define REPL_SLOT_PREFIX "pg_squeeze_slot_" #define REPL_PLUGIN_NAME "pg_squeeze" static void interrupt_worker(WorkerTask *task); static void clear_task(WorkerTask *task); static void release_task(WorkerTask *task); static void squeeze_handle_error_app(ErrorData *edata, WorkerTask *task); static WorkerTask *get_unused_task(Oid dbid, char *relschema, char *relname, int *task_idx, bool *duplicate); static void initialize_worker_task(WorkerTask *task, int task_id, Name indname, Name tbspname, ArrayType *ind_tbsps, bool last_try, bool skip_analyze, int max_xlock_time); static bool start_worker_internal(bool scheduler, int task_idx, BackgroundWorkerHandle **handle); static void worker_sighup(SIGNAL_ARGS); static void worker_sigterm(SIGNAL_ARGS); static void scheduler_worker_loop(void); static void cleanup_workers_and_tasks(bool interrupt); static void wait_for_worker_shutdown(SqueezeWorker *worker); static void process_task(void); static void create_replication_slots(int nslots, MemoryContext mcxt); static void drop_replication_slots(void); static void cleanup_after_server_start(void); static void cleanup_repl_origins(void); static void cleanup_repl_slots(void); static Snapshot build_historic_snapshot(SnapBuild *builder); static void process_task_internal(MemoryContext task_cxt); static uint64 run_command(char *command, int rc); static Size worker_shmem_size(void) { Size size; size = offsetof(WorkerData, slots); size = add_size(size, mul_size(max_squeeze_workers(), sizeof(WorkerSlot))); return size; } #if PG_VERSION_NUM >= 150000 static shmem_request_hook_type prev_shmem_request_hook = NULL; void squeeze_save_prev_shmem_request_hook(void) { prev_shmem_request_hook = shmem_request_hook; } #endif /* * The shmem_request_hook hook was introduced in PG 15. In earlier versions we * call it directly from _PG_init(). */ void squeeze_worker_shmem_request(void) { /* With lower PG versions this function is called from _PG_init(). */ #if PG_VERSION_NUM >= 150000 if (prev_shmem_request_hook) prev_shmem_request_hook(); #endif /* PG_VERSION_NUM >= 150000 */ RequestAddinShmemSpace(worker_shmem_size()); RequestNamedLWLockTranche("pg_squeeze", 1); } static shmem_startup_hook_type prev_shmem_startup_hook = NULL; void squeeze_save_prev_shmem_startup_hook(void) { prev_shmem_startup_hook = shmem_startup_hook; } void squeeze_worker_shmem_startup(void) { bool found; if (prev_shmem_startup_hook) prev_shmem_startup_hook(); LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); workerData = ShmemInitStruct("pg_squeeze", worker_shmem_size(), &found); if (!found) { int i; LWLockPadded *locks; locks = GetNamedLWLockTranche("pg_squeeze"); for (i = 0; i < NUM_WORKER_TASKS; i++) { WorkerTask *task; task = &workerData->tasks[i]; SpinLockInit(&task->mutex); clear_task(task); } workerData->lock = &locks->lock; workerData->cleanup_done = false; workerData->nslots = max_squeeze_workers(); for (i = 0; i < workerData->nslots; i++) { WorkerSlot *slot = &workerData->slots[i]; slot->dbid = InvalidOid; slot->relid = InvalidOid; SpinLockInit(&slot->mutex); MemSet(&slot->progress, 0, sizeof(WorkerProgress)); slot->pid = InvalidPid; } } LWLockRelease(AddinShmemInitLock); } /* Mark this worker's slot unused. */ static void worker_shmem_shutdown(int code, Datum arg) { /* exiting before the slot was initialized? */ if (MyWorkerSlot) { /* * Use spinlock to make sure that invalid dbid implies that the * clearing is done. 
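 * (The slot search in squeeze_worker_main() reads dbid under the same
 * spinlock, so it cannot see InvalidOid until all the other fields have
 * been reset as well.)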
*/ SpinLockAcquire(&MyWorkerSlot->mutex); Assert(MyWorkerSlot->dbid != InvalidOid); MyWorkerSlot->dbid = InvalidOid; MyWorkerSlot->relid = InvalidOid; MyWorkerSlot->pid = InvalidPid; MemSet(&MyWorkerSlot->progress, 0, sizeof(WorkerProgress)); SpinLockRelease(&MyWorkerSlot->mutex); /* This shouldn't be necessary, but ... */ MyWorkerSlot = NULL; } if (MyWorkerTask) release_task(MyWorkerTask); if (am_i_scheduler) /* * Cleanup. Here, instead of just waiting for workers to finish, we * ask them to exit as soon as possible. */ cleanup_workers_and_tasks(true); else if (am_i_standalone) /* * Note that the worker launched by the squeeze_table() function needs * to do the cleanup on its own. */ drop_replication_slots(); /* * Release LW locks acquired outside transaction. * * There's at least one such case: when the worker is looking for a slot * in the shared memory - see squeeze_worker_main(). */ LWLockReleaseAll(); } /* * Start the scheduler worker. */ PG_FUNCTION_INFO_V1(squeeze_start_worker); Datum squeeze_start_worker(PG_FUNCTION_ARGS) { if (RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"), errhint("pg_squeeze cannot be used during recovery."))); if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to start squeeze worker")))); start_worker_internal(true, -1, NULL); PG_RETURN_VOID(); } /* * Stop the scheduler worker. */ PG_FUNCTION_INFO_V1(squeeze_stop_worker); Datum squeeze_stop_worker(PG_FUNCTION_ARGS) { int i; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to stop squeeze worker")))); for (i = 0; i < workerData->nslots; i++) { WorkerSlot *slot = &workerData->slots[i]; Oid dbid; bool scheduler; SpinLockAcquire(&slot->mutex); dbid = slot->dbid; scheduler = slot->scheduler; SpinLockRelease(&slot->mutex); if (dbid == MyDatabaseId && scheduler) { kill(slot->pid, SIGTERM); /* * There should only be one scheduler per database. (It'll stop * the squeeze workers it launched.) */ break; } } PG_RETURN_VOID(); } /* * Submit a task for a squeeze worker and wait for its completion. * * This is a replacement for the squeeze_table() function so that pg_squeeze * >= 1.6 can still expose the functionality via the postgres executor. */ extern Datum squeeze_table_new(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(squeeze_table_new); Datum squeeze_table_new(PG_FUNCTION_ARGS) { Name relschema, relname; Name indname = NULL; Name tbspname = NULL; ArrayType *ind_tbsps = NULL; int task_idx; WorkerTask *task = NULL; BackgroundWorkerHandle *handle; BgwHandleStatus status; char *error_msg = NULL; bool task_exists; if (RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"), errhint("pg_squeeze cannot be used during recovery."))); if (PG_ARGISNULL(0) || PG_ARGISNULL(1)) ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), (errmsg("Both schema and table name must be specified")))); relschema = PG_GETARG_NAME(0); relname = PG_GETARG_NAME(1); if (!PG_ARGISNULL(2)) indname = PG_GETARG_NAME(2); if (!PG_ARGISNULL(3)) tbspname = PG_GETARG_NAME(3); if (!PG_ARGISNULL(4)) { ind_tbsps = PG_GETARG_ARRAYTYPE_P(4); if (VARSIZE(ind_tbsps) >= IND_TABLESPACES_ARRAY_SIZE) ereport(ERROR, (errmsg("the value of \"ind_tablespaces\" is too big"))); } /* Find free task structure. 
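 * The tasks live in shared memory; besides finding a free entry,
 * get_unused_task() reports via task_exists whether some task is already
 * registered for the same relation, so at most one worker processes a
 * given table at a time.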
*/ task = get_unused_task(MyDatabaseId, NameStr(*relschema), NameStr(*relname), &task_idx, &task_exists); if (task == NULL) { if (task_exists) ereport(ERROR, (errmsg("task for relation \"%s\".\"%s\" already exists", NameStr(*relschema), NameStr(*relname)))); else ereport(ERROR, (errmsg("too many concurrent tasks in progress"))); } /* Fill-in the remaining task information. */ initialize_worker_task(task, -1, indname, tbspname, ind_tbsps, false, true, squeeze_max_xlock_time); /* * Unlike scheduler_worker_loop() we cannot build the snapshot here, the * worker will do. (It will also create the replication slot.) This is * related to the variable am_i_standalone in process_task(). */ /* Start the worker to handle the task. */ if (!start_worker_internal(false, task_idx, &handle)) { /* * The worker could not even get registered, so it won't set its * status to WTS_UNUSED. Make sure the task does not leak. */ release_task(task); ereport(ERROR, (errmsg("squeeze worker could not start")), (errhint("consider increasing \"max_worker_processes\" or decreasing \"squeeze.workers_per_database\""))); } /* Wait for the worker's exit. */ PG_TRY(); { status = WaitForBackgroundWorkerShutdown(handle); } PG_CATCH(); { /* * Make sure the worker stops. Interrupt received from the user is the * typical use case. */ interrupt_worker(task); PG_RE_THROW(); } PG_END_TRY(); if (status == BGWH_POSTMASTER_DIED) { ereport(ERROR, (errmsg("the postmaster died before the background worker could finish"), errhint("More details may be available in the server log."))); /* No need to release the task in the shared memory. */ } /* * WaitForBackgroundWorkerShutdown() should not return anything else. */ Assert(status == BGWH_STOPPED); if (strlen(task->error_msg) > 0) error_msg = pstrdup(task->error_msg); if (error_msg) ereport(ERROR, (errmsg("%s", error_msg))); PG_RETURN_VOID(); } /* * Returns a newly assigned task. Return NULL if there's no unused slot or a * task already exists for given relation. * * The index in the task array is returned in *task_idx. * * The returned task has 'dbid', 'relschema' and 'relname' fields initialized. * * If NULL is returned, *duplicate tells whether it's due to an existing task * for given relation. */ static WorkerTask * get_unused_task(Oid dbid, char *relschema, char *relname, int *task_idx, bool *duplicate) { int i; WorkerTask *task; WorkerTask *result = NULL; int res_idx = -1; *duplicate = false; /* * Find an unused task and make sure that a valid task does not exist for * the same relation. */ LWLockAcquire(workerData->lock, LW_EXCLUSIVE); for (i = 0; i < NUM_WORKER_TASKS; i++) { WorkerTaskState worker_state; bool needs_check = false; task = &workerData->tasks[i]; SpinLockAcquire(&task->mutex); worker_state = task->worker_state; /* * String comparisons shouldn't take place under spinlock, but the * spinlock is actually not necessary. Once we have released it, the * squeeze worker can set the state to UNUSED, so we might report a * duplicate task incorrectly. That's not perfect but should not * happen too often. (If the task is already UNUSED, no one should * change it while we are holding the LW lock.) */ SpinLockRelease(&task->mutex); /* * Stop looking for an unused task and checking duplicates if a * duplicate was seen. */ if (!*duplicate) { if (worker_state != WTS_UNUSED) { /* * Consider tasks which might be in progress for possible * duplicates of the task we're going to submit. 
*/ needs_check = true; } else if (result == NULL) { /* Result candidate */ result = task; res_idx = i; } } if (needs_check) { /* * The strings are only set while workerData->lock is held in * exclusive mode (see below), so we can safely check them here. * * Spinlock not needed to access ->dbid because the worker should * never change it (even when exiting). */ if (task->dbid == dbid && strcmp(NameStr(task->relschema), relschema) == 0 && strcmp(NameStr(task->relname), relname) == 0) { result = NULL; res_idx = -1; *duplicate = true; } } /* * If the task became UNUSED recently, it might still contain obsolete * information because the worker only sets the status when exiting. * (This clean-up shouldn't be necessary because the caller will * initialize it when we return it next time, but it seems a good * practice, e.g. for debugging.) */ if (worker_state == WTS_UNUSED && OidIsValid(task->dbid)) { /* * Note that the scheduler worker should have detached from the * DSM segment pointed to by task->repl_slot.seg, by calling * drop_replication_slots(). (The "standalone" worker should not * have set it.) */ clear_task(task); } } if (result == NULL || *duplicate) goto done; /* * Make sure that no other backend / scheduler can use the task. * * As long as we hold the LW lock, no one else should be currently trying * to allocate this task, so no spinlock is needed. */ result->worker_state = WTS_INIT; /* * While holding the LW lock, initialize the fields we use to check * uniqueness of the task. */ result->dbid = dbid; namestrcpy(&result->relschema, relschema); namestrcpy(&result->relname, relname); done: LWLockRelease(workerData->lock); *task_idx = res_idx; return result; } /* * Fill-in "user data" of WorkerTask. task_id, dbid, relschema and relname * should already be set. */ static void initialize_worker_task(WorkerTask *task, int task_id, Name indname, Name tbspname, ArrayType *ind_tbsps, bool last_try, bool skip_analyze, int max_xlock_time) { StringInfoData buf; initStringInfo(&buf); task->task_id = task_id; appendStringInfo(&buf, "squeeze worker task: id=%d, relschema=%s, relname=%s", task->task_id, NameStr(task->relschema), NameStr(task->relname)); if (indname) { namestrcpy(&task->indname, NameStr(*indname)); appendStringInfo(&buf, ", indname: %s", NameStr(task->indname)); } else NameStr(task->indname)[0] = '\0'; if (tbspname) { namestrcpy(&task->tbspname, NameStr(*tbspname)); appendStringInfo(&buf, ", tbspname: %s", NameStr(task->tbspname)); } else NameStr(task->tbspname)[0] = '\0'; /* ind_tbsps is in a binary format, don't bother logging it right now. */ if (ind_tbsps) { if (VARSIZE(ind_tbsps) > IND_TABLESPACES_ARRAY_SIZE) ereport(ERROR, (errmsg("the array of index tablespaces is too big"))); memcpy(task->ind_tbsps, ind_tbsps, VARSIZE(ind_tbsps)); } else SET_VARSIZE(task->ind_tbsps, 0); ereport(DEBUG1, (errmsg("%s", buf.data))); pfree(buf.data); task->error_msg[0] = '\0'; task->last_try = last_try; task->skip_analyze = skip_analyze; task->max_xlock_time = max_xlock_time; } /* * Register either scheduler or squeeze worker, according to the argument. * * The number of scheduler workers per database is limited by the * squeeze_workers_per_database configuration variable. * * The return value tells whether we could at least register the worker. 
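 *
 * (More precisely, squeeze_workers_per_database limits the squeeze
 * workers; squeeze_worker_main() additionally makes sure that at most one
 * scheduler runs per database.)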
*/ static bool start_worker_internal(bool scheduler, int task_idx, BackgroundWorkerHandle **handle) { WorkerConInteractive con; BackgroundWorker worker; char *kind; Assert(!scheduler || task_idx < 0); /* * Make sure all the task fields are visible to the worker before starting * it. This is similar to the use of the write barrier in * RegisterDynamicBackgroundWorker() in PG core. However, the new process * does not need to use "read barrier" because once it's started, the * shared memory writes done by start_worker_internal() must essentially * have been read. (Otherwise the worker would not start.) */ if (task_idx >= 0) pg_write_barrier(); kind = scheduler ? "scheduler" : "squeeze"; con.dbid = MyDatabaseId; con.roleid = GetUserId(); con.scheduler = scheduler; con.task_idx = task_idx; squeeze_initialize_bgworker(&worker, NULL, &con, MyProcPid); ereport(DEBUG1, (errmsg("registering pg_squeeze %s worker", kind))); if (!RegisterDynamicBackgroundWorker(&worker, handle)) return false; if (handle == NULL) /* * Caller is not interested in the status, the return value does not * matter. */ return false; Assert(*handle != NULL); return true; } /* * Convenience routine to allocate the structure in TopMemoryContext. We need * it to survive fork and initialization of the worker. * * (The allocation cannot be avoided as BackgroundWorker.bgw_extra does not * provide enough space for us.) */ WorkerConInit * allocate_worker_con_info(char *dbname, char *rolename) { WorkerConInit *result; result = (WorkerConInit *) MemoryContextAllocZero(TopMemoryContext, sizeof(WorkerConInit)); result->dbname = MemoryContextStrdup(TopMemoryContext, dbname); result->rolename = MemoryContextStrdup(TopMemoryContext, rolename); return result; } /* * Initialize the worker and pass connection info in the appropriate form. * * 'con_init' is passed only for the scheduler worker, whereas * 'con_interactive' can be passed for both squeeze worker and scheduler * worker. */ void squeeze_initialize_bgworker(BackgroundWorker *worker, WorkerConInit *con_init, WorkerConInteractive *con_interactive, pid_t notify_pid) { char *dbname; bool scheduler; char *kind; worker->bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; worker->bgw_start_time = BgWorkerStart_RecoveryFinished; worker->bgw_restart_time = BGW_NEVER_RESTART; sprintf(worker->bgw_library_name, "pg_squeeze"); sprintf(worker->bgw_function_name, "squeeze_worker_main"); if (con_init != NULL) { worker->bgw_main_arg = (Datum) PointerGetDatum(con_init); dbname = con_init->dbname; scheduler = true; } else if (con_interactive != NULL) { worker->bgw_main_arg = (Datum) 0; StaticAssertStmt(sizeof(WorkerConInteractive) <= BGW_EXTRALEN, "WorkerConInteractive is too big"); memcpy(worker->bgw_extra, con_interactive, sizeof(WorkerConInteractive)); /* * Catalog lookup is possible during interactive start, so do it for * the sake of bgw_name. Comment of WorkerConInteractive structure * explains why we still must use the OID for worker registration. */ dbname = get_database_name(con_interactive->dbid); scheduler = con_interactive->scheduler; } else elog(ERROR, "Connection info not available for squeeze worker."); kind = scheduler ? 
"scheduler" : "squeeze"; snprintf(worker->bgw_name, BGW_MAXLEN, "pg_squeeze %s worker for database %s", kind, dbname); snprintf(worker->bgw_type, BGW_MAXLEN, "squeeze worker"); worker->bgw_notify_pid = notify_pid; } static volatile sig_atomic_t got_sighup = false; static volatile sig_atomic_t got_sigterm = false; /* * Sleep time (in seconds) of the scheduler worker. * * If there are no tables eligible for squeezing, the worker sleeps this * amount of seconds and then try again. The value should be low enough to * ensure that no scheduled table processing is missed, while the schedule * granularity is one minute. * * So far there seems to be no reason to have separate variables for the * scheduler and the squeeze worker. */ static int worker_naptime = 20; void squeeze_worker_main(Datum main_arg) { Datum arg; int i; bool found_scheduler; int nworkers; int task_idx = -1; /* The worker should do its cleanup when exiting. */ before_shmem_exit(worker_shmem_shutdown, (Datum) 0); pqsignal(SIGHUP, worker_sighup); pqsignal(SIGTERM, worker_sigterm); BackgroundWorkerUnblockSignals(); /* Retrieve connection info. */ Assert(MyBgworkerEntry != NULL); arg = MyBgworkerEntry->bgw_main_arg; if (arg != (Datum) 0) { WorkerConInit *con; con = (WorkerConInit *) DatumGetPointer(arg); am_i_scheduler = true; BackgroundWorkerInitializeConnection(con->dbname, con->rolename, 0 /* flags */ ); } else { WorkerConInteractive con; /* Ensure aligned access. */ memcpy(&con, MyBgworkerEntry->bgw_extra, sizeof(WorkerConInteractive)); am_i_scheduler = con.scheduler; BackgroundWorkerInitializeConnectionByOid(con.dbid, con.roleid, 0); task_idx = con.task_idx; } /* * Initialize MyWorkerTask as soon as possible so that * worker_shmem_shutdown() can clean it up in the shared memory in case of * ERROR. */ if (task_idx >= 0) { Assert(!am_i_scheduler); Assert(task_idx < NUM_WORKER_TASKS); MyWorkerTask = &workerData->tasks[task_idx]; } found_scheduler = false; nworkers = 0; /* * Find and initialize a slot for this worker. * * While doing that, make sure that there is no more than one scheduler * and no more than squeeze_workers_per_database workers running on this * database. * * Exclusive lock is needed to make sure that the maximum number of * workers is not exceeded due to race conditions. */ Assert(MyWorkerSlot == NULL); LWLockAcquire(workerData->lock, LW_EXCLUSIVE); /* * The first worker after restart is responsible for cleaning up * replication slots and/or origins that other workers could not remove * due to server crash. Do that while holding the exclusive lock - that * also ensures that the other workers wait for the cleanup to finish * before they create new slots / origins, which we might then drop * accidentally. * * If no "standalone" squeeze worker performed the cleanup yet, the * scheduler must do it now because it'll also create replication slots / * origins. Those could be dropped by one of the new workers if that * worker was to perform the cleanup. */ if (!workerData->cleanup_done) { cleanup_after_server_start(); workerData->cleanup_done = true; } for (i = 0; i < workerData->nslots; i++) { WorkerSlot *slot = &workerData->slots[i]; Oid dbid; /* * The spinlock might seem unnecessary, but w/o that it could happen * that we saw 'dbid' invalid (i.e. ready to use) while another worker * is still clearing the other fields (before exit) and thus it can * overwrite our settings - see worker_shmem_shutdown(). 
*/ SpinLockAcquire(&slot->mutex); dbid = slot->dbid; SpinLockRelease(&slot->mutex); if (dbid == MyDatabaseId) { if (am_i_scheduler && slot->scheduler) { elog(WARNING, "one scheduler worker already running on database oid=%u", MyDatabaseId); found_scheduler = true; break; } else if (!am_i_scheduler && !slot->scheduler) { if (++nworkers >= squeeze_workers_per_database) { elog(WARNING, "%d squeeze worker(s) already running on database oid=%u", nworkers, MyDatabaseId); break; } } } else if (dbid == InvalidOid && MyWorkerSlot == NULL) MyWorkerSlot = slot; } if (found_scheduler || (nworkers >= squeeze_workers_per_database)) { LWLockRelease(workerData->lock); goto done; } /* * Fill-in all the information we have. (relid will be set in * process_task() unless this worker is a scheduler.) */ if (MyWorkerSlot) { WorkerSlot *slot = MyWorkerSlot; /* * The spinlock is probably not necessary here (no one else should be * interested in this slot). */ SpinLockAcquire(&slot->mutex); slot->dbid = MyDatabaseId; Assert(slot->relid == InvalidOid); Assert(slot->pid == InvalidPid); slot->pid = MyProcPid; slot->scheduler = am_i_scheduler; MemSet(&slot->progress, 0, sizeof(WorkerProgress)); SpinLockRelease(&slot->mutex); } LWLockRelease(workerData->lock); /* Is there no unused slot? */ if (MyWorkerSlot == NULL) { elog(WARNING, "no unused slot found for pg_squeeze worker process"); goto done; } if (am_i_scheduler) scheduler_worker_loop(); else process_task(); done: proc_exit(0); } static void worker_sighup(SIGNAL_ARGS) { int save_errno = errno; got_sighup = true; SetLatch(MyLatch); errno = save_errno; } static void worker_sigterm(SIGNAL_ARGS) { int save_errno = errno; got_sigterm = true; SetLatch(MyLatch); errno = save_errno; } static void scheduler_worker_loop(void) { long delay = 0L; int i; MemoryContext sched_cxt, old_cxt; /* Context for allocations which cannot be freed too early. */ sched_cxt = AllocSetContextCreate(TopMemoryContext, "pg_squeeze scheduler context", ALLOCSET_DEFAULT_SIZES); while (!got_sigterm) { StringInfoData query; int rc; uint64 ntask; TupleDesc tupdesc; TupleTableSlot *slot; ListCell *lc; int nslots; List *task_idxs = NIL; /* * Make sure all the workers we launched in the previous loop and * their tasks and replication slots are cleaned up. */ cleanup_workers_and_tasks(false); /* Free the corresponding memory. */ MemoryContextReset(sched_cxt); rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, delay, PG_WAIT_EXTENSION); ResetLatch(MyLatch); if (rc & WL_POSTMASTER_DEATH) proc_exit(1); if (got_sighup) { got_sighup = false; ProcessConfigFile(PGC_SIGHUP); } run_command("SELECT squeeze.check_schedule()", SPI_OK_SELECT); /* * Turn new tasks into ready (or processed if the tables should not * really be squeezed). */ run_command("SELECT squeeze.dispatch_new_tasks()", SPI_OK_SELECT); /* * Are there some tasks with no worker assigned? 
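 * The LEFT JOIN against squeeze.get_active_workers() in the query below
 * filters out tables some squeeze worker is already processing, and the
 * LIMIT keeps the number of newly picked tasks within
 * squeeze_workers_per_database.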
*/ initStringInfo(&query); appendStringInfo( &query, "SELECT t.id, tb.tabschema, tb.tabname, tb.clustering_index, " "tb.rel_tablespace, tb.ind_tablespaces, t.tried >= tb.max_retry, " "tb.skip_analyze " "FROM squeeze.tasks t, squeeze.tables tb " "LEFT JOIN squeeze.get_active_workers() AS w " "ON (tb.tabschema, tb.tabname) = (w.tabschema, w.tabname) " "WHERE w.tabname ISNULL AND t.state = 'ready' AND t.table_id = tb.id " "ORDER BY t.id " "LIMIT %d", squeeze_workers_per_database); StartTransactionCommand(); PushActiveSnapshot(GetTransactionSnapshot()); if (SPI_connect() != SPI_OK_CONNECT) ereport(ERROR, (errmsg("could not connect to SPI manager"))); pgstat_report_activity(STATE_RUNNING, query.data); rc = SPI_execute(query.data, true, 0); pgstat_report_activity(STATE_IDLE, NULL); if (rc != SPI_OK_SELECT) ereport(ERROR, (errmsg("SELECT command failed: %s", query.data))); #if PG_VERSION_NUM >= 130000 ntask = SPI_tuptable->numvals; #else ntask = SPI_processed; #endif ereport(DEBUG1, (errmsg("scheduler worker: %zu tasks available", ntask))); if (ntask > 0) { tupdesc = CreateTupleDescCopy(SPI_tuptable->tupdesc); slot = MakeSingleTupleTableSlot(tupdesc, &TTSOpsHeapTuple); } /* Initialize the task slots. */ for (i = 0; i < ntask; i++) { int idx, task_id; WorkerTask *task; HeapTuple tup; Datum datum; bool isnull; Name relschema, relname, cl_index, rel_tbsp; ArrayType *ind_tbsps; bool last_try; bool skip_analyze; bool task_exists = false; cl_index = NULL; rel_tbsp = NULL; ind_tbsps = NULL; /* Retrieve the tuple attributes and use them to fill the task. */ tup = heap_copytuple(SPI_tuptable->vals[i]); ExecClearTuple(slot); ExecStoreHeapTuple(tup, slot, true); datum = slot_getattr(slot, 1, &isnull); Assert(!isnull); task_id = DatumGetInt32(datum); datum = slot_getattr(slot, 2, &isnull); Assert(!isnull); relschema = DatumGetName(datum); datum = slot_getattr(slot, 3, &isnull); Assert(!isnull); relname = DatumGetName(datum); task = get_unused_task(MyDatabaseId, NameStr(*relschema), NameStr(*relname), &idx, &task_exists); if (task == NULL) { if (task_exists) { /* Already in progress, go for the next one. */ ereport(WARNING, (errmsg("task already exists for table \"%s\".\"%s\"", NameStr(*relschema), NameStr(*relname)))); continue; } else { /* * No point in fetching the remaining columns if all the * tasks are already used. */ ereport(WARNING, (errmsg("the task queue is currently full"))); break; } } datum = slot_getattr(slot, 4, &isnull); if (!isnull) cl_index = DatumGetName(datum); datum = slot_getattr(slot, 5, &isnull); if (!isnull) rel_tbsp = DatumGetName(datum); datum = slot_getattr(slot, 6, &isnull); if (!isnull) ind_tbsps = DatumGetArrayTypePCopy(datum); datum = slot_getattr(slot, 7, &isnull); Assert(!isnull); last_try = DatumGetBool(datum); datum = slot_getattr(slot, 8, &isnull); Assert(!isnull); skip_analyze = DatumGetBool(datum); /* Fill the task. */ initialize_worker_task(task, task_id, cl_index, rel_tbsp, ind_tbsps, last_try, skip_analyze, /* XXX Should max_xlock_time be added to * squeeze.tables ? */ 0); /* The list must survive SPI_finish(). */ old_cxt = MemoryContextSwitchTo(sched_cxt); task_idxs = lappend_int(task_idxs, idx); MemoryContextSwitchTo(old_cxt); } if (ntask > 0) { ExecDropSingleTupleTableSlot(slot); FreeTupleDesc(tupdesc); } /* Finish the data retrieval. */ if (SPI_finish() != SPI_OK_FINISH) ereport(ERROR, (errmsg("SPI_finish failed"))); PopActiveSnapshot(); CommitTransactionCommand(); pgstat_report_stat(false); /* Initialize the array to track the workers we start. 
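* * Each element pairs the handle of a background worker with the shared memory task assigned to it - cleanup_workers_and_tasks() needs both to interrupt the workers and to wait for their shutdown.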
*/ squeezeWorkerCount = nslots = list_length(task_idxs); if (squeezeWorkerCount > 0) { /* * The worker info should be in the sched_cxt which we reset at * the top of each iteration. */ squeezeWorkers = (SqueezeWorker *) MemoryContextAllocZero(sched_cxt, squeezeWorkerCount * sizeof(SqueezeWorker)); /* Create and initialize the replication slot for each worker. */ PG_TRY(); { create_replication_slots(nslots, sched_cxt); } PG_CATCH(); { foreach(lc, task_idxs) { int task_idx = lfirst_int(lc); WorkerTask *task = &workerData->tasks[task_idx]; /* * worker_shmem_shutdown() will call release_task() but we * need to do it here on behalf of the workers which will * never start. * * get_unused_task() will detach the shared segments where * they exist. */ release_task(task); } PG_RE_THROW(); } PG_END_TRY(); /* * Now that the transaction has committed, we can start the * workers. (start_worker_internal() needs to run in a transaction * because it accesses the system catalog.) */ i = 0; foreach(lc, task_idxs) { SqueezeWorker *worker; int task_idx; bool registered; worker = &squeezeWorkers[i]; worker->handle = NULL; task_idx = lfirst_int(lc); worker->task = &workerData->tasks[task_idx]; worker->task->repl_slot = squeezeWorkerSlots[i]; SetCurrentStatementStartTimestamp(); StartTransactionCommand(); /* * The handle (and possibly other allocations) must survive * the current transaction. */ old_cxt = MemoryContextSwitchTo(sched_cxt); registered = start_worker_internal(false, task_idx, &worker->handle); MemoryContextSwitchTo(old_cxt); if (!registered) { /* * The worker could not even get registered, so it won't * set its status to WTS_UNUSED. Make sure the task does * not leak. */ release_task(worker->task); ereport(ERROR, (errmsg("squeeze worker could not start")), (errhint("consider increasing \"max_worker_processes\" or decreasing \"squeeze.workers_per_database\""))); } CommitTransactionCommand(); i++; } } /* Check later if any table meets the schedule. */ delay = worker_naptime * 1000L; } /* * Do not reset/delete sched_cxt, worker_shmem_shutdown() may need the * information it contains. */ } static void cleanup_workers_and_tasks(bool interrupt) { SqueezeWorker *worker; int i; if (interrupt) { /* Notify the workers (via their tasks) that they should exit. */ for (i = 0; i < squeezeWorkerCount; i++) { worker = &squeezeWorkers[i]; if (worker->task) interrupt_worker(worker->task); } } /* * Wait until all the workers started in the previous loops have * finished. */ for (i = 0; i < squeezeWorkerCount; i++) { worker = &squeezeWorkers[i]; /* Not even started, or already stopped? */ if (worker->handle == NULL) continue; wait_for_worker_shutdown(worker); } squeezeWorkerCount = 0; /* The reset of sched_cxt will free the memory. */ squeezeWorkers = NULL; /* Drop the replication slots. */ if (squeezeWorkerSlotCount > 0) drop_replication_slots(); } static void wait_for_worker_shutdown(SqueezeWorker *worker) { BgwHandleStatus status; status = WaitForBackgroundWorkerShutdown(worker->handle); if (status == BGWH_POSTMASTER_DIED) { ereport(ERROR, (errmsg("the postmaster died before the squeeze worker could finish"), errhint("More details may be available in the server log."))); } /* * WaitForBackgroundWorkerShutdown() should not return anything * else. */ Assert(status == BGWH_STOPPED); pfree(worker->handle); worker->handle = NULL; } static void process_task(void) { MemoryContext task_cxt; ErrorData *edata; Assert(MyWorkerTask != NULL); /* * Memory context for auxiliary per-task allocations.
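* * In particular, the error data captured by the PG_CATCH block below is allocated in this context, so it must be fully processed before the context is deleted.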
*/ task_cxt = AllocSetContextCreate(TopMemoryContext, "pg_squeeze task context", ALLOCSET_DEFAULT_SIZES); squeeze_max_xlock_time = MyWorkerTask->max_xlock_time; /* Process the assigned task. */ PG_TRY(); { process_task_internal(task_cxt); } PG_CATCH(); { squeeze_handle_error_db(&edata, task_cxt); squeeze_handle_error_app(edata, MyWorkerTask); /* * Not sure it makes sense to rethrow the ERROR. The worker is going * to exit anyway. */ } PG_END_TRY(); MemoryContextDelete(task_cxt); } /* * Create a replication slot for each squeeze worker and find the start point * for logical decoding. * * We create and initialize all the slots at once because * DecodingContextFindStartpoint() waits for the running transactions to * complete. If each worker had to initialize its slot, it'd have to wait * until the other worker(s) are done with their current job (which usually * takes some time), so the workers wouldn't actually do their work in * parallel. */ static void create_replication_slots(int nslots, MemoryContext mcxt) { uint32 i; ReplSlotStatus *res_ptr; MemoryContext old_cxt; Assert(squeezeWorkerSlots == NULL && squeezeWorkerSlotCount == 0); /* * Use a transaction so that all the slot-related locks are freed on ERROR * and thus drop_replication_slots() can do its work. */ StartTransactionCommand(); #if PG_VERSION_NUM >= 150000 CheckSlotPermissions(); #endif CheckLogicalDecodingRequirements(); /* * We are in a transaction, so make sure various allocations survive the * transaction commit. */ old_cxt = MemoryContextSwitchTo(mcxt); squeezeWorkerSlots = (ReplSlotStatus *) palloc0(nslots * sizeof(ReplSlotStatus)); res_ptr = squeezeWorkerSlots; /* * XXX It might be faster if we created one slot using the API and the * other ones by copying, however pg_copy_logical_replication_slot() * passes need_full_snapshot=false to CreateInitDecodingContext(). */ for (i = 0; i < nslots; i++) { char name[NAMEDATALEN]; LogicalDecodingContext *ctx; ReplicationSlot *slot; Snapshot snapshot; Size snap_size; char *snap_dst; int slot_nr; if (am_i_standalone) { /* * squeeze_table() can be called concurrently (for different * tables), so make sure that each call generates a unique slot * name. */ Assert(nslots == 1); /* * Try to minimize the probability of collision with a * "non-standalone" worker. */ slot_nr = MyProcPid + 1024; } else slot_nr = i; snprintf(name, NAMEDATALEN, REPL_SLOT_PREFIX "%u_%u", MyDatabaseId, slot_nr); #if PG_VERSION_NUM >= 170000 ReplicationSlotCreate(name, true, RS_PERSISTENT, false, false, false); #elif PG_VERSION_NUM >= 140000 ReplicationSlotCreate(name, true, RS_PERSISTENT, false); #else ReplicationSlotCreate(name, true, RS_PERSISTENT); #endif slot = MyReplicationSlot; /* * Save the name early so that the slot gets cleaned up if the steps * below throw ERROR. */ namestrcpy(&res_ptr->name, slot->data.name.data); squeezeWorkerSlotCount++; /* * None of the prepare_write, do_write and update_progress callbacks * is useful for us. * * Regarding the value of need_full_snapshot, we pass true to protect * the table data from VACUUM. Otherwise the historical snapshot we * use for the initial load could miss some data. (Unlike logical * decoding, we need the historical snapshot for non-catalog tables.)
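* * With need_full_snapshot=true, CreateInitDecodingContext() sets the slot's effective_xmin (not just the catalog xmin); that is the value we copy into data.xmin below so that the protection survives the release of the slot.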
*/ ctx = CreateInitDecodingContext(REPL_PLUGIN_NAME, NIL, true, InvalidXLogRecPtr, #if PG_VERSION_NUM >= 130000 XL_ROUTINE(.page_read = read_local_xlog_page, .segment_open = wal_segment_open, .segment_close = wal_segment_close), #else logical_read_local_xlog_page, #endif NULL, NULL, NULL); /* * We don't have control over setting fast_forward, so at least check * it. */ Assert(!ctx->fast_forward); /* * Bring the snapshot builder into the SNAPBUILD_CONSISTENT state so * that the worker can get its snapshot and start decoding * immediately. This is where we might need to wait for other * transactions to finish, so it should not be done by the workers. */ DecodingContextFindStartpoint(ctx); /* The call above could have changed the memory context. */ MemoryContextSwitchTo(mcxt); /* Get the values the caller is interested in. */ res_ptr->confirmed_flush = slot->data.confirmed_flush; /* * Unfortunately the API is such that CreateDecodingContext() assumes * need_full_snapshot=false, so the worker won't be able to create the * snapshot for the initial load. Therefore we serialize the snapshot * here and pass it to the worker via shared memory. */ snapshot = build_historic_snapshot(ctx->snapshot_builder); snap_size = EstimateSnapshotSpace(snapshot); if (!am_i_standalone) { res_ptr->snap_seg = dsm_create(snap_size, 0); /* * The current transaction's commit must not detach the * segment. */ dsm_pin_mapping(res_ptr->snap_seg); res_ptr->snap_handle = dsm_segment_handle(res_ptr->snap_seg); res_ptr->snap_private = NULL; snap_dst = (char *) dsm_segment_address(res_ptr->snap_seg); } else { res_ptr->snap_seg = NULL; res_ptr->snap_handle = DSM_HANDLE_INVALID; snap_dst = (char *) MemoryContextAlloc(mcxt, snap_size); res_ptr->snap_private = snap_dst; } /* * XXX Should we care about alignment? The function doesn't seem to * need that. */ SerializeSnapshot(snapshot, snap_dst); res_ptr++; /* * Done for now, the worker will have to set up the context on its * own. */ FreeDecodingContext(ctx); /* Prevent ReplicationSlotRelease() from clearing effective_xmin. */ SpinLockAcquire(&slot->mutex); Assert(TransactionIdIsValid(slot->effective_xmin) && !TransactionIdIsValid(slot->data.xmin)); slot->data.xmin = slot->effective_xmin; SpinLockRelease(&slot->mutex); ReplicationSlotRelease(); } MemoryContextSwitchTo(old_cxt); CommitTransactionCommand(); Assert(squeezeWorkerSlotCount == nslots); } /* * Drop replication slots the worker created. If this is the scheduler worker, * we may need to wait for the squeeze workers to release the slots. */ static void drop_replication_slots(void) { int i; /* * Called during normal operation and now called again by the * worker_shmem_shutdown callback? */ if (squeezeWorkerSlots == NULL) { Assert(squeezeWorkerSlotCount == 0); return; } /* * ERROR in create_replication_slots() can leave us with one of the slots * acquired, so release it before we start dropping them all. */ if (MyReplicationSlot) ReplicationSlotRelease(); for (i = 0; i < squeezeWorkerSlotCount; i++) { ReplSlotStatus *slot; slot = &squeezeWorkerSlots[i]; if (strlen(NameStr(slot->name)) > 0) { /* nowait=false, i.e. wait */ ReplicationSlotDrop(NameStr(slot->name), false); } /* Detach from the shared memory segment. */ if (slot->snap_seg) { dsm_detach(slot->snap_seg); slot->snap_seg = NULL; slot->snap_handle = DSM_HANDLE_INVALID; } } squeezeWorkerSlotCount = 0; /* * Caller should reset the containing memory context. (Unless this is * called during exit.)
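* (worker_shmem_shutdown() is such a caller - the process is exiting, so the memory is about to be released anyway.)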
*/ squeezeWorkerSlots = NULL; } /* * The first pg_squeeze worker launched after server start calls this * function to make sure that no stale replication slots / origins created by * pg_squeeze exist. * * The error data is captured below so that the error message can be passed * to the backend that launched the worker; the ERROR itself is supposed to * terminate the worker. */ static void cleanup_after_server_start(void) { PG_TRY(); { cleanup_repl_origins(); cleanup_repl_slots(); } PG_CATCH(); { ErrorData *edata; Assert(MyWorkerTask != NULL); /* * The worker should exit pretty soon, so it's o.k. to use * TopMemoryContext (i.e. it causes no real memory leak). */ squeeze_handle_error_db(&edata, TopMemoryContext); PG_RE_THROW(); } PG_END_TRY(); } /* * Sub-routine of cleanup_after_server_start(). */ static void cleanup_repl_origins(void) { Relation rel; TableScanDesc scan; HeapTuple tuple; char *orig_name; List *origs = NIL; ListCell *lc; StartTransactionCommand(); rel = table_open(ReplicationOriginRelationId, AccessShareLock); scan = table_beginscan_catalog(rel, 0, NULL); while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) { Form_pg_replication_origin form; form = (Form_pg_replication_origin) GETSTRUCT(tuple); orig_name = text_to_cstring(&form->roname); origs = lappend(origs, orig_name); } table_endscan(scan); table_close(rel, AccessShareLock); foreach(lc, origs) { orig_name = (char *) lfirst(lc); /* Drop the origin iff it looks like one created by pg_squeeze. */ if (strncmp(orig_name, REPLORIGIN_NAME_PREFIX, strlen(REPLORIGIN_NAME_PREFIX)) == 0) { ereport(DEBUG1, (errmsg("cleaning up replication origin \"%s\"", orig_name))); #if PG_VERSION_NUM >= 140000 /* nowait=true because no one should be using the origin. */ replorigin_drop_by_name(orig_name, false, true); #else { Oid originid; originid = replorigin_by_name(orig_name, false); replorigin_drop(originid, true); } #endif } } list_free(origs); CommitTransactionCommand(); } /* * Sub-routine of cleanup_after_server_start(). */ static void cleanup_repl_slots(void) { int slotno; List *slot_names = NIL; LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (slotno = 0; slotno < max_replication_slots; slotno++) { ReplicationSlot *slot = &ReplicationSlotCtl->replication_slots[slotno]; ReplicationSlot slot_contents; char *name; if (!slot->in_use) continue; SpinLockAcquire(&slot->mutex); slot_contents = *slot; SpinLockRelease(&slot->mutex); name = NameStr(slot_contents.data.name); if (strncmp(name, REPL_SLOT_PREFIX, strlen(REPL_SLOT_PREFIX)) == 0) slot_names = lappend(slot_names, pstrdup(name)); } LWLockRelease(ReplicationSlotControlLock); if (list_length(slot_names) > 0) { ListCell *lc; /* * XXX Is a transaction needed here? We do not access system catalogs * and LW locks are released by the worker on exit anyway. */ foreach(lc, slot_names) { char *slot_name = (char *) lfirst(lc); ereport(DEBUG1, (errmsg("cleaning up replication slot \"%s\"", slot_name))); ReplicationSlotDrop(slot_name, true); } list_free_deep(slot_names); } } /* * Wrapper for SnapBuildInitialSnapshot(). * * We cannot meet the assertions that SnapBuildInitialSnapshot() contains, * nor should we set MyPgXact->xmin. */ static Snapshot build_historic_snapshot(SnapBuild *builder) { Snapshot result; int XactIsoLevel_save; TransactionId xmin_save; /* * Fake XactIsoLevel so that the assertions in SnapBuildInitialSnapshot() * don't fire. */ XactIsoLevel_save = XactIsoLevel; XactIsoLevel = XACT_REPEATABLE_READ; /* * Likewise, fake MyPgXact->xmin so that the corresponding check passes.
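* (PostgreSQL 14 removed the PGXACT structure and moved its xmin field into PGPROC, hence the version-specific branches below.)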
*/ #if PG_VERSION_NUM >= 140000 xmin_save = MyProc->xmin; MyProc->xmin = InvalidTransactionId; #else xmin_save = MyPgXact->xmin; MyPgXact->xmin = InvalidTransactionId; #endif /* * Call the core function to actually build the snapshot. */ result = SnapBuildInitialSnapshot(builder); /* * Restore the original values. */ XactIsoLevel = XactIsoLevel_save; #if PG_VERSION_NUM >= 140000 MyProc->xmin = xmin_save; #else MyPgXact->xmin = xmin_save; #endif return result; } /* * The process_next_task() function used to be implemented in PL/pgSQL. * However, since it calls the squeeze_table() function, and since commit * 240e0dbacd in PG core made it impossible to call squeeze_table() via the * postgres executor, this function must be implemented in C and call * squeeze_table() directly. * * The task to process is the one pointed to by MyWorkerTask. */ static void process_task_internal(MemoryContext task_cxt) { Name relschema, relname; Name cl_index = NULL; Name rel_tbsp = NULL; ArrayType *ind_tbsps = NULL; WorkerTask *task; uint32 arr_size; TimestampTz start_ts; bool success; RangeVar *relrv; Relation rel; Oid relid; ErrorData *edata; task = MyWorkerTask; /* * Create the replication slot if there is none. This happens when the * worker is started by the squeeze_table() function, which is run by the * PG executor and therefore cannot build the historic snapshot (due to * the commit 240e0dbacd in PG core). (And the scheduler worker, which * usually creates the slots, is not involved here.) */ if (task->repl_slot.snap_handle == DSM_HANDLE_INVALID) am_i_standalone = true; if (am_i_standalone) { /* * TopMemoryContext is o.k. here: this worker only processes a single * task and then exits. */ create_replication_slots(1, TopMemoryContext); task->repl_slot = squeezeWorkerSlots[0]; } /* * Once the task is allocated and the worker is launched, only the worker * is expected to change the task, so access it w/o locking. */ Assert(task->worker_state == WTS_INIT); task->worker_state = WTS_IN_PROGRESS; relschema = &task->relschema; relname = &task->relname; if (strlen(NameStr(task->indname)) > 0) cl_index = &task->indname; if (strlen(NameStr(task->tbspname)) > 0) rel_tbsp = &task->tbspname; /* * Use the tablespace mapping array, if one was passed. */ arr_size = VARSIZE(task->ind_tbsps); if (arr_size > 0) { Assert(arr_size <= IND_TABLESPACES_ARRAY_SIZE); ind_tbsps = (ArrayType *) task->ind_tbsps; } /* Now process the task. */ ereport(DEBUG1, (errmsg("task for table %s.%s is ready for processing", NameStr(*relschema), NameStr(*relname)))); /* Retrieve the relid of the table. */ StartTransactionCommand(); relrv = makeRangeVar(NameStr(*relschema), NameStr(*relname), -1); rel = table_openrv(relrv, AccessShareLock); relid = RelationGetRelid(rel); table_close(rel, AccessShareLock); CommitTransactionCommand(); /* Declare that this worker takes care of the relation. */ SpinLockAcquire(&MyWorkerSlot->mutex); Assert(MyWorkerSlot->dbid == MyDatabaseId); MyWorkerSlot->relid = relid; MemSet(&MyWorkerSlot->progress, 0, sizeof(WorkerProgress)); SpinLockRelease(&MyWorkerSlot->mutex); /* * The session origin will be used to mark WAL records produced by the * pg_squeeze extension itself so that they can be skipped easily during * decoding. (We avoid the decoding for performance reasons. Even if those * changes were decoded, our output plugin should not apply them because * squeeze_table_impl() exits before its transaction commits.)
* * The origin needs to be created in a separate transaction because other * workers, waiting for a unique origin id, need to wait for this * transaction to complete. If we called both replorigin_create() and * squeeze_table_impl() in the same transaction, the calls of * squeeze_table_impl() would effectively get serialized. * * Errors are not caught here. If an operation as trivial as this fails, * exiting the worker is the appropriate action. */ manage_session_origin(relid); /* Perform the actual work. */ SetCurrentStatementStartTimestamp(); StartTransactionCommand(); start_ts = GetCurrentStatementStartTimestamp(); success = squeeze_table_impl(relschema, relname, cl_index, rel_tbsp, ind_tbsps, &edata, task_cxt); if (success) { CommitTransactionCommand(); /* * Now that the transaction is committed, we can run a new one to * drop the origin. */ Assert(replorigin_session_origin != InvalidRepOriginId); manage_session_origin(InvalidOid); } else { /* * The transaction should be aborted by squeeze_table_impl(). */ squeeze_handle_error_app(edata, task); } /* Insert an entry into the "squeeze.log" table. */ if (success) { Oid outfunc; bool isvarlena; FmgrInfo fmgrinfo; char *start_ts_str; StringInfoData query; MemoryContext oldcxt; initStringInfo(&query); StartTransactionCommand(); getTypeOutputInfo(TIMESTAMPTZOID, &outfunc, &isvarlena); fmgr_info(outfunc, &fmgrinfo); start_ts_str = OutputFunctionCall(&fmgrinfo, TimestampTzGetDatum(start_ts)); /* Make sure the string survives TopTransactionContext. */ oldcxt = MemoryContextSwitchTo(task_cxt); start_ts_str = pstrdup(start_ts_str); MemoryContextSwitchTo(oldcxt); CommitTransactionCommand(); resetStringInfo(&query); /* * No one should change the progress fields now, so we can access * them w/o the spinlock below. */ appendStringInfo(&query, "INSERT INTO squeeze.log(tabschema, tabname, started, finished, ins_initial, ins, upd, del) \ VALUES ('%s', '%s', '%s', clock_timestamp(), %ld, %ld, %ld, %ld)", NameStr(*relschema), NameStr(*relname), start_ts_str, MyWorkerSlot->progress.ins_initial, MyWorkerSlot->progress.ins, MyWorkerSlot->progress.upd, MyWorkerSlot->progress.del); run_command(query.data, SPI_OK_INSERT); if (task->task_id >= 0) { /* Finalize the task if it was a scheduled one. */ resetStringInfo(&query); appendStringInfo(&query, "SELECT squeeze.finalize_task(%d)", task->task_id); run_command(query.data, SPI_OK_SELECT); if (!task->skip_analyze) { /* * Analyze the new table, unless the user rejects it * explicitly. * * XXX Besides updating planner statistics in general, * this sets pg_class(relallvisible) to 0, so that the * planner is not too optimistic about this figure. The * preferable solution would be to run (lazy) VACUUM * (with the ANALYZE option) to initialize the visibility * map. However, to make the effort worthwhile, we * shouldn't do it until all transactions can see all the * changes done by the squeeze_table() function. What's * the most suitable way to wait? Asynchronous execution * of the VACUUM is probably needed in any case. */ resetStringInfo(&query); appendStringInfo(&query, "ANALYZE %s.%s", NameStr(*relschema), NameStr(*relname)); run_command(query.data, SPI_OK_UTILITY); } } } /* Clear the relid field of this worker's slot. */ SpinLockAcquire(&MyWorkerSlot->mutex); MyWorkerSlot->relid = InvalidOid; MemSet(&MyWorkerSlot->progress, 0, sizeof(WorkerProgress)); SpinLockRelease(&MyWorkerSlot->mutex); } /* * Handle an error from the perspective of pg_squeeze * * Here we are especially interested in errors like incorrect user input * (e.g.
non-existing table specified) or expiration of the * squeeze_max_xlock_time parameter. If the squeezing succeeded, the following * operations should succeed too, unless there's a bug in the extension - in * such a case it's o.k. to let the ERROR stop the worker. */ static void squeeze_handle_error_app(ErrorData *edata, WorkerTask *task) { StringInfoData query; initStringInfo(&query); appendStringInfo(&query, "INSERT INTO squeeze.errors(tabschema, tabname, sql_state, err_msg, err_detail) \ VALUES ('%s', '%s', '%s', %s, %s)", NameStr(task->relschema), NameStr(task->relname), unpack_sql_state(edata->sqlerrcode), quote_literal_cstr(edata->message), edata->detail ? quote_literal_cstr(edata->detail) : "''"); run_command(query.data, SPI_OK_INSERT); if (task->task_id >= 0) { /* If the active task failed too many times, cancel it. */ resetStringInfo(&query); if (task->last_try) { appendStringInfo(&query, "SELECT squeeze.cancel_task(%d)", task->task_id); run_command(query.data, SPI_OK_SELECT); } else { /* Account for the current attempt. */ appendStringInfo(&query, "UPDATE squeeze.tasks SET tried = tried + 1 WHERE id = %d", task->task_id); run_command(query.data, SPI_OK_UPDATE); } /* Clear the relid field of this worker's slot. */ SpinLockAcquire(&MyWorkerSlot->mutex); MyWorkerSlot->relid = InvalidOid; MemSet(&MyWorkerSlot->progress, 0, sizeof(WorkerProgress)); SpinLockRelease(&MyWorkerSlot->mutex); } } static void interrupt_worker(WorkerTask *task) { SpinLockAcquire(&task->mutex); /* Do not set the flag if the worker already exited on its own. */ if (task->worker_state != WTS_UNUSED) task->exit_requested = true; SpinLockRelease(&task->mutex); } static void clear_task(WorkerTask *task) { task->worker_state = WTS_UNUSED; task->exit_requested = false; task->dbid = InvalidOid; NameStr(task->relschema)[0] = '\0'; NameStr(task->relname)[0] = '\0'; NameStr(task->indname)[0] = '\0'; NameStr(task->tbspname)[0] = '\0'; task->max_xlock_time = 0; task->task_id = -1; task->last_try = false; task->skip_analyze = false; memset(task->ind_tbsps, 0, sizeof(task->ind_tbsps)); NameStr(task->repl_slot.name)[0] = '\0'; task->repl_slot.confirmed_flush = InvalidXLogRecPtr; task->repl_slot.snap_handle = DSM_HANDLE_INVALID; task->repl_slot.snap_seg = NULL; task->repl_slot.snap_private = NULL; task->error_msg[0] = '\0'; } /* * The squeeze worker should call this before exiting. */ static void release_task(WorkerTask *task) { SpinLockAcquire(&task->mutex); task->worker_state = WTS_UNUSED; Assert(task == MyWorkerTask || MyWorkerTask == NULL); /* * The "standalone" worker might have used its private memory for the * snapshot. */ if (task->repl_slot.snap_private) { Assert(am_i_standalone); /* * Do not call pfree() while holding a spinlock. The worker should * only process a single task anyway, so it's not really a leak. */ task->repl_slot.snap_private = NULL; } /* * Do not care about detaching from the shared memory: * setup_decoding() runs in a transaction, so the resource owner of * that transaction will take care of it. */ MyWorkerTask = NULL; /* Let others see the WTS_UNUSED state. */ SpinLockRelease(&task->mutex); } /* * Run an SQL command via SPI. * * 'rc' is the expected SPI return code. * * The return value tells how many tuples the query returned (zero for * commands that do not return tuples).
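* * The command is executed in a transaction of its own, which is committed before this function returns.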
*/ static uint64 run_command(char *command, int rc) { int ret; uint64 ntup = 0; SetCurrentStatementStartTimestamp(); StartTransactionCommand(); SPI_connect(); PushActiveSnapshot(GetTransactionSnapshot()); pgstat_report_activity(STATE_RUNNING, command); ret = SPI_execute(command, false, 0); pgstat_report_activity(STATE_IDLE, NULL); if (ret != rc) elog(ERROR, "command failed: %s", command); if (rc == SPI_OK_SELECT || rc == SPI_OK_INSERT_RETURNING || rc == SPI_OK_DELETE_RETURNING || rc == SPI_OK_UPDATE_RETURNING) { #if PG_VERSION_NUM >= 130000 ntup = SPI_tuptable->numvals; #else ntup = SPI_processed; #endif } SPI_finish(); PopActiveSnapshot(); CommitTransactionCommand(); pgstat_report_stat(false); return ntup; } #define ACTIVE_WORKERS_RES_ATTRS 7 /* Get information on squeeze workers on the current database. */ PG_FUNCTION_INFO_V1(squeeze_get_active_workers); Datum squeeze_get_active_workers(PG_FUNCTION_ARGS) { WorkerSlot *slots, *dst; int i, nslots = 0; #if PG_VERSION_NUM >= 150000 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; InitMaterializedSRF(fcinfo, 0); #else FuncCallContext *funcctx; int call_cntr, max_calls; HeapTuple *tuples; #endif /* * Copy the slots information so that we don't have to keep the slot array * locked for longer than necessary. */ slots = (WorkerSlot *) palloc(workerData->nslots * sizeof(WorkerSlot)); dst = slots; LWLockAcquire(workerData->lock, LW_SHARED); for (i = 0; i < workerData->nslots; i++) { WorkerSlot *slot = &workerData->slots[i]; if (!slot->scheduler && slot->pid != InvalidPid && slot->dbid == MyDatabaseId) { memcpy(dst, slot, sizeof(WorkerSlot)); dst++; nslots++; } } LWLockRelease(workerData->lock); #if PG_VERSION_NUM >= 150000 for (i = 0; i < nslots; i++) { WorkerSlot *slot = &slots[i]; WorkerProgress *progress = &slot->progress; Datum values[ACTIVE_WORKERS_RES_ATTRS]; bool isnull[ACTIVE_WORKERS_RES_ATTRS]; char *relnspc = NULL; char *relname = NULL; NameData tabname, tabschema; memset(isnull, false, ACTIVE_WORKERS_RES_ATTRS * sizeof(bool)); values[0] = Int32GetDatum(slot->pid); if (OidIsValid(slot->relid)) { Oid nspid; /* * It's possible that processing of the relation has finished and * the relation (or even the namespace) was dropped. Therefore, * stop catalog lookups as soon as any object is missing. XXX * Furthermore, the relid can already be in use by another * relation, but that's very unlikely, not worth special effort. */ nspid = get_rel_namespace(slot->relid); if (OidIsValid(nspid)) relnspc = get_namespace_name(nspid); if (relnspc) relname = get_rel_name(slot->relid); } if (relnspc == NULL || relname == NULL) continue; namestrcpy(&tabschema, relnspc); values[1] = NameGetDatum(&tabschema); namestrcpy(&tabname, relname); values[2] = NameGetDatum(&tabname); values[3] = Int64GetDatum(progress->ins_initial); values[4] = Int64GetDatum(progress->ins); values[5] = Int64GetDatum(progress->upd); values[6] = Int64GetDatum(progress->del); tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, isnull); } return (Datum) 0; #else /* Less trivial implementation, to be removed when PG 14 is EOL. */ if (SRF_IS_FIRSTCALL()) { MemoryContext oldcontext; TupleDesc tupdesc; int ntuples = 0; funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("function returning record called in context " "that cannot accept type record"))); /* XXX Is this necessary?
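* (attinmeta is only needed to build tuples from C strings via BuildTupleFromCStrings(); the tuples below are built with heap_form_tuple() instead, so this is probably redundant.)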
*/ funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc); /* Process only the slots that we really can display. */ tuples = (HeapTuple *) palloc0(nslots * sizeof(HeapTuple)); for (i = 0; i < nslots; i++) { WorkerSlot *slot = &slots[i]; WorkerProgress *progress = &slot->progress; char *relnspc = NULL; char *relname = NULL; NameData tabname, tabschema; Datum *values; bool *isnull; values = (Datum *) palloc(ACTIVE_WORKERS_RES_ATTRS * sizeof(Datum)); isnull = (bool *) palloc0(ACTIVE_WORKERS_RES_ATTRS * sizeof(bool)); if (OidIsValid(slot->relid)) { Oid nspid; /* See the PG 15 implementation above. */ nspid = get_rel_namespace(slot->relid); if (OidIsValid(nspid)) relnspc = get_namespace_name(nspid); if (relnspc) relname = get_rel_name(slot->relid); } if (relnspc == NULL || relname == NULL) continue; values[0] = Int32GetDatum(slot->pid); namestrcpy(&tabschema, relnspc); values[1] = NameGetDatum(&tabschema); namestrcpy(&tabname, relname); values[2] = NameGetDatum(&tabname); values[3] = Int64GetDatum(progress->ins_initial); values[4] = Int64GetDatum(progress->ins); values[5] = Int64GetDatum(progress->upd); values[6] = Int64GetDatum(progress->del); tuples[ntuples++] = heap_form_tuple(tupdesc, values, isnull); } funcctx->user_fctx = tuples; funcctx->max_calls = ntuples; MemoryContextSwitchTo(oldcontext); } funcctx = SRF_PERCALL_SETUP(); call_cntr = funcctx->call_cntr; max_calls = funcctx->max_calls; tuples = (HeapTuple *) funcctx->user_fctx; if (call_cntr < max_calls) { HeapTuple tuple = tuples[call_cntr]; Datum result; result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } else SRF_RETURN_DONE(funcctx); #endif }