pg_failover_slots-1.0.1/.clang-format

# -*- yaml -*-
# git ls-files -i -x '*.[ch]' | xargs clang-format -i
---
Language: Cpp
# BasedOnStyle: LLVM

# true would be better here. but it's bugged in combination with
# "PointerAlignment: Right" which we also use as is more important
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Right
AllowShortFunctionsOnASingleLine: None
AlwaysBreakAfterDefinitionReturnType: true
BreakBeforeBraces: Allman
BreakBeforeTernaryOperators: false
BreakConstructorInitializersBeforeComma: true
BreakStringLiterals: false
ColumnLimit: 79
ForEachMacros:
  - foreach
  - forboth
  - dlist_foreach
  - dlist_foreach_modify
  - slist_foreach
  - slist_foreach_modify
IncludeBlocks: Preserve
IncludeCategories:
  # c.h and postgres.h should be first
  - Regex: '.*'
    Priority: 1
  - Regex: '^'
    Priority: -1
  - Regex: '^'
    Priority: -1
IndentCaseLabels: true
IndentWidth: 4
MacroBlockBegin: "PG_TRY();|PG_CATCH();"
MacroBlockEnd: "PG_END_TRY();"
MaxEmptyLinesToKeep: 3
PointerAlignment: Right
SpaceAfterCStyleCast: true
TabWidth: 4
UseTab: Always
...

pg_failover_slots-1.0.1/.editorconfig

root = true

[*.{c,h,pl,pm}]
indent_style = tab
indent_size = tab
tab_width = 4

[*.{sql,md,yml}]
indent_style = space
indent_size = 2

pg_failover_slots-1.0.1/.gitignore

tmp_check*/
*~
*.swo
*.swp
*.o
*.so
*.dylib
*.gcov
*.gcov.out
*.gcda
*.gcno
*.bc
.DS_Store

pg_failover_slots-1.0.1/LICENSE

Postgres Failover Slots (pg_failover_slots)

Copyright (c) 2023, EnterpriseDB Corporation.

Permission to use, copy, modify, and distribute this software and its
documentation for any purpose, without fee, and without a written agreement is
hereby granted, provided that the above copyright notice and this paragraph
and the following two paragraphs appear in all copies.

IN NO EVENT SHALL ENTERPRISEDB CORPORATION BE LIABLE TO ANY PARTY FOR DIRECT,
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST
PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
IF ENTERPRISEDB CORPORATION HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.

ENTERPRISEDB CORPORATION SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT
NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS,
AND ENTERPRISEDB CORPORATION HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE,
SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
pg_failover_slots-1.0.1/Makefile

MODULE_big = pg_failover_slots
OBJS = pg_failover_slots.o
PG_CPPFLAGS += -I $(libpq_srcdir)
SHLIB_LINK += $(libpq)

TAP_TESTS = 1

PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)

prove_installcheck: $(pgxsdir)/src/test/perl/$(core_perl_module)
	rm -rf $(CURDIR)/tmp_check
	mkdir -p $(CURDIR)/tmp_check &&\
	PERL5LIB="$${PERL5LIB}:$(srcdir)/t:$(pgxsdir)/src/test/perl" \
	PG_VERSION_NUM='$(VERSION_NUM)' \
	TESTDIR='$(CURDIR)' \
	SRCDIR='$(srcdir)' \
	PATH="$(TEST_PATH_PREFIX):$(PATH)" \
	PGPORT='6$(DEF_PGPORT)' \
	top_builddir='$(CURDIR)/tmp_check' \
	PG_REGRESS='$(pgxsdir)/src/test/regress/pg_regress' \
	PGCTLTIMEOUT=180 \
	$(PROVE) $(PG_PROVE_FLAGS) $(PROVE_FLAGS) \
	$(addprefix $(srcdir)/,$(or $(PROVE_TESTS),t/*.pl))

check_prove: prove_installcheck

pg_failover_slots-1.0.1/README.md

# pg_failover_slots

PG Failover Slots is for anyone with logical replication slots on PostgreSQL
databases that are also part of a physical streaming replication architecture.

Since logical replication slots are only maintained on the primary node,
downstream subscribers don't receive any new changes from a newly promoted
primary until the slot is created there. Creating it after promotion is
unsafe, because the slot state recording which data a subscriber has confirmed
receiving, and which log data still needs to be retained for the subscriber,
will have been lost, resulting in an unknown gap in data changes.

PG Failover Slots makes logical replication slots usable across a physical
failover using the following features:

- Copies any missing replication slots from the primary to the standby
- Removes any slots from the standby that aren't found on the primary
- Periodically synchronizes the position of slots on the standby based on the
  primary
- Ensures that selected standbys receive data before any of the logical slot
  walsenders can send data to consumers

PostgreSQL 11 or higher is required.

## How to check the standby is ready

The slots are not synchronized to the standby immediately, for consistency
reasons. At the time the pg_failover_slots extension is activated, the standby
can be too far behind the logical slots, or too far ahead of the logical slots
on the primary, so the extension verifies each slot and only synchronizes it
when that is actually safe.

This, however, brings a need to verify that the slots are synchronized and
that the standby is actually ready to be a failover target with consistent
logical decoding for all slots. This only needs to be done initially; once the
slots are synchronized the first time, they will always be consistent as long
as the extension is active in the cluster.

The check for whether slots are fully synchronized with the primary is
relatively simple: the slots just need to be present in the
`pg_replication_slots` view on the standby and have an `active` state of
`false`. An `active` state of `true` means the slot is still being
initialized.

For example, consider the following psql session:

```psql
# SELECT slot_name, active FROM pg_replication_slots WHERE slot_type = 'logical';
     slot_name    | active
------------------+--------
 regression_slot1 | f
 regression_slot2 | f
 regression_slot3 | t
```

This means that slots `regression_slot1` and `regression_slot2` are
synchronized from the primary to the standby, while `regression_slot3` is
still being synchronized. If failover happens at this stage,
`regression_slot3` will be lost.

Now let's wait a little and query again:

```psql
# SELECT slot_name, active FROM pg_replication_slots WHERE slot_type = 'logical';
     slot_name    | active
------------------+--------
 regression_slot1 | f
 regression_slot2 | f
 regression_slot3 | f
```

Now all three slots are synchronized and the standby can be used for failover
without losing logical decoding state for any of them.
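If you want to script this readiness check rather than read the view by hand,
a single boolean query on the standby is enough. A minimal sketch based on the
check described above (the `standby_ready` alias is just an illustrative
name, and it assumes the slots you expect have already appeared in the view):

```sql
-- Ready once no logical slot is still being initialized (active = true).
SELECT count(*) = 0 AS standby_ready
FROM pg_replication_slots
WHERE slot_type = 'logical'
  AND active;
```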
## Prerequisite settings

The extension throws hard errors if the following settings are not adjusted:

- `hot_standby_feedback` should be `on`
- `primary_slot_name` should be non-empty

These are necessary because the standby connects to the primary over a
physical replication slot and must be able to send its xmin and catalog_xmin
separately over hot_standby_feedback.

## Configuration options

The extension itself must be added to `shared_preload_libraries` on both the
primary instance as well as any standby that is used for high availability
(failover or switchover) purposes.

The behavior of pg_failover_slots is configurable using these configuration
options (set in `postgresql.conf`).

### pg_failover_slots.synchronize_slot_names

This standby option allows setting which logical slots should be synchronized
to this physical standby. It's a comma-separated list of slot filters.

A slot filter is defined as a `key:value` pair (separated by a colon) where
`key` can be one of:

- `name` - matches the slot name exactly
- `name_like` - matches the slot name against an SQL `LIKE` expression
- `plugin` - matches the slot's plugin name against the value

The `key` can be omitted and will default to `name` in that case.

For example, `'my_slot_name,plugin:test_decoding'` will synchronize the slot
named "my_slot_name" and any slots that use the test_decoding plugin.

If this is set to an empty string, no slots will be synchronized to this
physical standby.

The default value is `'name_like:%'`, which means all logical replication
slots will be synchronized.

### pg_failover_slots.drop_extra_slots

This standby option controls what happens to extra slots on the standby that
are not found on the primary using the
`pg_failover_slots.synchronize_slot_names` filter. If it's set to true (the
default), they will be dropped; otherwise they will be kept.

### pg_failover_slots.primary_dsn

A standby option for specifying the connection string to use to connect to
the primary when fetching slot information.

If empty (the default), the connection string from `primary_conninfo` is used.
Note that `primary_conninfo` cannot be used if its connection string contains
a `password` field, because that field gets obfuscated by PostgreSQL and
pg_failover_slots can't actually see the password. In this case,
`pg_failover_slots.primary_dsn` must be configured.
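The standby-side options above are declared as reloadable (`SIGHUP`) GUCs in
the source, so they can be changed without a restart. A hypothetical
standby-side setup, where the slot pattern, plugin, and host names are made-up
examples rather than defaults, might look like this:

```sql
-- Sync only slots named like 'my_app_%' plus any test_decoding slots,
-- and fetch slot info over an explicit DSN instead of primary_conninfo.
-- All names here are illustrative.
ALTER SYSTEM SET pg_failover_slots.synchronize_slot_names =
    'name_like:my_app_%,plugin:test_decoding';
ALTER SYSTEM SET pg_failover_slots.primary_dsn =
    'host=primary.example.com user=postgres dbname=postgres';
SELECT pg_reload_conf();
```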
### pg_failover_slots.standby_slot_names

This option is typically used in failover configurations to ensure that the
failover-candidate streaming physical replica(s) have received and flushed
all changes before they ever become visible to any subscribers. That
guarantees that a commit cannot vanish on failover to a standby for the
consumer of a logical slot.

Replication slots whose names are listed in the comma-separated
`pg_failover_slots.standby_slot_names` list are treated specially by the
walsender on the primary.

Logical replication walsenders will ensure that all local changes are sent
and flushed to the replication slots in
`pg_failover_slots.standby_slot_names` before the walsender sends those
changes for the logical replication slots.

Effectively, it provides a synchronous replication barrier between the named
list of slots and all the consumers of logically decoded streams from the
walsender.

Any replication slot may be listed in
`pg_failover_slots.standby_slot_names`; both logical and physical slots work,
but it's generally used for physical slots.

Without this safeguard, two anomalies are possible where a commit can be
received by a subscriber and then vanish from the provider on failover
because the failover candidate hadn't received it yet:

* For 1+ subscribers, the subscriber may have applied the change but the new
  provider may execute new transactions that conflict with the received
  change, as it never happened as far as the provider is concerned; and/or

* For 2+ subscribers, at the time of failover not all subscribers may have
  applied the change. The subscribers now have inconsistent and
  irreconcilable states because the subscribers that didn't receive the
  commit have no way to get it now.

Setting `pg_failover_slots.standby_slot_names` will (by design) cause
subscribers to lag behind the provider if the provider's failover-candidate
replica(s) are not keeping up. Monitoring is thus essential.

### pg_failover_slots.standby_slots_min_confirmed

Controls how many of the slots named in
`pg_failover_slots.standby_slot_names` have to confirm a position before data
up to that position is sent through the logical replication slots. Setting -1
(the default) means to wait for all entries in
`pg_failover_slots.standby_slot_names`.

## Release notes

### v1.0.1

Version 1.0.1 fixes several compatibility bugs.

- Fix support for PG13 and older

  The missing interfaces caused either disconnects or outright crashes on
  PG13 and older.

- Test compatibility improvements

  Tests now work on PG11, and are more resilient to testing on slower
  machines.

- PG16 compatibility improvements

- Various minor cleanups

pg_failover_slots-1.0.1/pg_failover_slots.c

/*
 * Postgres Failover Slots (pg_failover_slots)
 *
 * Copyright (c) 2023, EnterpriseDB Corporation.
*/ #include "postgres.h" #include #include #include #include #include "funcapi.h" #include "miscadmin.h" #include "pgstat.h" #include "access/genam.h" #if PG_VERSION_NUM >= 120000 #include "access/table.h" #else #include "access/heapam.h" #define table_open heap_open #define table_close heap_close #endif #include "access/xact.h" #if PG_VERSION_NUM >= 150000 #include "access/xlogrecovery.h" #endif #include "catalog/indexing.h" #include "catalog/pg_database.h" #include "postmaster/bgworker.h" #if PG_VERSION_NUM >= 130000 #include "postmaster/interrupt.h" #endif #include "replication/decode.h" #include "replication/logical.h" #include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/ipc.h" #include "storage/procarray.h" #include "tcop/tcopprot.h" #include "utils/builtins.h" #include "utils/fmgroids.h" #include "utils/fmgrprotos.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/pg_lsn.h" #include "utils/resowner.h" #include "utils/snapmgr.h" #include "utils/varlena.h" #include "libpq-fe.h" #include "libpq/auth.h" #include "libpq/libpq.h" #define PG_FAILOVER_SLOTS_VERSION "1.0.1" PG_MODULE_MAGIC; #if PG_VERSION_NUM < 130000 #define SignalHandlerForConfigReload PostgresSigHupHandler #define GetWalRcvFlushRecPtr GetWalRcvWriteRecPtr #endif #define EXTENSION_NAME "pg_failover_slots" #define WORKER_NAP_TIME 60000L #define WORKER_WAIT_FEEDBACK 10000L typedef struct RemoteSlot { char *name; char *plugin; char *database; bool two_phase; XLogRecPtr restart_lsn; XLogRecPtr confirmed_lsn; TransactionId catalog_xmin; } RemoteSlot; typedef enum FailoverSlotFilterKey { FAILOVERSLOT_FILTER_NAME = 1, FAILOVERSLOT_FILTER_NAME_LIKE, FAILOVERSLOT_FILTER_PLUGIN } FailoverSlotFilterKey; typedef struct FailoverSlotFilter { FailoverSlotFilterKey key; char *val; /* eg: test_decoding */ } FailoverSlotFilter; /* Used for physical-before-logical ordering */ static char *standby_slot_names_raw; static char *standby_slot_names_string = NULL; List *standby_slot_names = NIL; int standby_slots_min_confirmed; XLogRecPtr standby_slot_names_oldest_flush_lsn = InvalidXLogRecPtr; /* Slots to sync */ char *pg_failover_slots_dsn; char *pg_failover_slot_names; static char *pg_failover_slot_names_str = NULL; static List *pg_failover_slot_names_list = NIL; static bool pg_failover_slots_drop = true; char *pg_failover_slots_version_str; void _PG_init(void); PGDLLEXPORT void pg_failover_slots_main(Datum main_arg); static bool check_failover_slot_names(char **newval, void **extra, GucSource source) { List *namelist = NIL; char *rawname = pstrdup(*newval); bool valid; valid = SplitIdentifierString(rawname, ',', &namelist); if (!valid) GUC_check_errdetail("List syntax is invalid."); pfree(rawname); list_free(namelist); return valid; } static void assign_failover_slot_names(const char *newval, void *extra) { MemoryContext old_ctx; List *slot_names_list = NIL; ListCell *lc; /* cleanup memory to prevent leaking or SET/config reload */ if (pg_failover_slot_names_str) pfree(pg_failover_slot_names_str); if (pg_failover_slot_names_list) { foreach (lc, pg_failover_slot_names_list) { FailoverSlotFilter *filter = lfirst(lc); /* val was pointer to pg_failover_slot_names_str */ pfree(filter); } list_free(pg_failover_slot_names_list); } pg_failover_slot_names_list = NIL; /* Allocate memory in long lasting context. 
*/ old_ctx = MemoryContextSwitchTo(TopMemoryContext); pg_failover_slot_names_str = pstrdup(newval); SplitIdentifierString(pg_failover_slot_names_str, ',', &slot_names_list); foreach (lc, slot_names_list) { char *raw_val = lfirst(lc); char *key = strtok(raw_val, ":"); FailoverSlotFilter *filter = palloc(sizeof(FailoverSlotFilter)); filter->val = strtok(NULL, ":"); /* Default key is name */ if (!filter->val) { filter->val = key; filter->key = FAILOVERSLOT_FILTER_NAME; } else if (strcmp(key, "name") == 0) filter->key = FAILOVERSLOT_FILTER_NAME; else if (strcmp(key, "name_like") == 0) filter->key = FAILOVERSLOT_FILTER_NAME_LIKE; else if (strcmp(key, "plugin") == 0) filter->key = FAILOVERSLOT_FILTER_PLUGIN; else ereport( ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg( "unrecognized synchronize_failover_slot_names key \"%s\"", key))); /* Check that there was just one ':' */ if (strtok(NULL, ":")) ereport( ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg( "unrecognized synchronize_failover_slot_names format"))); pg_failover_slot_names_list = lappend(pg_failover_slot_names_list, filter); } /* Clean the temporary list, but not the contents. */ list_free(slot_names_list); MemoryContextSwitchTo(old_ctx); } static bool check_standby_slot_names(char **newval, void **extra, GucSource source) { List *namelist = NIL; char *rawname = pstrdup(*newval); bool valid; valid = SplitIdentifierString(rawname, ',', &namelist); if (!valid) GUC_check_errdetail("List syntax is invalid."); pfree(rawname); list_free(namelist); return valid; } static void assign_standby_slot_names(const char *newval, void *extra) { MemoryContext old_ctx; if (standby_slot_names_string) pfree(standby_slot_names_string); if (standby_slot_names) list_free(standby_slot_names); /* * We must invalidate our idea of the oldest lsn in all the named slots if * we might have changed the list. 
*/ standby_slot_names_oldest_flush_lsn = InvalidXLogRecPtr; old_ctx = MemoryContextSwitchTo(TopMemoryContext); standby_slot_names_string = pstrdup(newval); (void) SplitIdentifierString(standby_slot_names_string, ',', &standby_slot_names); (void) MemoryContextSwitchTo(old_ctx); } /* * Get failover slots from upstream */ static List * remote_get_primary_slot_info(PGconn *conn, List *slot_filter) { PGresult *res; int i; char *op = ""; List *slots = NIL; ListCell *lc; StringInfoData query; initStringInfo(&query); if (PQserverVersion(conn) >= 140000) { appendStringInfoString( &query, "SELECT slot_name, plugin, database, two_phase, catalog_xmin, restart_lsn, confirmed_flush_lsn" " FROM pg_catalog.pg_replication_slots" " WHERE database IS NOT NULL AND ("); } else { appendStringInfoString( &query, "SELECT slot_name, plugin, database, false AS two_phase, catalog_xmin, restart_lsn, confirmed_flush_lsn" " FROM pg_catalog.pg_replication_slots" " WHERE database IS NOT NULL AND ("); } foreach (lc, slot_filter) { FailoverSlotFilter *filter = lfirst(lc); switch (filter->key) { case FAILOVERSLOT_FILTER_NAME: appendStringInfo( &query, " %s slot_name OPERATOR(pg_catalog.=) %s", op, PQescapeLiteral(conn, filter->val, strlen(filter->val))); break; case FAILOVERSLOT_FILTER_NAME_LIKE: appendStringInfo( &query, " %s slot_name LIKE %s", op, PQescapeLiteral(conn, filter->val, strlen(filter->val))); break; case FAILOVERSLOT_FILTER_PLUGIN: appendStringInfo( &query, " %s plugin OPERATOR(pg_catalog.=) %s", op, PQescapeLiteral(conn, filter->val, strlen(filter->val))); break; default: Assert(0); elog(ERROR, "unrecognized slot filter key %u", filter->key); } op = "OR"; } appendStringInfoString(&query, ")"); res = PQexec(conn, query.data); pfree(query.data); if (PQresultStatus(res) != PGRES_TUPLES_OK) elog(ERROR, "could not fetch slot information from provider: %s\n", res != NULL ? PQresultErrorMessage(res) : PQerrorMessage(conn)); for (i = 0; i < PQntuples(res); i++) { RemoteSlot *slot = palloc0(sizeof(RemoteSlot)); slot->name = pstrdup(PQgetvalue(res, i, 0)); slot->plugin = pstrdup(PQgetvalue(res, i, 1)); slot->database = pstrdup(PQgetvalue(res, i, 2)); parse_bool(PQgetvalue(res, i, 3), &slot->two_phase); slot->catalog_xmin = !PQgetisnull(res, i, 4) ? atoi(PQgetvalue(res, i, 4)) : InvalidTransactionId; slot->restart_lsn = !PQgetisnull(res, i, 5) ? DatumGetLSN(DirectFunctionCall1( pg_lsn_in, CStringGetDatum(PQgetvalue(res, i, 5)))) : InvalidXLogRecPtr; slot->confirmed_lsn = !PQgetisnull(res, i, 6) ? DatumGetLSN(DirectFunctionCall1( pg_lsn_in, CStringGetDatum(PQgetvalue(res, i, 6)))) : InvalidXLogRecPtr; slots = lappend(slots, slot); } PQclear(res); return slots; } static XLogRecPtr remote_get_physical_slot_lsn(PGconn *conn, const char *slot_name) { PGresult *res; XLogRecPtr lsn; StringInfoData query; initStringInfo(&query); appendStringInfo(&query, "SELECT restart_lsn" " FROM pg_catalog.pg_replication_slots" " WHERE slot_name OPERATOR(pg_catalog.=) %s", PQescapeLiteral(conn, slot_name, strlen(slot_name))); res = PQexec(conn, query.data); if (PQresultStatus(res) != PGRES_TUPLES_OK) elog(ERROR, "could not fetch slot information from provider: %s\n", res != NULL ? 
PQresultErrorMessage(res) : PQerrorMessage(conn)); if (PQntuples(res) != 1) elog(ERROR, "physical slot %s not found on primary", slot_name); if (PQgetisnull(res, 0, 0)) lsn = InvalidXLogRecPtr; else lsn = DatumGetLSN(DirectFunctionCall1( pg_lsn_in, CStringGetDatum(PQgetvalue(res, 0, 0)))); PQclear(res); return lsn; } /* * Can't use get_database_oid from dbcommands.c because it does not work * without db connection. */ static Oid get_database_oid(const char *dbname) { HeapTuple tuple; Relation relation; SysScanDesc scan; ScanKeyData key[1]; Oid dboid = InvalidOid; /* * form a scan key */ ScanKeyInit(&key[0], Anum_pg_database_datname, BTEqualStrategyNumber, F_NAMEEQ, CStringGetDatum(dbname)); /* * Open pg_database and fetch a tuple. Force heap scan if we haven't yet * built the critical shared relcache entries (i.e., we're starting up * without a shared relcache cache file). */ relation = table_open(DatabaseRelationId, AccessShareLock); scan = systable_beginscan(relation, DatabaseNameIndexId, criticalSharedRelcachesBuilt, NULL, 1, key); tuple = systable_getnext(scan); /* Must copy tuple before releasing buffer */ if (HeapTupleIsValid(tuple)) #if PG_VERSION_NUM < 120000 dboid = HeapTupleGetOid(tuple); #else { Form_pg_database datForm = (Form_pg_database) GETSTRUCT(tuple); dboid = datForm->oid; } #endif else ereport(ERROR, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", dbname))); /* all done */ systable_endscan(scan); table_close(relation, AccessShareLock); return dboid; } /* * Fill connection string info based on config. * * This is slightly complicated because we default to primary_conninfo if * user didn't explicitly set anything and we might need to request explicit * database name override, that's why we need dedicated function for this. */ static void make_sync_failover_slots_dsn(StringInfo connstr, char *db_name) { if (pg_failover_slots_dsn && strlen(pg_failover_slots_dsn) > 0) { if (db_name) appendStringInfo(connstr, "%s dbname=%s", pg_failover_slots_dsn, db_name); else appendStringInfoString(connstr, pg_failover_slots_dsn); } else { Assert(WalRcv); appendStringInfo(connstr, "%s dbname=%s", WalRcv->conninfo, db_name ? db_name : "postgres"); } } /* * Connect to remote pg server */ static PGconn * remote_connect(const char *connstr, const char *appname) { #define CONN_PARAM_ARRAY_SIZE 8 int i = 0; PGconn *conn; const char *keys[CONN_PARAM_ARRAY_SIZE]; const char *vals[CONN_PARAM_ARRAY_SIZE]; StringInfoData s; initStringInfo(&s); appendStringInfoString(&s, connstr); keys[i] = "dbname"; vals[i] = connstr; i++; keys[i] = "application_name"; vals[i] = appname; i++; keys[i] = "connect_timeout"; vals[i] = "30"; i++; keys[i] = "keepalives"; vals[i] = "1"; i++; keys[i] = "keepalives_idle"; vals[i] = "20"; i++; keys[i] = "keepalives_interval"; vals[i] = "20"; i++; keys[i] = "keepalives_count"; vals[i] = "5"; i++; keys[i] = NULL; vals[i] = NULL; Assert(i <= CONN_PARAM_ARRAY_SIZE); /* * We use the expand_dbname parameter to process the connection string * (or URI), and pass some extra options. */ conn = PQconnectdbParams(keys, vals, /* expand_dbname = */ true); if (PQstatus(conn) != CONNECTION_OK) { ereport(ERROR, (errmsg("could not connect to the postgresql server: %s", PQerrorMessage(conn)), errdetail("dsn was: %s", s.data))); } resetStringInfo(&s); elog(DEBUG2, "established connection to remote backend with pid %d", PQbackendPID(conn)); return conn; } /* * Wait for remote slot to pass locally reserved position. 
* * Wait until the slot named in 'remote_slot' on the host at 'conn' has all its * requirements satisfied by the local slot 'slot' by polling 'conn'. This * relies on us having already reserved the WAL for the old position of * `remote_slot` so `slot` can't continue to advance. */ static bool wait_for_primary_slot_catchup(ReplicationSlot *slot, RemoteSlot *remote_slot) { List *slots; PGconn *conn; StringInfoData connstr; TimestampTz cb_wait_start = 0; /* first invocation should happen immediately */ elog( LOG, "waiting for remote slot %s lsn (%X/%X) and catalog xmin (%u) to pass local slot lsn (%X/%X) and catalog xmin (%u)", remote_slot->name, (uint32) (remote_slot->restart_lsn >> 32), (uint32) (remote_slot->restart_lsn), remote_slot->catalog_xmin, (uint32) (slot->data.restart_lsn >> 32), (uint32) (slot->data.restart_lsn), slot->data.catalog_xmin); initStringInfo(&connstr); /* * Append the dbname of the remote slot. We don't use a generic db * like postgres here because plugin callback bellow might want to invoke * extension functions. */ make_sync_failover_slots_dsn(&connstr, remote_slot->database); conn = remote_connect(connstr.data, "pg_failover_slots"); pfree(connstr.data); for (;;) { RemoteSlot *new_slot; int rc; FailoverSlotFilter *filter = palloc(sizeof(FailoverSlotFilter)); XLogRecPtr receivePtr; CHECK_FOR_INTERRUPTS(); if (!RecoveryInProgress()) { /* * The remote slot didn't pass the locally reserved position * at the time of local promotion, so it's not safe to use. */ ereport( WARNING, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg( "replication slot sync wait for slot %s interrupted by promotion", remote_slot->name))); PQfinish(conn); return false; } filter->key = FAILOVERSLOT_FILTER_NAME; filter->val = remote_slot->name; slots = remote_get_primary_slot_info(conn, list_make1(filter)); if (!list_length(slots)) { /* Slot on provider vanished */ PQfinish(conn); return false; } receivePtr = GetWalRcvFlushRecPtr(NULL, NULL); Assert(list_length(slots) == 1); new_slot = linitial(slots); if (new_slot->restart_lsn > receivePtr) new_slot->restart_lsn = receivePtr; if (new_slot->confirmed_lsn > receivePtr) new_slot->confirmed_lsn = receivePtr; if (new_slot->restart_lsn >= slot->data.restart_lsn && TransactionIdFollowsOrEquals(new_slot->catalog_xmin, MyReplicationSlot->data.catalog_xmin)) { remote_slot->restart_lsn = new_slot->restart_lsn; remote_slot->confirmed_lsn = new_slot->confirmed_lsn; remote_slot->catalog_xmin = new_slot->catalog_xmin; PQfinish(conn); return true; } /* * Invoke any callbacks that will help move the slots along */ if (TimestampDifferenceExceeds( cb_wait_start, GetCurrentTimestamp(), Min(wal_retrieve_retry_interval * 5, PG_WAIT_EXTENSION))) { if (cb_wait_start > 0) elog( LOG, "still waiting for remote slot %s lsn (%X/%X) and catalog xmin (%u) to pass local slot lsn (%X/%X) and catalog xmin (%u)", remote_slot->name, (uint32) (new_slot->restart_lsn >> 32), (uint32) (new_slot->restart_lsn), new_slot->catalog_xmin, (uint32) (slot->data.restart_lsn >> 32), (uint32) (slot->data.restart_lsn), slot->data.catalog_xmin); cb_wait_start = GetCurrentTimestamp(); } rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, wal_retrieve_retry_interval, PG_WAIT_EXTENSION); if (rc & WL_POSTMASTER_DEATH) proc_exit(1); ResetLatch(MyLatch); } } /* * Synchronize one logical replication slot's state from the master to this * standby, creating it if necessary. 
* * Note that this only works safely because we know for sure that this is * executed on standby where primary has another slot which reserves resources * at the position to which we are moving the local slot to. * * This standby uses a physical replication slot to connect to the master so it * can send the xmin and catalog_xmin separately over hot_standby_feedback. Our * physical slot on the master ensures the master's catalog_xmin never goes * below ours after the initial setup period. */ static void synchronize_one_slot(RemoteSlot *remote_slot) { int i; bool found = false; if (!RecoveryInProgress()) { /* Should only happen when promotion occurs at the same time we sync */ ereport( WARNING, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg( "attempted to sync slot from master when not in recovery"))); return; } SetCurrentStatementStartTimestamp(); StartTransactionCommand(); PushActiveSnapshot(GetTransactionSnapshot()); /* Search for the named slot locally */ LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; /* Not in use, not interesting. */ if (!s->in_use) continue; if (strcmp(NameStr(s->data.name), remote_slot->name) == 0) { found = true; break; } } LWLockRelease(ReplicationSlotControlLock); /* * Remote slot exists locally, acquire and move. There's a race here where * the slot could've been dropped since we checked, but we'll just ERROR * out in `ReplicationSlotAcquire` and retry next loop so it's harmless. * * Moving the slot this way does not do logical decoding. We're not * processing WAL, we're just updating the slot metadata. */ if (found) { ReplicationSlotAcquire(remote_slot->name, true); /* * We can't satisfy this remote slot's requirements with our known-safe * local restart_lsn, catalog_xmin and xmin. * * This shouldn't happen for existing slots unless someone else messed * with our physical replication slot on the master. */ if (remote_slot->restart_lsn < MyReplicationSlot->data.restart_lsn || TransactionIdPrecedes(remote_slot->catalog_xmin, MyReplicationSlot->data.catalog_xmin)) { elog( WARNING, "not synchronizing slot %s; synchronization would move it backward", remote_slot->name); ReplicationSlotRelease(); PopActiveSnapshot(); CommitTransactionCommand(); return; } LogicalConfirmReceivedLocation(remote_slot->confirmed_lsn); LogicalIncreaseXminForSlot(remote_slot->confirmed_lsn, remote_slot->catalog_xmin); LogicalIncreaseRestartDecodingForSlot(remote_slot->confirmed_lsn, remote_slot->restart_lsn); ReplicationSlotMarkDirty(); ReplicationSlotSave(); elog( DEBUG2, "synchronized existing slot %s to lsn (%X/%X) and catalog xmin (%u)", remote_slot->name, (uint32) (remote_slot->restart_lsn >> 32), (uint32) (remote_slot->restart_lsn), remote_slot->catalog_xmin); } /* * Otherwise create the local slot and initialize it to the state of the * upstream slot. There's a race here where the slot could've been * concurrently created, but we'll just ERROR out and retry so it's * harmless. */ else { TransactionId xmin_horizon = InvalidTransactionId; ReplicationSlot *slot; /* * We have to create the slot to reserve its name and resources, but * don't want it to persist if we fail. 
*/ #if PG_VERSION_NUM >= 140000 ReplicationSlotCreate(remote_slot->name, true, RS_EPHEMERAL, remote_slot->two_phase); #else ReplicationSlotCreate(remote_slot->name, true, RS_EPHEMERAL); #endif slot = MyReplicationSlot; SpinLockAcquire(&slot->mutex); slot->data.database = get_database_oid(remote_slot->database); strlcpy(NameStr(slot->data.plugin), remote_slot->plugin, NAMEDATALEN); SpinLockRelease(&slot->mutex); /* * Stop our physical slot from advancing past the position needed * by the new remote slot by making its reservations locally * effective. It's OK if we can't guarantee their safety yet, * the slot isn't visible to anyone else at this point. */ ReplicationSlotReserveWal(); LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); xmin_horizon = GetOldestSafeDecodingTransactionId(true); slot->effective_catalog_xmin = xmin_horizon; slot->data.catalog_xmin = xmin_horizon; ReplicationSlotsComputeRequiredXmin(true); LWLockRelease(ProcArrayLock); /* * Our xmin and/or catalog_xmin may be > that required by one or more * of the slots we are trying to sync from the master, and/or we don't * have enough retained WAL for the slot's restart_lsn. * * If we persist the slot locally in that state it'll make a false * promise we can't satisfy. * * This can happen if this replica is fairly new or has only recently * started failover slot sync. * * TODO: Don't stop synchronization of other slots for this, we can't * add timeout because that could result in some slots never being * synchronized as they will always be behind the physical slot. */ if (remote_slot->restart_lsn < MyReplicationSlot->data.restart_lsn || TransactionIdPrecedes(remote_slot->catalog_xmin, MyReplicationSlot->data.catalog_xmin)) { if (!wait_for_primary_slot_catchup(MyReplicationSlot, remote_slot)) { /* Provider slot didn't catch up to locally reserved position */ ReplicationSlotRelease(); PopActiveSnapshot(); CommitTransactionCommand(); return; } } /* * We can locally satisfy requirements of remote slot's current * position now. Apply the new position if any and make it persistent. */ LogicalConfirmReceivedLocation(remote_slot->confirmed_lsn); LogicalIncreaseXminForSlot(remote_slot->confirmed_lsn, remote_slot->catalog_xmin); LogicalIncreaseRestartDecodingForSlot(remote_slot->confirmed_lsn, remote_slot->restart_lsn); ReplicationSlotMarkDirty(); ReplicationSlotPersist(); elog(DEBUG1, "synchronized new slot %s to lsn (%X/%X) and catalog xmin (%u)", remote_slot->name, (uint32) (remote_slot->restart_lsn >> 32), (uint32) (remote_slot->restart_lsn), remote_slot->catalog_xmin); } ReplicationSlotRelease(); PopActiveSnapshot(); CommitTransactionCommand(); } /* * Synchronize the slot states from master to standby. * * This logic emulates the "failover slots" behaviour unsuccessfully proposed * for 9.6 using the PostgreSQL 10 features "catalog xmin in hot standby * feedback" and "logical decoding follows timeline switches". * * This is only called in recovery from main loop of manager and only in PG10+ * because in older versions the manager worker uses * bgw_start_time = BgWorkerStart_RecoveryFinished. * * We could technically synchronize slot positions even on older versions of * PostgreSQL but since logical decoding can't go over the timeline switch * before PG10, it's pointless to have slots synchronized. Also, older versions * can't keep catalog_xmin separate from xmin in hot standby feedback, so * sending the feedback we need to preserve our catalog_xmin could cause severe * table bloat on the master. * * This runs periodically. 
That's safe when the slots on the master already * exist locally because we have their resources reserved via hot standby * feedback. New subscriptions can't move that position backwards... but we * won't immediately know they exist when the master creates them. So there's a * window after each new subscription is created on the master where failover * to this standby will break that subscription. */ static long synchronize_failover_slots(long sleep_time) { List *slots; uint32 nslots = 0; ListCell *lc; PGconn *conn; XLogRecPtr safe_lsn; XLogRecPtr lsn = InvalidXLogRecPtr; static bool was_lsn_safe = false; bool is_lsn_safe = false; StringInfoData connstr; if (!WalRcv || !HotStandbyActive() || list_length(pg_failover_slot_names_list) == 0) return sleep_time; /* XXX should these be errors or just soft return like above? */ if (!hot_standby_feedback) elog( ERROR, "cannot synchronize replication slot positions because hot_standby_feedback is off"); if (WalRcv->slotname[0] == '\0') elog( ERROR, "cannot synchronize replication slot positions because primary_slot_name is not set"); elog(DEBUG1, "starting replication slot synchronization from primary"); initStringInfo(&connstr); make_sync_failover_slots_dsn(&connstr, NULL /* Use default db name */); conn = remote_connect(connstr.data, "pg_failover_slots"); /* * Do not synchronize WAL decoder slots on a physical standy. * * WAL decoder slots are used to produce LCRs. These LCRs are not * synchronized on a physical standby after initial backup and hence are * not included in the base backup. Thus WAL decoder slots, if synchronized * on physical standby, do not reflect the status of LCR directory as they * do on primary. * * There are other slots whose WAL senders use LCRs. These other slots are * synchronized and used after promotion. Since the WAL decoder slots are * ahead of these other slots, the WAL decoder when started after promotion * might miss LCRs required by WAL senders of the other slots. This would * cause data inconsistency after promotion. * * Hence do not synchronize WAL decoder slot. Those will be created after * promotion */ slots = remote_get_primary_slot_info(conn, pg_failover_slot_names_list); safe_lsn = remote_get_physical_slot_lsn(conn, WalRcv->slotname); /* * Delete locally-existing slots that don't exist on the master. */ for (;;) { int i; char *dropslot = NULL; LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; bool active; bool found = false; active = (s->active_pid != 0); /* Only check inactive slots. */ if (!s->in_use || active) continue; /* Try to find slot in slots returned by primary. */ foreach (lc, slots) { RemoteSlot *remote_slot = lfirst(lc); if (strcmp(NameStr(s->data.name), remote_slot->name) == 0) { found = true; break; } } /* * Not found, should be dropped if synchronize_failover_slots_drop * is enabled. */ if (!found && pg_failover_slots_drop) { dropslot = pstrdup(NameStr(s->data.name)); break; } } LWLockRelease(ReplicationSlotControlLock); if (dropslot) { elog(WARNING, "dropping replication slot \"%s\"", dropslot); ReplicationSlotDrop(dropslot, false); pfree(dropslot); } else break; } if (!list_length(slots)) { PQfinish(conn); return sleep_time; } /* Find oldest restart_lsn still needed by any failover slot. 
*/ foreach (lc, slots) { RemoteSlot *remote_slot = lfirst(lc); if (lsn == InvalidXLogRecPtr || remote_slot->restart_lsn < lsn) lsn = remote_slot->restart_lsn; } if (safe_lsn == InvalidXLogRecPtr || WalRcv->latestWalEnd == InvalidXLogRecPtr) { ereport( WARNING, (errmsg( "cannot synchronize replication slot positions yet because feedback was not sent yet"))); was_lsn_safe = false; PQfinish(conn); return Min(sleep_time, WORKER_WAIT_FEEDBACK); } else if (WalRcv->latestWalEnd < lsn) { ereport( WARNING, (errmsg( "requested slot synchronization point %X/%X is ahead of the standby position %X/%X, not synchronizing slots", (uint32) (lsn >> 32), (uint32) (lsn), (uint32) (WalRcv->latestWalEnd >> 32), (uint32) (WalRcv->latestWalEnd)))); was_lsn_safe = false; PQfinish(conn); return Min(sleep_time, WORKER_WAIT_FEEDBACK); } foreach (lc, slots) { RemoteSlot *remote_slot = lfirst(lc); XLogRecPtr receivePtr; /* * If we haven't received WAL for a remote slot's current * confirmed_flush_lsn our local copy shouldn't reflect a confirmed * position in the future. Cap it at the position we really received. * * Because the client will use a replication origin to track its * position, in most cases it'll still fast-forward to the new * confirmed position even if that skips over a gap of WAL we never * received from the provider before failover. We can't detect or * prevent that as the same fast forward is normal when we lost slot * state in a provider crash after subscriber committed but before we * saved the new confirmed flush lsn. The master will also fast forward * the slot over irrelevant changes and then the subscriber will update * its confirmed_flush_lsn in response to master standby status * updates. */ receivePtr = GetWalRcvFlushRecPtr(NULL, NULL); if (remote_slot->confirmed_lsn > receivePtr) remote_slot->confirmed_lsn = receivePtr; /* * For simplicity we always move restart_lsn of all slots to the * restart_lsn needed by the furthest-behind master slot. */ if (remote_slot->restart_lsn > lsn) remote_slot->restart_lsn = lsn; synchronize_one_slot(remote_slot); nslots++; } PQfinish(conn); if (!was_lsn_safe && is_lsn_safe) elog(LOG, "slot synchronization from primary now active"); was_lsn_safe = is_lsn_safe; return sleep_time; } void pg_failover_slots_main(Datum main_arg) { /* Establish signal handlers. */ pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGTERM, die); pqsignal(SIGHUP, SignalHandlerForConfigReload); BackgroundWorkerUnblockSignals(); /* Make it easy to identify our processes. */ SetConfigOption("application_name", MyBgworkerEntry->bgw_name, PGC_SU_BACKEND, PGC_S_OVERRIDE); elog(LOG, "starting pg_failover_slots replica worker"); /* Setup connection to pinned catalogs (we only ever read pg_database). */ BackgroundWorkerInitializeConnection(NULL, NULL, 0); /* Main wait loop. */ while (true) { int rc; long sleep_time = WORKER_NAP_TIME; CHECK_FOR_INTERRUPTS(); if (RecoveryInProgress()) sleep_time = synchronize_failover_slots(WORKER_NAP_TIME); else sleep_time = WORKER_NAP_TIME * 10; rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, sleep_time, PG_WAIT_EXTENSION); ResetLatch(MyLatch); /* Emergency bailout if postmaster has died. */ if (rc & WL_POSTMASTER_DEATH) proc_exit(1); /* Reload the config if needed. 
*/ if (ConfigReloadPending) { ConfigReloadPending = false; ProcessConfigFile(PGC_SIGHUP); } } } static bool list_member_str(List *l, const char *str) { ListCell *lc; foreach (lc, l) if (strcmp((const char *) lfirst(lc), str) == 0) return true; return false; } /* * Check whether we want to actually wait for standby_slot_names */ static bool skip_standby_slot_names(XLogRecPtr commit_lsn) { static List *cached_standby_slot_names = NIL; if (standby_slot_names != cached_standby_slot_names) { if (MyReplicationSlot) { if (list_member_str(standby_slot_names, NameStr(MyReplicationSlot->data.name))) { standby_slots_min_confirmed = 0; elog( DEBUG1, "found my slot in pg_failover_slots.standby_slot_names, no need to wait for confirmations"); } } cached_standby_slot_names = standby_slot_names; } /* * If we already know all slots of interest satisfy the requirement we can * skip checks entirely. The assignment hook for * pg_failover_slots.standby_slot_names invalidates the cache. */ if (standby_slot_names_oldest_flush_lsn >= commit_lsn || standby_slots_min_confirmed == 0 || list_length(standby_slot_names) == 0) return true; return false; } /* * Wait until the nominated set of standbys, if any, have flushed past the * specified lsn. Standbys are identified by slot name, not application_name * like in synchronous_standby_names. * * confirmed_flush_lsn is used for physical slots, restart_lsn for logical * slots. * */ static void wait_for_standby_confirmation(XLogRecPtr commit_lsn) { XLogRecPtr flush_pos = InvalidXLogRecPtr; TimestampTz wait_start = GetCurrentTimestamp(); if (skip_standby_slot_names(commit_lsn)) return; while (1) { int i; int wait_slots_remaining; XLogRecPtr oldest_flush_pos = InvalidXLogRecPtr; int rc; if (standby_slots_min_confirmed == -1) { /* * Default pg_failover_slots.standby_slots_min_confirmed (-1) is to * wait for all entries in pg_failover_slots.standby_slot_names. */ wait_slots_remaining = list_length(standby_slot_names); } else { /* * pg_failover_slots.standby_slots_min_confirmed cannot wait for * more slots than are named in the * pg_failover_slots.standby_slot_names. */ wait_slots_remaining = Min(standby_slots_min_confirmed, list_length(standby_slot_names)); } Assert(wait_slots_remaining > 0 && wait_slots_remaining <= list_length(standby_slot_names)); LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; if (!s->in_use) continue; if (!list_member_str(standby_slot_names, NameStr(s->data.name))) continue; SpinLockAcquire(&s->mutex); if (s->data.database == InvalidOid) /* Physical slots advance restart_lsn on flush and ignore * confirmed_flush_lsn */ flush_pos = s->data.restart_lsn; else /* For logical slots we must wait for commit and flush */ flush_pos = s->data.confirmed_flush; SpinLockRelease(&s->mutex); /* We want to find out the min(flush pos) over all named slots */ if (oldest_flush_pos == InvalidXLogRecPtr || oldest_flush_pos > flush_pos) oldest_flush_pos = flush_pos; if (flush_pos >= commit_lsn && wait_slots_remaining > 0) wait_slots_remaining--; } LWLockRelease(ReplicationSlotControlLock); if (wait_slots_remaining == 0) { /* * If the oldest slot pos across all named slots advanced, update * the cache so we can skip future calls. It'll be invalidated * if the GUCs change. 
*/ if (standby_slot_names_oldest_flush_lsn < oldest_flush_pos) standby_slot_names_oldest_flush_lsn = oldest_flush_pos; return; } /* * Ideally we'd be able to ask these walsenders to wake us if they * advance past the point of interest, but that'll require some core * patching. For now, poll. * * We don't test for postmaster death here because it turns out to * be really slow. The postmaster should kill us, we'll notice when * we time out, and it's not a long sleep. * * TODO some degree of backoff on sleeps? */ rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, 100L, PG_WAIT_EXTENSION); if (rc & WL_POSTMASTER_DEATH) proc_exit(1); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); if (wal_sender_timeout > 0 && GetCurrentTimestamp() > TimestampTzPlusMilliseconds(wait_start, wal_sender_timeout)) { ereport( COMMERROR, (errmsg( "terminating walsender process due to pg_failover_slots.standby_slot_names replication timeout"))); proc_exit(0); } /* * The user might clear bdr.standby_slot_name or change it to a new * standby. If we don't notice, we'll keep looping indefinitely here, * so we have to check for config changes. */ if (ConfigReloadPending) { ConfigReloadPending = false; ProcessConfigFile(PGC_SIGHUP); if (skip_standby_slot_names(commit_lsn)) return; } } } /* * Hackery to inject ourselves into walsender's logical stream starts here */ static const PQcommMethods *OldPqCommMethods; static void socket_comm_reset(void) { OldPqCommMethods->comm_reset(); } static int socket_flush(void) { return OldPqCommMethods->flush(); } static int socket_flush_if_writable(void) { return OldPqCommMethods->flush_if_writable(); } static bool socket_is_send_pending(void) { return OldPqCommMethods->is_send_pending(); } static int socket_putmessage(char msgtype, const char *s, size_t len) { return OldPqCommMethods->putmessage(msgtype, s, len); } static void socket_putmessage_noblock(char msgtype, const char *s, size_t len) { if (msgtype == 'd' && len >= 17) { if (s[0] == 'w') { XLogRecPtr lsn; /* * Extract the lsn from the wal message, and convert it from * network byte order. */ memcpy(&lsn, &s[1], sizeof(XLogRecPtr)); lsn = pg_ntoh64(lsn); /* Wait for the lsn */ wait_for_standby_confirmation(lsn); } } OldPqCommMethods->putmessage_noblock(msgtype, s, len); } #if PG_VERSION_NUM < 140000 static void socket_startcopyout(void) { OldPqCommMethods->startcopyout(); } static void socket_endcopyout(bool errorAbort) { OldPqCommMethods->endcopyout(errorAbort); } #endif #if PG_VERSION_NUM >= 120000 static const #else static #endif PQcommMethods PqCommSocketMethods = { socket_comm_reset, socket_flush, socket_flush_if_writable, socket_is_send_pending, socket_putmessage, socket_putmessage_noblock #if PG_VERSION_NUM < 140000 , socket_startcopyout, socket_endcopyout #endif }; static ClientAuthentication_hook_type original_client_auth_hook = NULL; static void attach_to_walsender(Port *port, int status) { /* * Any other plugins which use ClientAuthentication_hook. 
*/ if (original_client_auth_hook) original_client_auth_hook(port, status); if (am_db_walsender) { OldPqCommMethods = PqCommMethods; PqCommMethods = &PqCommSocketMethods; } } void _PG_init(void) { BackgroundWorker bgw; if (!process_shared_preload_libraries_in_progress) elog(ERROR, "pg_failover_slots is not in shared_preload_libraries"); DefineCustomStringVariable( "pg_failover_slots.version", "pg_failover_slots module version", "", &pg_failover_slots_version_str, PG_FAILOVER_SLOTS_VERSION, PGC_INTERNAL, GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE, NULL, NULL, NULL); DefineCustomStringVariable( "pg_failover_slots.standby_slot_names", "list of names of slot that must confirm changes before they're sent by the decoding plugin", "List of physical replication slots that must confirm durable " "flush of a given lsn before commits up to that lsn may be " "replicated to logical peers by the output plugin. " "Imposes ordering of physical replication before logical " "replication.", &standby_slot_names_raw, "", PGC_SIGHUP, GUC_LIST_INPUT, check_standby_slot_names, assign_standby_slot_names, NULL); DefineCustomIntVariable( "pg_failover_slots.standby_slots_min_confirmed", "Number of slots from pg_failover_slots.standby_slot_names that must confirm lsn", "Modifies behaviour of pg_failover_slots.standby_slot_names so to allow " "logical replication of a transaction after at least " "pg_failover_slots.standby_slots_min_confirmed physical peers have confirmed " "the transaction as durably flushed. " "The value -1 (default) means all entries in pg_failover_slots.standby_slot_names" "must confirm the write. The value 0 causes " "pg_failover_slots.standby_slots_min_confirmedto be effectively ignored.", &standby_slots_min_confirmed, -1, -1, 100, PGC_SIGHUP, 0, NULL, NULL, NULL); DefineCustomStringVariable( "pg_failover_slots.synchronize_slot_names", "list of slots to synchronize from primary to physical standby", "", &pg_failover_slot_names, "name_like:%%", PGC_SIGHUP, /* Sync ALL slots by default */ GUC_LIST_INPUT, check_failover_slot_names, assign_failover_slot_names, NULL); DefineCustomBoolVariable( "pg_failover_slots.drop_extra_slots", "whether to drop extra slots on standby that don't match pg_failover_slots.synchronize_slot_names", NULL, &pg_failover_slots_drop, true, PGC_SIGHUP, 0, NULL, NULL, NULL); DefineCustomStringVariable( "pg_failover_slots.primary_dsn", "connection string to the primary server for synchronization logical slots on standby", "if empty, uses the defaults to primary_conninfo", &pg_failover_slots_dsn, "", PGC_SIGHUP, GUC_SUPERUSER_ONLY, NULL, NULL, NULL); if (IsBinaryUpgrade) return; /* Run the worker. 
 */
	memset(&bgw, 0, sizeof(bgw));
	bgw.bgw_flags =
		BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
	bgw.bgw_start_time = BgWorkerStart_ConsistentState;
	snprintf(bgw.bgw_library_name, BGW_MAXLEN, EXTENSION_NAME);
	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "pg_failover_slots_main");
	snprintf(bgw.bgw_name, BGW_MAXLEN, "pg_failover_slots worker");
	bgw.bgw_restart_time = 60;

	RegisterBackgroundWorker(&bgw);

	/* Install Hooks */
	original_client_auth_hook = ClientAuthentication_hook;
	ClientAuthentication_hook = attach_to_walsender;
}

pg_failover_slots-1.0.1/t/010_slot_sync.pl

use strict;
use warnings;

use File::Path qw(rmtree);
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;

# Test set-up
my $node_primary = PostgreSQL::Test::Cluster->new('test');
$node_primary->init(allows_streaming => 'logical');
$node_primary->append_conf('postgresql.conf',
	'shared_preload_libraries = pg_failover_slots');
$node_primary->start;
is( $node_primary->psql(
		'postgres',
		qq[SELECT pg_create_physical_replication_slot('standby_1');]),
	0,
	'physical slot created on primary');

my $backup_name = 'my_backup';

# Take backup
$node_primary->backup($backup_name);

# Create streaming standby linking to primary
my $node_standby = PostgreSQL::Test::Cluster->new('standby_1');
$node_standby->init_from_backup($node_primary, $backup_name,
	has_streaming => 1);
$node_standby->append_conf('postgresql.conf', 'hot_standby_feedback = on');
my $pg_version = `pg_config --version | awk '{print \$2}'`;
if ($pg_version >= 12)
{
	$node_standby->append_conf('postgresql.conf',
		'primary_slot_name = standby_1');
}
else
{
	$node_standby->append_conf('recovery.conf',
		'primary_slot_name = standby_1');
}
$node_standby->start;

# Wait for the sync worker to start
$node_standby->poll_query_until('postgres',
	"SELECT count(*) > 0 FROM pg_stat_activity where application_name LIKE 'pg_failover_slots%'");

# Create table.
$node_primary->safe_psql('postgres',
	"CREATE TABLE test_repl_stat(col1 serial)");

# Create replication slots.
$node_primary->safe_psql(
	'postgres', qq[
	SELECT pg_create_logical_replication_slot('regression_slot1', 'test_decoding');
	SELECT pg_create_logical_replication_slot('regression_slot2', 'test_decoding');
	SELECT pg_create_logical_replication_slot('regression_slot3', 'test_decoding');
	SELECT pg_create_logical_replication_slot('regression_slot4', 'test_decoding');
]);

# Simulate some small load to move things forward and wait for slots to be
# synced downstream.
while (1)
{
	$node_primary->safe_psql(
		'postgres', qq[
	SELECT data FROM pg_logical_slot_get_changes('regression_slot1', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');
	SELECT data FROM pg_logical_slot_get_changes('regression_slot2', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');
	SELECT data FROM pg_logical_slot_get_changes('regression_slot3', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');
	SELECT data FROM pg_logical_slot_get_changes('regression_slot4', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');
	]);
	$node_primary->safe_psql('postgres',
		"INSERT INTO test_repl_stat DEFAULT VALUES;");
	last
	  if ($node_standby->safe_psql('postgres',
		"SELECT count(*) > 3 FROM pg_replication_slots WHERE NOT active") eq "t");
	sleep(1);
}

# Now that the slots have moved they should all be synced
is($node_standby->safe_psql('postgres',
	"SELECT slot_name FROM pg_replication_slots ORDER BY slot_name"),
	q[regression_slot1
regression_slot2
regression_slot3
regression_slot4]);

# Wait for replication to catch up
my $primary_lsn = $node_primary->lsn('write');
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);

# Test dropping one of the replication slots
$node_primary->safe_psql('postgres',
	"SELECT pg_drop_replication_slot('regression_slot4')");

$node_primary->stop;
$node_primary->start;
$node_primary->stop;
my $datadir = $node_primary->data_dir;
my $slot3_replslotdir = "$datadir/pg_replslot/regression_slot3";

rmtree($slot3_replslotdir);

$node_primary->append_conf('postgresql.conf', 'max_replication_slots = 3');
$node_primary->start;

# cleanup
$node_primary->safe_psql('postgres',
	"SELECT pg_drop_replication_slot('regression_slot1')");
$node_primary->safe_psql('postgres', "DROP TABLE test_repl_stat");

# Wait for replication to catch up
$primary_lsn = $node_primary->lsn('write');
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);

# Check that the slots were dropped on standby too
$node_standby->poll_query_until('postgres',
	"SELECT count(*) < 2 FROM pg_replication_slots");
is($node_standby->safe_psql('postgres',
	"SELECT slot_name FROM pg_replication_slots ORDER BY slot_name"),
	q[regression_slot2]);

# shutdown
$node_standby->stop;
$node_primary->stop;

done_testing();

pg_failover_slots-1.0.1/t/020_physical_before_logical.pl

use strict;
use warnings;

use File::Path qw(rmtree);
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;

my $offset = 0;

# Test set-up
my $node_primary = PostgreSQL::Test::Cluster->new('test');
$node_primary->init(allows_streaming => 'logical');
$node_primary->append_conf('postgresql.conf',
	"shared_preload_libraries = pg_failover_slots");

# Setup physical before logical slot
$node_primary->append_conf('postgresql.conf',
	"pg_failover_slots.standby_slot_names = 'standby_1'");
$node_primary->start;
is( $node_primary->psql(
		'postgres',
		qq[SELECT pg_create_physical_replication_slot('standby_1');]),
	0,
	'physical slot created on primary');

my $backup_name = 'my_backup';

# Take backup
$node_primary->backup($backup_name);

# Create streaming standby linking to primary
my $node_standby = PostgreSQL::Test::Cluster->new('standby_1');
$node_standby->init_from_backup($node_primary, $backup_name,
	has_streaming => 1);
$node_standby->append_conf('postgresql.conf', 'hot_standby_feedback = on');
my $pg_version = `pg_config --version | awk '{print \$2}'`;
if ($pg_version >= 12)
{
	$node_standby->append_conf('postgresql.conf',
		'primary_slot_name = standby_1');
}
else
{
	$node_standby->append_conf('recovery.conf',
		'primary_slot_name = standby_1');
}
# Create table.
$node_primary->safe_psql('postgres', "CREATE TABLE test_repl_stat(col1 int)");

# Create subscriber node
my $node_subscriber = PostgreSQL::Test::Cluster->new('subscriber');
$node_subscriber->init(allows_streaming => 'logical');
$node_subscriber->start;

$node_subscriber->safe_psql('postgres',
	"CREATE TABLE test_repl_stat(col1 int)");

my $node_primary_connstr =
	$node_primary->connstr . ' dbname=postgres application_name=tap_sub';
$node_primary->safe_psql('postgres',
	"CREATE PUBLICATION tap_pub FOR ALL TABLES");
$node_subscriber->safe_psql('postgres',
	"CREATE SUBSCRIPTION tap_sub CONNECTION '$node_primary_connstr' PUBLICATION tap_pub"
);
$node_primary->wait_for_catchup('tap_sub');

# Create replication slots.
$node_primary->safe_psql(
	'postgres', qq[
	SELECT pg_create_logical_replication_slot('regression_slot1', 'test_decoding');
]);

# Insert some data.
$node_primary->safe_psql('postgres',
	"INSERT INTO test_repl_stat values(generate_series(1, 5));");

# Fetching using pg_logical_slot_get_changes should work fine
$node_primary->safe_psql(
	'postgres', qq[
	SELECT data FROM pg_logical_slot_get_changes('regression_slot1', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');
]);

# Replication via pub/sub should time out though
$offset = $node_primary->wait_for_log(
	qr/terminating walsender process due to pg_failover_slots.standby_slot_names replication timeout/,
	0);

# And subscriber should have nothing
is($node_subscriber->safe_psql('postgres', "SELECT * FROM test_repl_stat"),
	"");

# Start standby
$node_standby->start;

# Wait for it to replicate
my $primary_lsn = $node_primary->lsn('write');
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);

# Make sure subscriber replicates
$node_subscriber->poll_query_until('postgres',
	"SELECT count(*) > 4 FROM test_repl_stat");

# Stop standby again
$node_standby->stop;

# Insert more data
$node_primary->safe_psql('postgres',
	"INSERT INTO test_repl_stat values(generate_series(10, 15));");

# Pub/sub replication should time out again
$offset = $node_primary->wait_for_log(
	qr/terminating walsender process due to pg_failover_slots.standby_slot_names replication timeout/,
	$offset);

# shutdown
$node_primary->stop;
$node_subscriber->stop;

done_testing();

pg_failover_slots-1.0.1/t/030_failover.pl

use strict;
use warnings;

use File::Path qw(rmtree);
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;

# Test set-up
my $node_primary = PostgreSQL::Test::Cluster->new('test');
$node_primary->init(allows_streaming => 'logical');
$node_primary->append_conf('postgresql.conf',
	'shared_preload_libraries = pg_failover_slots');
$node_primary->start;
is( $node_primary->psql(
		'postgres',
		qq[SELECT pg_create_physical_replication_slot('standby_1');]),
	0,
	'physical slot created on primary');

my $backup_name = 'my_backup';

# Take backup
$node_primary->backup($backup_name);

# Create streaming standby linking to primary
my $node_standby = PostgreSQL::Test::Cluster->new('standby_1');
$node_standby->init_from_backup($node_primary, $backup_name,
	has_streaming => 1);
$node_standby->append_conf('postgresql.conf', 'hot_standby_feedback = on');
my $pg_version = `pg_config --version | awk '{print \$2}'`;
if ($pg_version >= 12)
{
	$node_standby->append_conf('postgresql.conf',
		'primary_slot_name = standby_1');
}
else
{
	$node_standby->append_conf('recovery.conf',
		'primary_slot_name = standby_1');
}
$node_standby->start;

# Wait for the sync worker to start
$node_standby->poll_query_until('postgres',
	"SELECT count(*) > 0 FROM pg_stat_activity where application_name LIKE 'pg_failover_slots%'");

# Create table.
$node_primary->safe_psql('postgres',
	"CREATE TABLE test_repl_stat(col1 serial)");

# Create replication slots.
$node_primary->safe_psql(
	'postgres', qq[
	SELECT pg_create_logical_replication_slot('regression_slot1', 'test_decoding');
	SELECT pg_create_logical_replication_slot('regression_slot2', 'test_decoding');
	SELECT pg_create_logical_replication_slot('regression_slot3', 'test_decoding');
	SELECT pg_create_logical_replication_slot('regression_slot4', 'test_decoding');
]);

# Simulate some small load to move things forward and wait for slots to be
# synced downstream.
while (1)
{
	$node_primary->safe_psql(
		'postgres', qq[
	SELECT data FROM pg_logical_slot_get_changes('regression_slot1', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');
	SELECT data FROM pg_logical_slot_get_changes('regression_slot2', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');
	SELECT data FROM pg_logical_slot_get_changes('regression_slot3', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');
	SELECT data FROM pg_logical_slot_get_changes('regression_slot4', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');
	]);
	$node_primary->safe_psql('postgres',
		"INSERT INTO test_repl_stat DEFAULT VALUES;");
	last
	  if ($node_standby->safe_psql('postgres',
		"SELECT count(*) > 3 FROM pg_replication_slots WHERE NOT active") eq "t");
	sleep(1);
}

# Now that the slots have moved they should all be synced
is($node_standby->safe_psql('postgres',
	"SELECT slot_name FROM pg_replication_slots ORDER BY slot_name"),
	q[regression_slot1
regression_slot2
regression_slot3
regression_slot4]);

# Wait for replication to catch up
my $primary_lsn = $node_primary->lsn('write');
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);

# failover to standby
$node_primary->stop;
$node_standby->promote;

# Check that slots are on promoted standby
is($node_standby->safe_psql('postgres',
	"SELECT slot_name FROM pg_replication_slots ORDER BY slot_name"),
	q[regression_slot1
regression_slot2
regression_slot3
regression_slot4]);

# Write on promoted standby
$node_standby->safe_psql('postgres',
	"INSERT INTO test_repl_stat DEFAULT VALUES;");

# Check that slots are consumable on promoted standby
$node_standby->safe_psql(
	'postgres', qq[
	SELECT data FROM pg_logical_slot_get_changes('regression_slot1', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');
	SELECT data FROM pg_logical_slot_get_changes('regression_slot2', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');
	SELECT data FROM pg_logical_slot_get_changes('regression_slot3', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');
	SELECT data FROM pg_logical_slot_get_changes('regression_slot4', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');
]);

# shutdown
$node_standby->stop;

done_testing();