pax_global_header00006660000000000000000000000064141424436720014521gustar00rootroot0000000000000052 comment=31593a6782adb60e09326ffb784ee18144e08a6b pg_auto_failover-1.6.3/000077500000000000000000000000001414244367200150555ustar00rootroot00000000000000pg_auto_failover-1.6.3/.dockerignore000066400000000000000000000000631414244367200175300ustar00rootroot00000000000000.git src/bin/pg_autoctl/pg_autoctl docs Dockerfile pg_auto_failover-1.6.3/.editorconfig000066400000000000000000000010031414244367200175240ustar00rootroot00000000000000# top-most EditorConfig file root = true # rules for all files # we use tabs with indent size 4 [*] indent_style = tab indent_size = 4 tab_width = 4 end_of_line = lf insert_final_newline = true charset = utf-8 trim_trailing_whitespace = true # Don't change test output files, pngs or test data files [*.{out,png,data}] insert_final_newline = unset trim_trailing_whitespace = unset [*.{sql,sh,py,tex}] indent_style = space indent_size = 4 tab_width = 4 [*.yml] indent_style = space indent_size = 2 tab_width = 2 pg_auto_failover-1.6.3/.gitattributes000066400000000000000000000020431414244367200177470ustar00rootroot00000000000000* whitespace=space-before-tab,trailing-space *.[chly] whitespace=space-before-tab,trailing-space,indent-with-non-tab,tabwidth=4 *.dsl whitespace=space-before-tab,trailing-space,tab-in-indent *.patch -whitespace *.pl whitespace=space-before-tab,trailing-space,tabwidth=4 *.po whitespace=space-before-tab,trailing-space,tab-in-indent,-blank-at-eof *.sgml whitespace=space-before-tab,trailing-space,tab-in-indent,-blank-at-eol *.x[ms]l whitespace=space-before-tab,trailing-space,tab-in-indent # Avoid confusing ASCII underlines with leftover merge conflict markers README conflict-marker-size=32 README.* conflict-marker-size=32 # These files are maintained or generated elsewhere. We take them as is. configure -whitespace # all C files (implementation and header) use our style... *.[ch] citus-style # except these exceptions... 
src/bin/lib/parson/** -citus-style src/bin/lib/log/** -citus-style src/bin/lib/libs/** -citus-style src/bin/lib/pg/** -citus-style src/bin/lib/subcommands.c/** -citus-style src/monitor/version_compat.c -citus-style pg_auto_failover-1.6.3/.gitignore000066400000000000000000000012621414244367200170460ustar00rootroot00000000000000# Global excludes across all subdirectories *.o *.bc *.so *.so.[0-9] *.so.[0-9].[0-9] *.sl *.sl.[0-9] *.sl.[0-9].[0-9] *.dylib *.dll *.a *.mo *.pot objfiles.txt .deps/ *.gcno *.gcda *.gcov *.gcov.out lcov.info coverage/ *.vcproj *.vcxproj win32ver.rc *.exe lib*dll.def lib*.pc # Local excludes in root directory /config.log /config.status /pgsql.sln /pgsql.sln.cache /Debug/ /Release/ /autom4te.cache /Makefile.global /src/Makefile.custom /tests/__pycache__/ /env/ # Exclude generated SQL files pgautofailover--?.?.sql !pgautofailover--1.0.sql # Exclude generated PDF and PNG files for the graphics docs/tikz/*.pdf docs/tikz/*.png # Exclude our demo/test tmux directory tmux/ valgrind/ pg_auto_failover-1.6.3/.readthedocs.yaml000066400000000000000000000003471414244367200203100ustar00rootroot00000000000000version: 2 # Build from the docs/ directory with Sphinx sphinx: configuration: docs/conf.py # Explicitly set the version of Python and its requirements python: version: 3.7 install: - requirements: docs/requirements.txtpg_auto_failover-1.6.3/.travis.yml000066400000000000000000000045451414244367200171760ustar00rootroot00000000000000sudo: required dist: focal language: c python: - "3.7" cache: apt: true directories: - /home/travis/postgresql matrix: fast_finish: true include: - env: PGVERSION=10 TEST=multi - env: PGVERSION=11 TEST=multi - env: PGVERSION=12 TEST=multi - env: PGVERSION=13 TEST=multi - env: PGVERSION=14 TEST=multi - env: PGVERSION=10 TEST=single - env: PGVERSION=11 TEST=single - env: PGVERSION=12 TEST=single - env: PGVERSION=13 TEST=single - env: PGVERSION=14 TEST=single - env: PGVERSION=10 TEST=monitor - env: PGVERSION=11 TEST=monitor - 
env: PGVERSION=12 TEST=monitor - env: PGVERSION=13 TEST=monitor - env: PGVERSION=14 TEST=monitor - env: PGVERSION=10 TEST=ssl - env: PGVERSION=11 TEST=ssl - env: PGVERSION=12 TEST=ssl - env: PGVERSION=13 TEST=ssl - env: PGVERSION=14 TEST=ssl - env: LINTING=true allow_failures: - env: PGVERSION=14 TEST=multi - env: PGVERSION=14 TEST=single - env: PGVERSION=14 TEST=monitor - env: PGVERSION=14 TEST=ssl before_install: - git clone -b v0.7.18 --depth 1 https://github.com/citusdata/tools.git - sudo make -C tools install - 'if [ -z "$LINTING" ]; then setup_apt; fi' - 'if [ -z "$LINTING" ]; then nuke_pg; fi' - python --version - python3 --version - sudo apt-get install liblz4-1 - sudo apt-get install liblz4-dev - sudo apt-get install bridge-utils - sudo apt-get install python3-pip - sudo apt-get install python3-nose - sudo apt-get install python3-psycopg2 - sudo apt-get install python3-setuptools - sudo -H pip3 install pyroute2>=0.5.17 - pip3 install --user black install: - 'if [ -n "$LINTING" ]; then install_uncrustify; fi' # cleanup uncrustify build files - 'if [ -n "$LINTING" ]; then rm -rf uncrustify*; fi' - 'if [ -z "$LINTING" ]; then install_pg; fi' - 'if [ -z "$LINTING" ]; then install_custom_pg; fi' - env - 'if [ -z "$LINTING" ]; then pg_config; fi' - 'if [ -z "$LINTING" ]; then PATH=`pg_config --bindir`:$PATH which pg_ctl; fi' script: - 'if [ -n "$LINTING" ]; then citus_indent --check; fi' - 'if [ -n "$LINTING" ]; then black --check .; fi' - 'if [ -n "$LINTING" ]; then ci/banned.h.sh; fi' - 'if [ -z "$LINTING" ]; then make -j5 CFLAGS=-Werror; fi' - 'if [ -z "$LINTING" ]; then sudo make install; fi' - 'if [ -z "$LINTING" ]; then PATH=`pg_config --bindir`:$PATH make test; fi' pg_auto_failover-1.6.3/CHANGELOG.md000066400000000000000000000371061414244367200166750ustar00rootroot00000000000000### pg_auto_failover v1.6.3 (November 5, 2021) ### This is a bug fix release for the 1.6 series. 
This release also introduces a new ncurses interactive dashboard that makes it easier to understand the current state (and transitions) of a formation. The new command `pg_autoctl watch` can be used to monitor pg_auto_failover activity. #### Added * New ncurses dashboard with command pg_autoctl watch (#809) #### Changed * Allow setting maximum-backup-rate on create postgres step (#812) #### Fixed * Work around pg_replication_slot_advance xmin maintenance bug. (#815) * Fix "Current State" to "Reported State", and a docs cross-ref. * Monitor config set postgresql.pg_ctl bug fix (#818) * Fix how we clean-up our logs semaphore. (#811) * Fix synchronous_standby_names return value when there is no primary (#807) ### pg_auto_failover v1.6.2 (September 8, 2021) ### This is a bug fix release for the 1.6 series. #### Added * Also retry libpq connections to a local host. (#793) #### Changed * Only exit at upgrade when the on-disk binary is ready. (#771) * Only use wait_maintenance to wait for wait_primary (#794) * Get rid of the JOIN_PRIMARY state. (#796) * Make sure to disable sync rep when initializing a primary. (#801) #### Fixed * Avoid re-electing primary during a switchover. (#772) * Improve error messages for missing configuration files. (#779) * Fix replication slot maintenance on secondary nodes. (#781) * Fix problems with bad migration to 1.5 (#792) * Fix maintenance state related transitions. (#786) * Per valgrind, fix some memory leaks. (#799) * When creating from an existing PGDATA, fix missing initialization. (#802) ### pg_auto_failover v1.6.1 (July 7, 2021) ### This release contains monitor schema changes, so the monitor extension gets a version bump from 1.5 to 1.6, and this is the first release in the 1.6 series. In this release we introduce a new state in the FSM: "dropped". This state allows a node to realise it's been dropped from the monitor, and act accordingly (mostly, stops cleanly and register it's been dropped). 
This means that the command `pg_autoctl drop node` now only completes when the node that is being dropped is still reachable. To drop a node that is unreachable (e.g. machine died), you should now use the command `pg_autoctl drop node --force`. #### Added * Feature crash recovery before pg_rewind (#656) * Allow building pg_auto_failover with Postgres 14. (#716) * Track nodes current timeline on the monitor and use it during election. (#730) * Implement drop-at-a-distance semantics. (#734) * Add the reported timeline id to the monitor events table. (#753) #### Changed * Fix how many nodes need to report their LSN to perform a failover. (#707) * Allow an old primary node (demoted/catchingup) to join another election. (#727) * Have pg_autoctl drop node command wait until the node has been removed. (#748) #### Fixed * Fix/consider timeline when reaching secondary (#695) * Install btree_gist when upgrade to >= 1.4, not just 1.4. (#714) * Fix a race condition issue around health check updates. (#720) * Not all the monitor code got the memo about nodeid being a bigint. (#729) * Use sysctl(3) on BSD (#733) * Fix transaction begin failure handling (#751) * Fix a connection leak at SIGINT. (#759) ### pg_auto_failover v1.5.2 (May 20, 2021) ### This is a bugfix release for the v1.5 series. In addition to bug fixes, this release also contains a lift of the restriction to always have at least two nodes with a non-zero candidate priority in a group. It is now possible to use pg_auto_failover and only have manual failover. If you're using the output from the command `pg_autoctl show settings --json` please notice that we changed the JSON format we use in the output. See #697 for details. #### Added * Check that a "replication" connection is possible before pg_rewind. [#665] * Allow manual promotion of nodes with candidate priority zero. [#661] * Implement a new configuration option listen_notifications_timeout. [#677] * Log monitor health changes as events. 
[#703] #### Changed * Use PGDATA owner for the systemd service file. [#666] * Remove logging of connection password in monitor string [#512] * Improve docs color contrast for accessibility [#674] * Fix pg_autoctl show settings --json output. [#697] #### Fixed * Docs: typo fix for Postgres client certificate file (postgresql.crt). [#652] * Plug connection leaks found during profiling [#582] * Review find_extension_control_file[) error handling. (#659] * Fix/identify system before pg basebackup [#658] * Fix a pipe issue and return code [#619] * Fix memory leak allocated by createPQExpBuffer() (#671] * Fix parsing pg version string for replication slots support on standby. [#676] * Fix/debian cluster for the monitor [#681] * Fix a memory leak in uri_contains_password. [#687] * Fix a memory leak in BuildNodesArrayValues. [#693] * Complete transition of a second [or greater) failed primary (#706] ### pg_auto_failover v1.5.1 (March 24, 2021) ### This release contains monitor schema changes, so the monitor extension gets a version bump from 1.4 to 1.5, and this is the first release in the 1.5 series. #### Added * Add support for systemd ExecReload option. [#623] * Implement online enable/disable monitor support. [#591] * Add individual pages for the pg_autoctl commands. [#632] * Implement a demo application showing client-side reconnections. [#568] #### Changed The main change in the CLI is that `pg_autoctl show uri --monitor` does not display the connection string to the monitor anymore, instead it allows passing the URI to the monitor, same as with the other `pg_autoctl show commands`. To display the monitor connection string, use `pg_autoctl show uri --formation monitor` now. * Allow using --monitor uri for a lot of commands [#576] * Review pg_autoctl show state output, and docs. [#617] * Avoid using synchronous standby name wildcard [#629] #### Fixed * Fix supervisor messages about exited services. [#589] * Fix memory management issue in monitor_register_node. 
[#590] * Fix a buffer overlap instruction that macOs libc fails to process. [#610] * Add pg_logging_init for PG version 12 and above [#612] * Fix skip hba propagation [#588, #609] * Improve DNS lookup error handling. [#615] * Do not leak psycopg2 connections during testing [#628] ### pg_auto_failover v1.4.2 (February 3, 2021) ### This is a bugfix release for v1.4 series #### Added * Implement pg_autoctl do azure commands (QA tooling). [#544] * pg autoctl show settings. [#549] * Improve docker images (build, release). [#556] * Run monitor extension test suite in the CI. [#553] * Implement pg_autoctl create postgres --pg-hba-lan option. [#561] #### Fixed * Deduplicate PATH entries, following symlinks. [#547] * Review consequences of pg_autoctl set formation number-sync-standbys 0. [#535] * Fix bytes mangling opportunity. [#550] * Allow setting replication settings to their current value. [#570] * Fix the return code when stop the node that is started by pg_autoctl. [#572] * Set formation kind when expected. [#577] * Fix retry loops with wait time to stop using 100% CPU. [#578] ### pg_auto_failover v1.4.1 (December 3, 2020) ### This is a bugfix release for v1.4.0 #### Added * Implement HBA hostname DNS checks, and warn users when needed. [#458] * Make it obvious when an extension is missing. [#475] #### Changed * Refrain from using FIRST %d in synchronous_standby_names. [#461] * Always use node id and node name in messages. [#462] #### Fixed * Force closing the connection used to check replication slots. [#451] * Fix when to ProceedWithMSFailover after a candidate is selected. [#439] * Ensure that the monitor owns the Postgres (sub-)process. [#455] * Avoid a race condition between file_exists() and read_file(). [#460] * Review memory/malloc errors and potential leaks. [#478] * Review APPLY_SETTINGS transitions in cases including node removal. [#480] * Fix when we switch synchronous_standby_names to '*'. 
[#488] * Fix hostname discovery [#479] * Fix non default formation [#489] * Review when to assign catching-up on unhealthy secondary. [#493] * Fix race conditions in multiple-standby Group FSM for failover. [#499] * Fix synchronous_standby_names when disabling maintenance. [#502] * Fix debian stats temp directory [#504] * Use PG_CONFIG has a hint to choose one of multiple Postgres versions. [#510] * Fix build for *BSD [#519] * Refrain from considering a WAIT_STANDBY node unhealthy. [#524] * Allow a DEMOTED primary to come back to a new wait/join/primary. [#524] * Fix pg_autoctl show standby-names. [#524] * Fix unhealthy nodes processing when wait_primary. [#521] * Fix FSM when a non-candidate node is back online. [#533] * Fix assigning catchingup to unhealthy secondaries. [#534] * Fix pg_autoctl set formation number-sync-standbys log output. [#536] ### pg_auto_failover v1.4.0 (September 23, 2020) ### The main focus of this release is the new capability of pg_auto_failover to manage any number of standby nodes, where it used to manage only a single secondary before. This comes with new documentation coverage and Postgres production architectures support. The main changes of this release are: - A Postgres group allows any number of Postgres nodes, any of them can be setup to participate in the replication quorum (it's then a sync standby) or not (it's then an async standby). - Any node can be setup with a candidate priority of zero, and then a failover will never pick this node as the new primary. Two nodes with non-zero candidate priority are expected in any group, and the monitor enforces that. - The --nodename option is removed, replaced by the new option --hostname. At upgrade time your configuration file is migrated to the new format automatically. Adding to that, it is now possible to give names to your nodes. This change breaks scripts that you may have: replace --nodename with --hostname, add --name if needed. 
It is also possible to edit a node's metadata at runtime or when starting a node. The command `pg_autoctl run` now supports options `--name`, `--hostname`, and `--pgport`. That's useful in environments where IP addresses must be used and can change at each reboot. - The `pg_autoctl` process is now the parent process of Postgres. As of `pg_autoctl` 1.4 Postgres is running only when `pg_autoctl` is running. This simplifies systemd integration and allows a better upgrade process. It also makes `pg_autoctl` easier to run in containers. - It is possible to `pg_autoctl create postgres` when PGDATA is an already existing data_directory: as usual the monitor decides if that node is going to be a primary or a standby. All the Postgres nodes in the same group must have the same Postgres system identifer, and that is now enforced on the monitor. #### Added * Allow multiple standbys in a Postgres group [#326] * Allow adding a standby node from an exiting PGDATA directory [#276] * Edit the HBA file as soon as a new node is added to the group [#311] * Check for monitor extension version every time we connect [#318] * Command pg_autoctl perform failover now waits until failover is done [#290] * Implement pg_autoctl enable maintenance --allow-failover [#306] * Implement a node name [#340] * Test primary_conninfo before starting Postgres standby [#350] * Compute some memory/cpu Postgres tuning for the host [#335] * Provide a better pg_auto_failover upgrade experience [#296] * Add support for building against Postgres 13 [#312] * Allow registering multiple standbys concurrently [#395] * Implement a retry policy for deadlocks and other transient failures [#359] * Implement support for interactive tmux sessions [#409] * Use monitor notifications to wake up from sleep in the main keeper loop [#387] #### Changed * Make pg_autoctl the parent process for Postgres [#265, #267, #284] * Rename --nodename to --hostname, and nodename to nodehost [#273] * Improve output for many pg_autoctl 
commands, including JSON output * Log and notify node health changes [#334] * Set default password encryption based on --auth [#383] #### Fixed * Fix pg_autoctl perform failover/switchover default --group [#289] * Fix pgautofailover extension upgrade script from 1.2 to 1.3 [#288] * Improve connection retry attempts [#299, #302] * Skip network DNS probes on pg_autoctl create monitor --skip-pg-hba [#298] * Fix pg_autoctl perform failover [#294, #307] * Do not always require --ssl-ca-file for custom SSL [#303] * Review the registering process & transaction [#309] * Fix usage of replication slots to avoid a Postgres bug [#321] * Fix fatal auth pgautofailover monitor [#361] ### pg_auto_failover v1.3.1 (May 7, 2020) ### * Fix build system for packaging extension files [#258] ### pg_auto_failover v1.3 (April 30, 2020) ### * Fix systemd integration [#212] * Change default TLS cipher list [#214] * SSL certificates management fixes [#228] * Improve replication slots handling [#222] * Implement pg_autoctl enable|disable ssl [#233] * Implement pg_autoctl show uri --monitor [#238] * Implement pg_autoctl stop|reload for the monitor [#241] * Don't create pgautofailover_monitor user anymore [#247] ### pg_auto_failover v1.2 (March 17, 2020) ### * Feature implement an option (--skip-pg-hba) to skip editing of HBA rules by pg_autoctl (#169) * Feature implement support for SSL connections (--ssl-mode) (#171) * Feature implement pg_autoctl drop monitor and drop node --nodename --nodeport (#179) * Feature implement SSL support with self-signed certificates * Feature make --auth option mandatory when creating nodes * Fix error out when the pgautofailover is not in shared_preload_libraries * Fixes for warnings found in static analysis ### pg_auto_failover v1.0.6 (Feb 13, 2020) ### * Fix permissions missing in monitor database #94 via $141 * Fix creating a secondary server in an already existing directory. 
(#96) * Fix unable to get --pgdata value in pg_autoctl get config command #99 * Fix registering a pre-existing Postgres cluster to the monitor #111 * Fix demoted primary cannot catchup, wrong working directory? #129 * Fix review main loop chatter, make it less verbose by default. (#97) * Fix refrain from using PGDATA as the systemd service WorkingDirectory. (#123) * Fix behaviour with stale postmaster pid (#152) * Feature add perform destroy command, and -destroy option to drop node command #141 * Feature support debian/ubuntu style PostgreSQLclusters #135 * Feature add files option to show command * Feature add --run option to create command #110 * Feature add --json option to all commands for json output #106 * Feature report current LSN of Postgres nodes rather than WAL lag. (#53) ### pg_auto_failover v1.0.5 (Sep 20, 2019) ### * Fix not being able to create monitor/postgres nodes on mac (#60) * Add man page entry (#61) ### pg_auto_failover v1.0.4 (Sep 5, 2019) ### * Add PG 12 support ### pg_auto_failover v1.0.3 (Jul 30, 2019) ### * Add support for systemd integration * Allow pg_auto_failover extension upgrade * Add enable/disable maintenance command in CLI * Add --auth option to configure authentication method * Fix crash when ip address can not be resolved in network interface (#40) * Fix replication slot being left open after a failover (#42) * Other minor fixes ### pg_auto_failover v1.0.2 (May 23, 2019) ### * Implement a default value for --nodename (#6, #16) * Code cleanup ### pg_auto_failover v1.0.1 (May 6, 2019) ### * Fix a problem where the Postgres service was not restarted when shutdown (#2) * Clarify name in background workers for the monitor (#3) * Show full version number in `pg_autoctl version` (#4) * Warn the user when the primary node is not running pg_autoctl as a service while initializing the secondary (#11) * Improve documentation (#10) ### pg_auto_failover v1.0.0 (May 6, 2019) ### * First release. 
pg_auto_failover-1.6.3/CONTRIBUTING.md000066400000000000000000000065761414244367200173240ustar00rootroot00000000000000## Contributing This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.microsoft.com. When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repositories using our CLA. This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. ### Following our coding conventions We format all our code using the coding conventions in the [citus_indent](https://github.com/citusdata/tools/tree/develop/uncrustify) tool. This tool uses uncrustify under the hood. To format the python test files we use [black](https://github.com/psf/black). ```bash # Uncrustify changes the way it formats code every release a bit. To make sure # everyone formats consistently we use version 0.68.1: curl -L https://github.com/uncrustify/uncrustify/archive/uncrustify-0.68.1.tar.gz | tar xz cd uncrustify-uncrustify-0.68.1/ mkdir build cd build cmake .. make -j5 sudo make install cd ../.. 
git clone https://github.com/citusdata/tools.git cd tools make uncrustify/.install # Be sure to add ~/.local/bin to PATH so you can find black pip install black --user ``` After installing like this you can run the following before committing: ```bash make indent ``` You can also run the following to automatically format all the files that you have changed before committing. ```bash cat > .git/hooks/pre-commit << __EOF__ #!/bin/bash citus_indent --check --diff || { citus_indent --diff; exit 1; } black --check --quiet . || { black .; exit 1; } __EOF__ chmod +x .git/hooks/pre-commit ``` ### Running tests The integration tests are written using Python and the [nose](https://nose.readthedocs.io/en/latest/index.html) testing framework. They are run in a docker container, so you need [docker](https://docs.docker.com/get-docker/) installed locally. ```bash make run-test ``` You can filter the tests you are running with the `TEST` environment variable. ```bash make TEST=multi run-test # runs tests matching tests/test_multi* make TEST=single run-test # runs tests _not_ matching tests/test_multi* make TEST=test_auth run-test # runs tests/test_auth.py ``` ### Producing the documentation diagrams The diagrams are TikZ sources, which means they're edited with your usual editor tooling. The diagrams are actually code, and the compilation tool chain involves the following software: - LuaTex - TikZ - pdftocairo, found in the poppler software Current TeX distributions should include luatex and tikz for you already. One such distribution is TexLive and is widely available. If you want to use TexLive, note that you may need to install some extra packages for the styles we use in our PDFs. For example `texlive-fonts-extra` is needed for Debian. 
#### For Ubuntu ``` sudo apt-get install latexmk texlive texlive-luatex texlive-latex-extra poppler-utils ``` pg_auto_failover-1.6.3/Dockerfile000066400000000000000000000072111414244367200170500ustar00rootroot00000000000000FROM debian:buster-slim as build-test ENV PGVERSION 10 RUN apt-get update \ && apt-get install -y --no-install-recommends \ build-essential \ ca-certificates \ curl \ gnupg \ git \ iproute2 \ libicu-dev \ libkrb5-dev \ libssl-dev \ libedit-dev \ libreadline-dev \ libpam-dev \ zlib1g-dev \ libxml2-dev \ libxslt1-dev \ libselinux1-dev \ libncurses-dev \ libncurses6 \ make \ openssl \ pipenv \ python3-nose \ python3 \ python3-setuptools \ python3-psycopg2 \ python3-pip \ sudo \ tmux \ watch \ lsof \ psutils \ valgrind \ postgresql-common \ && rm -rf /var/lib/apt/lists/* RUN pip3 install pyroute2>=0.5.17 RUN curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - RUN echo "deb http://apt.postgresql.org/pub/repos/apt buster-pgdg main" > /etc/apt/sources.list.d/pgdg.list # bypass initdb of a "main" cluster RUN echo 'create_main_cluster = false' | sudo tee -a /etc/postgresql-common/createcluster.conf RUN apt-get update \ && apt-get install -y --no-install-recommends \ postgresql-server-dev-${PGVERSION} \ postgresql-${PGVERSION} \ && rm -rf /var/lib/apt/lists/* RUN pip3 install pyroute2>=0.5.17 RUN adduser --disabled-password --gecos '' docker RUN adduser docker sudo RUN adduser docker postgres RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers WORKDIR /usr/src/pg_auto_failover COPY Makefile ./ COPY ./src/ ./src RUN make -s clean && make -s install -j8 COPY ./tests/ ./tests COPY ./valgrind ./valgrind RUN chmod a+w ./valgrind USER docker ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/lib/postgresql/${PGVERSION}/bin ENV PG_AUTOCTL_DEBUG 1 FROM debian:buster-slim as run ENV PGVERSION 10 RUN apt-get update \ && apt-get install -y --no-install-recommends \ ca-certificates \ curl \ gnupg \ make \ sudo \ tmux \ 
watch \ libncurses6 \ lsof \ psutils \ dnsutils \ bind9-host \ postgresql-common \ libpq-dev \ && rm -rf /var/lib/apt/lists/* RUN curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - RUN echo "deb http://apt.postgresql.org/pub/repos/apt buster-pgdg main" > /etc/apt/sources.list.d/pgdg.list # bypass initdb of a "main" cluster RUN echo 'create_main_cluster = false' | sudo tee -a /etc/postgresql-common/createcluster.conf RUN apt-get update\ && apt-get install -y --no-install-recommends postgresql-${PGVERSION} \ && rm -rf /var/lib/apt/lists/* RUN adduser --disabled-password --gecos '' --home /var/lib/postgres docker RUN adduser docker sudo RUN adduser docker postgres RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers COPY --from=build-test /usr/lib/postgresql/${PGVERSION}/lib/pgautofailover.so /usr/lib/postgresql/${PGVERSION}/lib COPY --from=build-test /usr/share/postgresql/${PGVERSION}/extension/pgautofailover* /usr/share/postgresql/${PGVERSION}/extension/ COPY --from=build-test /usr/lib/postgresql/${PGVERSION}/bin/pg_autoctl /usr/local/bin # # In tests/upgrade/docker-compose.yml we use internal docker volumes in # order to be able to restart the nodes and keep the data around. For that # to work, we must prepare a mount-point that is owned by our target user # (docker), so that once the volume in mounted there by docker compose, # pg_autoctl has the necessary set of privileges. 
# RUN mkdir -p /var/lib/postgres \ && chown -R docker /var/lib/postgres USER docker ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/lib/postgresql/${PGVERSION}/bin ENV PG_AUTOCTL_DEBUG 1 ENV PGDATA /var/lib/postgres/pgaf CMD pg_autoctl do tmux session --nodes 3 --binpath /usr/local/bin/pg_autoctl pg_auto_failover-1.6.3/Dockerfile.i386000066400000000000000000000035361414244367200175460ustar00rootroot00000000000000FROM i386/ubuntu:bionic ENV PGVERSION 10 RUN DEBIAN_FRONTEND=noninteractive TZ=Europe/Paris \ apt-get update \ && DEBIAN_FRONTEND=noninteractive TZ=Europe/Paris \ apt-get install -y --no-install-recommends \ build-essential \ ca-certificates \ git \ iproute2 \ libicu-dev \ libkrb5-dev \ libssl-dev \ libedit-dev \ libreadline-dev \ libpam-dev \ zlib1g-dev \ libxml2-dev \ libxslt1-dev \ libselinux1-dev \ make \ openssl \ python3-nose \ python3 \ python3-setuptools \ python3-psycopg2 \ python3-pip \ sudo \ tmux \ watch \ lsof \ psutils \ valgrind \ postgresql-common \ postgresql-server-dev-${PGVERSION} \ && rm -rf /var/lib/apt/lists/* RUN pip3 install pyroute2>=0.5.17 RUN pip3 install pipenv # install Postgres 11 (current in bullseye), bypass initdb of a "main" cluster RUN echo 'create_main_cluster = false' | sudo tee -a /etc/postgresql-common/createcluster.conf RUN DEBIAN_FRONTEND=noninteractive TZ=Europe/Paris \ apt-get update \ && DEBIAN_FRONTEND=noninteractive TZ=Europe/Paris \ apt-get install -y --no-install-recommends postgresql-${PGVERSION} \ && rm -rf /var/lib/apt/lists/* RUN adduser --disabled-password --gecos '' docker RUN adduser docker sudo RUN adduser docker postgres RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers WORKDIR /usr/src/pg_auto_failover COPY Makefile ./ COPY ./src/ ./src RUN make -s clean && make -s install COPY ./tests/ ./tests COPY ./valgrind ./valgrind RUN chmod a+w ./valgrind # make installcheck needs to write in src/monitor (regressions.diffs and such) RUN chmod -R a+rwx ./src/monitor USER docker 
ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/lib/postgresql/${PGVERSION}/bin ENV PG_AUTOCTL_DEBUG 1 ENV PGDATA /tmp/monitor RUN pg_autoctl create monitor --auth trust --no-ssl pg_auto_failover-1.6.3/LICENSE000066400000000000000000000017611414244367200160670ustar00rootroot00000000000000Copyright (c) Microsoft Corporation. All rights reserved. PostgreSQL License Permission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies. IN NO EVENT SHALL MICROSOFT CORPORATION BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF MICROSOFT CORPORATION HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MICROSOFT CORPORATION SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND MICROSOFT CORPORATION HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. pg_auto_failover-1.6.3/Makefile000066400000000000000000000170371414244367200165250ustar00rootroot00000000000000# Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the PostgreSQL License. 
TOP := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) CONTAINER_NAME = pg_auto_failover TEST_CONTAINER_NAME = pg_auto_failover_test DOCKER_RUN_OPTS = --privileged -ti --rm NOSETESTS = $(shell which nosetests3 || which nosetests) # Tests for the monitor TESTS_MONITOR = test_extension_update TESTS_MONITOR += test_installcheck TESTS_MONITOR += test_monitor_disabled TESTS_MONITOR += test_replace_monitor # This could be in TESTS_MULTI, but adding it here optimizes Travis run time TESTS_MONITOR += test_multi_alternate_primary_failures # Tests for single standby TESTS_SINGLE = test_auth TESTS_SINGLE += test_basic_operation TESTS_SINGLE += test_basic_operation_listen_flag TESTS_SINGLE += test_create_run TESTS_SINGLE += test_create_standby_with_pgdata TESTS_SINGLE += test_ensure TESTS_SINGLE += test_skip_pg_hba TESTS_SINGLE += test_config_get_set # Tests for SSL TESTS_SSL = test_enable_ssl TESTS_SSL += test_ssl_cert TESTS_SSL += test_ssl_self_signed # This could be in TESTS_SINGLE, but adding it here optimizes Travis run time TESTS_SSL += test_debian_clusters # Tests for multiple standbys TESTS_MULTI = test_multi_async TESTS_MULTI += test_multi_ifdown TESTS_MULTI += test_multi_maintenance TESTS_MULTI += test_multi_standbys # TEST indicates the testfile to run TEST ?= ifeq ($(TEST),) TEST_ARGUMENT = --where=tests else ifeq ($(TEST),multi) TEST_ARGUMENT = --where=tests --tests=$(TESTS_MULTI) else ifeq ($(TEST),single) TEST_ARGUMENT = --where=tests --tests=$(TESTS_SINGLE) else ifeq ($(TEST),monitor) TEST_ARGUMENT = --where=tests --tests=$(TESTS_MONITOR) else ifeq ($(TEST),ssl) TEST_ARGUMENT = --where=tests --tests=$(TESTS_SSL) else TEST_ARGUMENT = $(TEST:%=tests/%.py) endif # Documentation and images FSM = docs/fsm.png PDF = ./docs/_build/latex/pg_auto_failover.pdf # Command line with DEBUG facilities VALGRIND ?= ifeq ($(VALGRIND),) BINPATH = ./src/bin/pg_autoctl/pg_autoctl PG_AUTOCTL = PG_AUTOCTL_DEBUG=1 ./src/bin/pg_autoctl/pg_autoctl else BINPATH = $(abspath 
$(TOP))/src/tools/pg_autoctl.valgrind PG_AUTOCTL = PG_AUTOCTL_DEBUG=1 PG_AUTOCTL_DEBUG_BIN_PATH="$(BINPATH)" ./src/tools/pg_autoctl.valgrind endif NODES ?= 2 # total count of Postgres nodes NODES_ASYNC ?= 0 # count of replication-quorum false nodes NODES_PRIOS ?= 50 # either "50", or "50,50", or "50,50,0" etc NODES_SYNC_SB ?= -1 FIRST_PGPORT ?= 5500 CLUSTER_OPTS = "" # could be "--skip-pg-hba" TMUX_EXTRA_COMMANDS ?= "" TMUX_LAYOUT ?= even-vertical # could be "tiled" TMUX_TOP_DIR = ./tmux TMUX_SCRIPT = ./tmux/script-$(FIRST_PGPORT).tmux # make azcluster arguments AZURE_PREFIX ?= ha-demo-$(shell whoami) AZURE_REGION ?= paris AZURE_LOCATION ?= francecentral # Pick a version of Postgres and pg_auto_failover packages to install # in our target Azure VMs when provisionning # # sudo apt-get install -q -y postgresql-13-auto-failover-1.5=1.5.2 # postgresql-${AZ_PG_VERSION}-auto-failover-${AZ_PGAF_DEB_VERSION}=${AZ_PGAF_VERSION} AZ_PG_VERSION ?= 13 AZ_PGAF_DEB_VERSION ?= 1.6 AZ_PGAF_DEB_REVISION ?= 1.6.3-1 export AZ_PG_VERSION export AZ_PGAF_DEB_VERSION export AZ_PGAF_DEB_REVISION all: monitor bin ; install: install-monitor install-bin ; clean: clean-monitor clean-bin ; check: check-monitor ; monitor: $(MAKE) -C src/monitor/ all clean-monitor: $(MAKE) -C src/monitor/ clean install-monitor: monitor $(MAKE) -C src/monitor/ install check-monitor: install-monitor $(MAKE) -C src/monitor/ installcheck bin: $(MAKE) -C src/bin/ all clean-bin: $(MAKE) -C src/bin/ clean install-bin: bin $(MAKE) -C src/bin/ install test: sudo -E env "PATH=${PATH}" USER=$(shell whoami) \ $(NOSETESTS) \ --verbose \ --nologcapture \ --nocapture \ --stop \ ${TEST_ARGUMENT} indent: citus_indent black . docs: $(FSM) $(MAKE) -C docs html build: docker build \ $(DOCKER_BUILD_OPTS) \ -t $(CONTAINER_NAME) \ . interactive-test: docker run --name $(CONTAINER_NAME) --rm -ti $(CONTAINER_NAME) build-test: docker build \ $(DOCKER_BUILD_OPTS) \ --target build-test \ -t $(TEST_CONTAINER_NAME) \ . 
run-test: build-test docker run \ --name $(TEST_CONTAINER_NAME) \ $(DOCKER_RUN_OPTS) \ $(TEST_CONTAINER_NAME) \ make -C /usr/src/pg_auto_failover test \ TEST='${TEST}' build-i386: docker build -t i386:latest -f Dockerfile.i386 . # expected to be run from within the i386 docker container installcheck-i386: pg_autoctl run & pg_autoctl do pgsetup wait $(MAKE) -C src/monitor installcheck run-installcheck-i386: build-i386 docker run --platform linux/386 --rm -it --privileged i386 make installcheck-i386 man: $(MAKE) -C docs man pdf: $(PDF) $(PDF): $(MAKE) -s -C docs/tikz pdf perl -pi -e 's/(^.. figure:: .*)\.svg/\1.pdf/' docs/*.rst perl -pi -e 's/▒/~/g' docs/ref/pg_autoctl_do_demo.rst $(MAKE) -s -C docs latexpdf perl -pi -e 's/(^.. figure:: .*)\.pdf/\1.svg/' docs/*.rst perl -pi -e 's/~/▒/g' docs/ref/pg_autoctl_do_demo.rst ls -l $@ $(FSM): bin $(PG_AUTOCTL) do fsm gv | dot -Tpng > $@ $(TMUX_SCRIPT): bin mkdir -p $(TMUX_TOP_DIR) $(PG_AUTOCTL) do tmux script \ --root $(TMUX_TOP_DIR) \ --first-pgport $(FIRST_PGPORT) \ --nodes $(NODES) \ --async-nodes $(NODES_ASYNC) \ --node-priorities $(NODES_PRIOS) \ --sync-standbys $(NODES_SYNC_SB) \ $(CLUSTER_OPTS) \ --binpath $(BINPATH) \ --layout $(TMUX_LAYOUT) > $@ tmux-script: $(TMUX_SCRIPT) ; tmux-clean: bin $(PG_AUTOCTL) do tmux clean \ --root $(TMUX_TOP_DIR) \ --first-pgport $(FIRST_PGPORT) \ --nodes $(NODES) tmux-session: bin $(PG_AUTOCTL) do tmux session \ --root $(TMUX_TOP_DIR) \ --first-pgport $(FIRST_PGPORT) \ --nodes $(NODES) \ --async-nodes $(NODES_ASYNC) \ --node-priorities $(NODES_PRIOS) \ --sync-standbys $(NODES_SYNC_SB) \ $(CLUSTER_OPTS) \ --binpath $(BINPATH) \ --layout $(TMUX_LAYOUT) cluster: install tmux-clean # This is explicitly not a target, otherwise when make uses multiple jobs # tmux-clean and tmux-session can have a race condidition where tmux-clean # removes the files that are just created by tmux-session. 
$(MAKE) tmux-session valgrind-session: build-test docker run \ --name $(TEST_CONTAINER_NAME) \ $(DOCKER_RUN_OPTS) \ $(TEST_CONTAINER_NAME) \ make -C /usr/src/pg_auto_failover \ VALGRIND=1 \ TMUX_TOP_DIR=/tmp/tmux \ NODES=$(NODES) \ NODES_ASYNC=$(NODES_ASYNC) \ NODES_PRIOS=$(NODES_PRIOS) \ NODES_SYNC_SB=$(NODES_SYNC_SB) \ CLUSTER_OPTS=$(CLUSTER_OPTS) \ TMUX_EXTRA_COMMANDS=$(TMUX_EXTRA_COMMANDS) \ TMUX_LAYOUT=$(TMUX_LAYOUT) \ tmux-session azcluster: all $(PG_AUTOCTL) do azure create \ --prefix $(AZURE_PREFIX) \ --region $(AZURE_REGION) \ --location $(AZURE_LOCATION) \ --nodes $(NODES) # make azcluster has been done before, just re-attach az: all $(PG_AUTOCTL) do azure tmux session azdrop: all $(PG_AUTOCTL) do azure drop .PHONY: all clean check install docs .PHONY: monitor clean-monitor check-monitor install-monitor .PHONY: bin clean-bin install-bin .PHONY: build-test run-test .PHONY: tmux-clean cluster .PHONY: azcluster azdrop az pg_auto_failover-1.6.3/NOTICE000066400000000000000000000130511414244367200157610ustar00rootroot00000000000000NOTICES AND INFORMATION Do Not Translate or Localize This software incorporates material from third parties. Microsoft makes certain open source code available at https://3rdpartysource.microsoft.com, or you may send a check or money order for US $5.00, including the product name, the open source component name, platform, and version number, to: Source Code Compliance Team Microsoft Corporation One Microsoft Way Redmond, WA 98052 USA Notwithstanding any other terms, you may reverse engineer this software to the extent required to debug changes to any libraries licensed under the GNU Lesser General Public License. 
--------------------------------------------------------- kgabis/parson 8beeb5ea4da5eedff8d3221307ef04855804a920 - MIT Copyright (c) 2012 - 2020 Krzysztof Gabis MIT License Copyright (c) 2012 - 2020 Krzysztof Gabis Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. --------------------------------------------------------- --------------------------------------------------------- rxi/log.c f9ea34994bd58ed342d2245cd4110bb5c6790153 - MIT Copyright (c) 2020 rxi Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. --------------------------------------------------------- mattiasgustavsson/libs a64e6e6f06b7b8392cec5614280f70411282508c - MIT Copyright (c) 2015 Mattias Gustavsson Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
--------------------------------------------------------- postgres/postgres 9213462c539e6412fe0498a7f8e20b662e15c4ec PostgreSQL Database Management System (formerly known as Postgres, then as Postgres95) Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group Portions Copyright (c) 1994, The Regents of the University of California Permission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies. IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. --------------------------------------------------------- pg_auto_failover-1.6.3/README.md000066400000000000000000000336261414244367200163460ustar00rootroot00000000000000# pg_auto_failover [![Slack Status](http://slack.citusdata.com/badge.svg)](https://slack.citusdata.com) [![Documentation Status](https://readthedocs.org/projects/pg-auto-failover/badge/?version=master)](https://pg-auto-failover.readthedocs.io/en/master/?badge=master) pg_auto_failover is an extension and service for PostgreSQL that monitors and manages automated failover for a Postgres cluster. It is optimized for simplicity and correctness and supports Postgres 10 and newer. 
pg_auto_failover supports several Postgres architectures and implements a safe automated failover for your Postgres service. It is possible to get started with only two data nodes which will be given the roles of primary and secondary by the monitor. ![pg_auto_failover Architecture with 2 nodes](docs/tikz/arch-single-standby.svg?raw=true "pg_auto_failover Architecture with 2 nodes") The pg_auto_failover Monitor implements a state machine and relies on in-core PostgreSQL facilities to deliver HA. For example. when the **secondary** node is detected to be unavailable, or when its lag is too much, then the Monitor removes it from the `synchronous_standby_names` setting on the **primary** node. Until the **secondary** is back to being monitored healthy, failover and switchover operations are not allowed, preventing data loss. pg_auto_failover consists of the following parts: - a PostgreSQL extension named `pgautofailover` - a PostgreSQL service to operate the pg_auto_failover monitor - a pg_auto_failover keeper to operate your PostgreSQL instances, see `pg_autoctl run` Starting with pg_auto_failover version 1.4, it is possible to implement a production architecture with any number of Postgres nodes, for better data availability guarantees. ![pg_auto_failover Architecture with 3 nodes](docs/tikz/arch-multi-standby.svg?raw=true "pg_auto_failover Architecture with 3 nodes") By default, pg_auto_failover uses synchronous replication and every node that reaches the secondary state is added to synchronous_standby_names on the primary. With pg_auto_failover 1.4 it is possible to remove a node from the _replication quorum_ of Postgres. ## Dependencies At runtime, pg_auto_failover depends on only Postgres. Postgres versions 10, 11, 12, 13, and 14 are currently supported. At buildtime. 
pg_auto_failover depends on Postgres server development package like any other Postgres extensions (the server development package for Postgres 11 when using debian or Ubuntu is named `postgresql-server-dev-11`), and then `libssl-dev` and `libkrb5-dev` are needed to for the client side when building with all the `libpq` authentication options. ## Documentation Please check out project [documentation](https://pg-auto-failover.readthedocs.io/en/master/) for how to guides and troubleshooting information. ## Installing pg_auto_failover from packages Note that pg_auto_failover packages are also found in Postgres PGDG package repositories. If you're using those repositories already, you can install the packages from there. ### Ubuntu or Debian: When using debian, two packages are provided for pg_auto_failover: the monitor Postgres extension is packaged separately and depends on the Postgres version you want to run for the monitor itself. The monitor's extension package is named `postgresql-14-auto-failover` when targetting Postgres 14. Then another package is prepared that contains the `pg_autoctl` command, and the name of the package is `pg-auto-failover-cli`. That's the package that is needed for the Postgres nodes. ```bash # Add the repository to your system curl https://install.citusdata.com/community/deb.sh | sudo bash # Install pg_auto_failover sudo apt-get install postgresql-11-auto-failover # Confirm installation /usr/bin/pg_autoctl --version ``` ### Fedora, CentOS, or Red Hat: ```bash # Add the repository to your system curl https://install.citusdata.com/community/rpm.sh | sudo bash # Install pg_auto_failover sudo yum install -y pg-auto-failover10_11 # Confirm installation /usr/pgsql-11/bin/pg_autoctl --version ``` ## Building pg_auto_failover from source To build the project, make sure you have installed the build-dependencies, then just type `make`. You can install the resulting binary using `make install`. 
Build dependencies example on debian for Postgres 11: ~~~ bash $ sudo apt-get install postgresql-server-dev-11 libssl-dev libkrb5-dev libncurses6 ~~~ Then build pg_auto_failover from sources with the following instructions: ~~~ bash $ make $ sudo make install -j10 ~~~ For this to work though, the PostgreSQL client (libpq) and server (postgresql-server-dev) libraries must be available in your standard include and link paths. The `make install` step will deploy the `pgautofailover` PostgreSQL extension in the PostgreSQL directory for extensions as pointed by `pg_config`, and install the `pg_autoctl` binary command in the directory pointed to by `pg_config --bindir`, alongside other PostgreSQL tools such as `pg_ctl` and `pg_controldata`. ## Trying pg_auto_failover on your local computer Once the building and installation is done, follow those steps: 0. If you're building from sources, and if you've already been using tmux, then try the following command: ~~~ bash $ make cluster ~~~ This creates a tmux session with multiple panes that are each running a node for pg_auto_failover: the monitor, a first Postgres node, a second Postgres node, and then there is another tmux pane for interactive commands. 1. Install and run a monitor ~~~ bash $ export PGDATA=./monitor $ export PGPORT=5000 $ pg_autoctl create monitor --ssl-self-signed --hostname localhost --auth trust --run ~~~ 2. Get the Postgres URI (connection string) for the monitor node: ~~~ bash $ pg_autoctl show uri --monitor --pgdata ./monitor postgres://autoctl_node@localhost:5000/pg_auto_failover?sslmode=require ~~~ The following two steps are going to use the option `--monitor` which expects that connection string. So copy/paste your actual Postgres URI for the monitor in the next steps. 3. 
Install and run a primary PostgreSQL instance: ~~~ bash $ export PGDATA=./node_1 $ export PGPORT=5001 $ pg_autoctl create postgres \ --hostname localhost \ --auth trust \ --ssl-self-signed \ --monitor 'postgres://autoctl_node@localhost:5000/pg_auto_failover?sslmode=require' \ --run ~~~ 4. Install and run a secondary PostgreSQL instance, using exactly the same command, but with a different PGDATA and PGPORT, because we're running everything on the same host: ~~~ bash $ export PGDATA=./node_2 $ export PGPORT=5002 $ pg_autoctl create postgres \ --hostname localhost \ --auth trust \ --ssl-self-signed \ --monitor 'postgres://autoctl_node@localhost:5000/pg_auto_failover?sslmode=require' \ --run ~~~ 4. See the state of the new system: ~~~ bash $ pg_autoctl show state Name | Node | Host:Port | LSN | Reachable | Current State | Assigned State -------+-------+----------------+-----------+-----------+---------------------+-------------------- node_1 | 1 | localhost:5001 | 0/30000D8 | yes | primary | primary node_2 | 2 | localhost:5002 | 0/30000D8 | yes | secondary | secondary ~~~ That's it! You now have a running pg_auto_failover setup with two PostgreSQL nodes using Streaming Replication to implement fault-tolerance. ## Your first failover Now that we have two nodes setup and running, we can initiate a manual failover, also named a switchover. It is possible to trigger such an operation without any node having to actually fail when using pg_auto_failover. The command `pg_autoctl perform switchover` can be used to force pg_auto_failover to orchestrate a failover. Because all the nodes are actually running fine (meaning that `pg_autoctl` actively reports the local state of each node to the monitor), the failover process does not have to carefuly implement timeouts to make sure to avoid split-brain. 
~~~ bash $ pg_autoctl perform switchover 19:06:41 63977 INFO Listening monitor notifications about state changes in formation "default" and group 0 19:06:41 63977 INFO Following table displays times when notifications are received Time | Name | Node | Host:Port | Current State | Assigned State ---------+--------+-------+----------------+---------------------+-------------------- 19:06:43 | node_1 | 1 | localhost:5001 | primary | draining 19:06:43 | node_2 | 2 | localhost:5002 | secondary | prepare_promotion 19:06:43 | node_2 | 2 | localhost:5002 | prepare_promotion | prepare_promotion 19:06:43 | node_2 | 2 | localhost:5002 | prepare_promotion | stop_replication 19:06:43 | node_1 | 1 | localhost:5001 | primary | demote_timeout 19:06:43 | node_1 | 1 | localhost:5001 | draining | demote_timeout 19:06:43 | node_1 | 1 | localhost:5001 | demote_timeout | demote_timeout 19:06:44 | node_2 | 2 | localhost:5002 | stop_replication | stop_replication 19:06:44 | node_2 | 2 | localhost:5002 | stop_replication | wait_primary 19:06:44 | node_1 | 1 | localhost:5001 | demote_timeout | demoted 19:06:44 | node_1 | 1 | localhost:5001 | demoted | demoted 19:06:44 | node_2 | 2 | localhost:5002 | wait_primary | wait_primary 19:06:45 | node_1 | 1 | localhost:5001 | demoted | catchingup 19:06:46 | node_1 | 1 | localhost:5001 | catchingup | catchingup 19:06:47 | node_1 | 1 | localhost:5001 | catchingup | secondary 19:06:47 | node_2 | 2 | localhost:5002 | wait_primary | primary 19:06:47 | node_1 | 1 | localhost:5001 | secondary | secondary 19:06:48 | node_2 | 2 | localhost:5002 | primary | primary ~~~ The promotion of the secondary node is finished when the node reaches the goal state *wait_primary*. At this point, the application that connects to the secondary is allowed to proceed with write traffic. Because this is a switchover and no nodes have failed, `node_1` that used to be the primary completes its cycle and joins as a secondary within the same operation. 
The Postgres tool `pg_rewind` is used to implement that transition. And there you have done a full failover from your `node_1`, former primary, to your `node_2`, new primary. We can have a look at the state now: ~~~ $ pg_autoctl show state Name | Node | Host:Port | LSN | Reachable | Current State | Assigned State -------+-------+----------------+-----------+-----------+---------------------+-------------------- node_1 | 1 | localhost:5001 | 0/3001648 | yes | secondary | secondary node_2 | 2 | localhost:5002 | 0/3001648 | yes | primary | primary ~~~ ## Cleaning-up your local setup You can use the commands `pg_autoctl stop`, `pg_autoctl drop node --destroy`, and `pg_autoctl drop monitor --destroy` if you want to get rid of everything set-up so far. ## Formations and Groups In the previous example, the options `--formation` and `--group` are not used. This means we've been using the default values: the default formation is named *default* and the default group id is zero (0). It's possible to add other services to the same running monitor by using another formation. ## Installing pg_auto_failover on-top of an existing Postgres setup The `pg_autoctl create postgres --pgdata ${PGDATA}` step can be used with an existing Postgres installation running at `${PGDATA}`, only with the primary node. On a secondary node, it is possible to re-use an existing data directory when it has the same `system_identifier` as the other node(s) already registered in the same formation and group. ## Application and Connection Strings To retrieve the connection string to use at the application level, use the following command: ~~~ bash $ pg_autoctl show uri --formation default --pgdata ... postgres://localhost:5002,localhost:5001/postgres?target_session_attrs=read-write&sslmode=require ~~~ You can use that connection string from within your application, adjusting the username that is used to connect. 
By default, pg_auto_failover edits the Postgres HBA rules to allow the `--username` given at `pg_autoctl create postgres` time to connect to this URI from the database node itself. To allow application servers to connect to the Postgres database, edit your `pg_hba.conf` file as documented in [the pg_hba.conf file](https://www.postgresql.org/docs/current/auth-pg-hba-conf.html) chapter of the PostgreSQL documentation. ## Reporting Security Issues Security issues and bugs should be reported privately, via email, to the Microsoft Security Response Center (MSRC) at [secure@microsoft.com](mailto:secure@microsoft.com). You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Further information, including the [MSRC PGP](https://technet.microsoft.com/en-us/security/dn606155) key, can be found in the [Security TechCenter](https://technet.microsoft.com/en-us/security/default). ## Authors * [Dimitri Fontaine](https://github.com/dimitri) * [Nils Dijk](https://github.com/thanodnl) * [Marco Slot](https://github.com/marcoslot) * [Louise Grandjonc](https://github.com/louiseGrandjonc) * [Joe Nelson](https://github.com/begriffs) * [Hadi Moshayedi](https://github.com/pykello) * [Lukas Fittl](https://github.com/lfittl) * [Murat Tuncer](https://github.com/mtuncer) * [Jelte Fennema](https://github.com/JelteF) ## License Copyright (c) Microsoft Corporation. All rights reserved. This project is licensed under the PostgreSQL License, see LICENSE file for details. This project includes bundled third-party dependencies, see NOTICE file for details. 
pg_auto_failover-1.6.3/cgmanifest.json000066400000000000000000000055501414244367200200750ustar00rootroot00000000000000{ "Registrations": [ { "Component": { "Type": "git", "git": { "RepositoryUrl": "https://github.com/rxi/log.c", "CommitHash": "f9ea34994bd58ed342d2245cd4110bb5c6790153" } }, "DevelopmentDependency": false }, { "Component": { "Type": "git", "git": { "RepositoryUrl": "https://github.com/kgabis/parson", "CommitHash": "8beeb5ea4da5eedff8d3221307ef04855804a920" } }, "DevelopmentDependency": false }, { "Component": { "Type": "git", "git": { "RepositoryUrl": "https://github.com/mattiasgustavsson/libs", "CommitHash": "a64e6e6f06b7b8392cec5614280f70411282508c" } }, "DevelopmentDependency": false }, { "Component": { "Type": "git", "git": { "RepositoryUrl": "https://github.com/postgres/postgres", "CommitHash": "9213462c539e6412fe0498a7f8e20b662e15c4ec" } }, "license": "PostgreSQL", "licenseDetail": [ "Portions Copyright (c) 1996-2010, The PostgreSQL Global Development Group", "", "Portions Copyright (c) 1994, The Regents of the University of California", "", "Permission to use, copy, modify, and distribute this software and its documentation for ", "any purpose, without fee, and without a written agreement is hereby granted, provided ", "that the above copyright notice and this paragraph and the following two paragraphs appear ", "in all copies.", "", "IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, ", "INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS ", "SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE ", "POSSIBILITY OF SUCH DAMAGE.", "", "THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, ", "THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
THE SOFTWARE PROVIDED ", "HEREUNDER IS ON AN \"AS IS\" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE ", "MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS." ], "version": "0.0.1", "DevelopmentDependency": false } ] }pg_auto_failover-1.6.3/ci/000077500000000000000000000000001414244367200154505ustar00rootroot00000000000000pg_auto_failover-1.6.3/ci/banned.h.sh000077500000000000000000000104211414244367200174620ustar00rootroot00000000000000#!/bin/sh # Checks for the APIs that are banned by microsoft. Since we compile for Linux # we use the replacements from https://github.com/intel/safestringlib # Not all replacement functions are available in safestringlib. If it doesn't # exist and you cannot rewrite the code to not use the banned API, then you can # add a comment containing "IGNORE-BANNED" to the line where the error is and # this check will ignore that match. # # The replacement function that you should use are listed here: # https://liquid.microsoft.com/Web/Object/Read/ms.security/Requirements/Microsoft.Security.SystemsADM.10082#guide set -eu files=$(find src -iname '*.[ch]' -type f | git check-attr --stdin citus-style | awk -F ':' '! /: unset$/ {print $1}') # grep is allowed to fail, that means no banned matches are found set +e # Required banned from banned.h. These functions are not allowed to be used at # all. 
# shellcheck disable=SC2086 grep -E '\b(strcpy|strcpyA|strcpyW|wcscpy|_tcscpy|_mbscpy|StrCpy|StrCpyA|StrCpyW|lstrcpy|lstrcpyA|lstrcpyW|_tccpy|_mbccpy|_ftcscpy|strcat|strcatA|strcatW|wcscat|_tcscat|_mbscat|StrCat|StrCatA|StrCatW|lstrcat|lstrcatA|lstrcatW|StrCatBuff|StrCatBuffA|StrCatBuffW|StrCatChainW|_tccat|_mbccat|_ftcscat|sprintfW|sprintfA|wsprintf|wsprintfW|wsprintfA|sprintf|swprintf|_stprintf|wvsprintf|wvsprintfA|wvsprintfW|vsprintf|_vstprintf|vswprintf|strncpy|wcsncpy|_tcsncpy|_mbsncpy|_mbsnbcpy|StrCpyN|StrCpyNA|StrCpyNW|StrNCpy|strcpynA|StrNCpyA|StrNCpyW|lstrcpyn|lstrcpynA|lstrcpynW|strncat|wcsncat|_tcsncat|_mbsncat|_mbsnbcat|StrCatN|StrCatNA|StrCatNW|StrNCat|StrNCatA|StrNCatW|lstrncat|lstrcatnA|lstrcatnW|lstrcatn|gets|_getts|_gettws|IsBadWritePtr|IsBadHugeWritePtr|IsBadReadPtr|IsBadHugeReadPtr|IsBadCodePtr|IsBadStringPtr|memcpy|RtlCopyMemory|CopyMemory|wmemcpy|lstrlen)\(' $files \ | grep -v "IGNORE-BANNED" \ && echo "ERROR: Required banned API usage detected" && exit 1 # Required banned from table on liquid. These functions are not allowed to be # used at all. # shellcheck disable=SC2086 grep -E '\b(strcat|strcpy|strerror|strncat|strncpy|strtok|wcscat|wcscpy|wcsncat|wcsncpy|wcstok|fprintf|fwprintf|printf|snprintf|sprintf|swprintf|vfprintf|vprintf|vsnprintf|vsprintf|vswprintf|vwprintf|wprintf|fscanf|fwscanf|gets|scanf|sscanf|swscanf|vfscanf|vfwscanf|vscanf|vsscanf|vswscanf|vwscanf|wscanf|asctime|atof|atoi|atol|atoll|bsearch|ctime|fopen|freopen|getenv|gmtime|localtime|mbsrtowcs|mbstowcs|memcpy|memmove|qsort|rewind|setbuf|wmemcpy|wmemmove)\(' $files \ | grep -v "IGNORE-BANNED" \ && echo "ERROR: Required banned API usage from table detected" && exit 1 # Recommended banned from banned.h. If you can change the code not to use these # that would be great. You can use IGNORE-BANNED if you need to use it anyway. # You can also remove it from the regex, if you want to mark the API as allowed # throughout the codebase (to not have to add IGNORED-BANNED everywhere). 
In # that case note it in this comment that you did so. # shellcheck disable=SC2086 grep -E '\b(wnsprintf|wnsprintfA|wnsprintfW|_snwprintf|_snprintf|_sntprintf|_vsnprintf|vsnprintf|_vsnwprintf|_vsntprintf|wvnsprintf|wvnsprintfA|wvnsprintfW|strtok|_tcstok|wcstok|_mbstok|makepath|_tmakepath| _makepath|_wmakepath|_splitpath|_tsplitpath|_wsplitpath|scanf|wscanf|_tscanf|sscanf|swscanf|_stscanf|snscanf|snwscanf|_sntscanf|_itoa|_itow|_i64toa|_i64tow|_ui64toa|_ui64tot|_ui64tow|_ultoa|_ultot|_ultow|CharToOem|CharToOemA|CharToOemW|OemToChar|OemToCharA|OemToCharW|CharToOemBuffA|CharToOemBuffW|alloca|_alloca|ChangeWindowMessageFilter)\(' $files \ | grep -v "IGNORE-BANNED" \ && echo "ERROR: Recomended banned API usage detected" && exit 1 # Recommended banned from table on liquid. If you can change the code not to use these # that would be great. You can use IGNORE-BANNED if you need to use it anyway. # You can also remove it from the regex, if you want to mark the API as allowed # throughout the codebase (to not have to add IGNORED-BANNED everywhere). In # that case note it in this comment that you did so. 
# Banned APIs ignored throughout the codebase: # - strlen # shellcheck disable=SC2086 grep -E '\b(alloca|getwd|mktemp|tmpnam|wcrtomb|wcrtombs|wcslen|wcsrtombs|wcstombs|wctomb|class_addMethod|class_replaceMethod)\(' $files \ | grep -v "IGNORE-BANNED" \ && echo "ERROR: Recomended banned API usage detected" && exit 1 exit 0 pg_auto_failover-1.6.3/docker-compose.yml000066400000000000000000000035401414244367200205140ustar00rootroot00000000000000version: "3.9" # optional since v1.27.0 services: monitor: image: citusdata/pg_auto_failover:demo environment: PGDATA: /tmp/pgaf PG_AUTOCTL_DEBUG: 1 command: pg_autoctl create monitor --ssl-self-signed --auth trust --run expose: - 5432 node1: image: citusdata/pg_auto_failover:demo environment: PGDATA: /tmp/pgaf PG_AUTOCTL_DEBUG: 1 command: [ "pg_autoctl", "create", "postgres", "--ssl-self-signed", "--auth", "trust", "--pg-hba-lan", "--username", "ad", "--dbname", "analytics", "--monitor", "postgresql://autoctl_node@monitor/pg_auto_failover", "--run"] expose: - 5432 node2: image: citusdata/pg_auto_failover:demo expose: - 5432 environment: PGDATA: /tmp/pgaf PG_AUTOCTL_DEBUG: 1 command: [ "pg_autoctl", "create", "postgres", "--ssl-self-signed", "--auth", "trust", "--pg-hba-lan", "--username", "ad", "--dbname", "analytics", "--monitor", "postgresql://autoctl_node@monitor/pg_auto_failover", "--run"] expose: - 5432 node3: image: citusdata/pg_auto_failover:demo environment: PGDATA: /tmp/pgaf PG_AUTOCTL_DEBUG: 1 command: [ "pg_autoctl", "create", "postgres", "--ssl-self-signed", "--auth", "trust", "--pg-hba-lan", "--username", "ad", "--dbname", "analytics", "--monitor", "postgresql://autoctl_node@monitor/pg_auto_failover", "--run"] expose: - 5432 demo-app: image: citusdata/pg_auto_failover:demo environment: PGDATA: /tmp/pgaf PG_AUTOCTL_DEBUG: 1 command: [ "pg_autoctl", "do", "demo", "run", "--username", "ad", "--clients", "10", "--duration", "125", "--first-failover", "45", "--failover-freq", "30", "--monitor", 
"postgresql://autoctl_node@monitor/pg_auto_failover"] pg_auto_failover-1.6.3/docs/000077500000000000000000000000001414244367200160055ustar00rootroot00000000000000pg_auto_failover-1.6.3/docs/.gitignore000066400000000000000000000000151414244367200177710ustar00rootroot00000000000000_build .venv pg_auto_failover-1.6.3/docs/Makefile000066400000000000000000000011451414244367200174460ustar00rootroot00000000000000# Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build SPHINXPROJ = pg_auto_failover SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)pg_auto_failover-1.6.3/docs/_static/000077500000000000000000000000001414244367200174335ustar00rootroot00000000000000pg_auto_failover-1.6.3/docs/_static/css/000077500000000000000000000000001414244367200202235ustar00rootroot00000000000000pg_auto_failover-1.6.3/docs/_static/css/citus.css000066400000000000000000000110511414244367200220620ustar00rootroot00000000000000/* General */ h1,h2,.rst-content .toctree-wrapper p.caption,h3,h4,h5,h6,legend { font-family: "Lato", "proxima-nova", "Helvetica Neue", Arial, sans-serif !important; } .rst-content span.highlighted { background: #FFC; } .btn-fat { padding: 1em !important; } .btn { border: 1px solid #404040; } .btn:focus { outline: 1px solid; } #search-results .context { color: #404040 !important; } a { color: #2678AF; } a.skiplink { color: #757575; margin-bottom: 0; } .wy-menu-vertical li.current { border-left: 2px solid; } .wy-menu-vertical li.current a { color: #656565; } .wy-menu-vertical li.toctree-l2.current > a, .wy-menu-vertical 
li.toctree-l2.current li.toctree-l3 > a { color: #555; z-index: 1; } .wy-menu-vertical li.toctree-l3.current li.toctree-l4 > a { color: #4b4b4b; } .wy-menu-vertical li button.toctree-expand { color: #b3b3b3; } .wy-table-odd a, .wy-table-striped tr:nth-child(2n-1) a, .rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) a { color: #2575AB; } .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal { color: #606060; } .wy-alert.wy-alert-info .wy-alert-title, .rst-content .note .wy-alert-title, .rst-content .wy-alert-info.attention .wy-alert-title, .rst-content .wy-alert-info.caution .wy-alert-title, .rst-content .wy-alert-info.danger .wy-alert-title, .rst-content .wy-alert-info.error .wy-alert-title, .rst-content .wy-alert-info.hint .wy-alert-title, .rst-content .wy-alert-info.important .wy-alert-title, .rst-content .wy-alert-info.tip .wy-alert-title, .rst-content .wy-alert-info.warning .wy-alert-title, .rst-content .seealso .wy-alert-title, .rst-content .wy-alert-info.admonition-todo .wy-alert-title, .wy-alert.wy-alert-info .rst-content .admonition-title, .rst-content .wy-alert.wy-alert-info .admonition-title, .rst-content .note .admonition-title, .rst-content .wy-alert-info.attention .admonition-title, .rst-content .wy-alert-info.caution .admonition-title, .rst-content .wy-alert-info.danger .admonition-title, .rst-content .wy-alert-info.error .admonition-title, .rst-content .wy-alert-info.hint .admonition-title, .rst-content .wy-alert-info.important .admonition-title, .rst-content .wy-alert-info.tip .admonition-title, .rst-content .wy-alert-info.warning .admonition-title, .rst-content .seealso .admonition-title, .rst-content .wy-alert-info.admonition-todo .admonition-title { background: #277BB3; } .wy-alert.wy-alert-warning .wy-alert-title, .rst-content .wy-alert-warning.note .wy-alert-title, .rst-content .attention .wy-alert-title, .rst-content .caution .wy-alert-title, .rst-content .wy-alert-warning.danger .wy-alert-title, 
.rst-content .wy-alert-warning.error .wy-alert-title, .rst-content .wy-alert-warning.hint .wy-alert-title, .rst-content .wy-alert-warning.important .wy-alert-title, .rst-content .wy-alert-warning.tip .wy-alert-title, .rst-content .warning .wy-alert-title, .rst-content .wy-alert-warning.seealso .wy-alert-title, .rst-content .admonition-todo .wy-alert-title, .wy-alert.wy-alert-warning .rst-content .admonition-title, .rst-content .wy-alert.wy-alert-warning .admonition-title, .rst-content .wy-alert-warning.note .admonition-title, .rst-content .attention .admonition-title, .rst-content .caution .admonition-title, .rst-content .wy-alert-warning.danger .admonition-title, .rst-content .wy-alert-warning.error .admonition-title, .rst-content .wy-alert-warning.hint .admonition-title, .rst-content .wy-alert-warning.important .admonition-title, .rst-content .wy-alert-warning.tip .admonition-title, .rst-content .warning .admonition-title, .rst-content .wy-alert-warning.seealso .admonition-title, .rst-content .admonition-todo .admonition-title { background: #B25E14; } div.admonition.note a { color: #2473A7; } /* version picker */ .rst-versions .rst-other-versions { color: #d9d9d9 !important; } .rst-other-versions a { color: #fcfcfc !important; } /* code blocks */ .highlight .c1, .highlight .cm { color: #3F7D8C !important; } .highlight .gp { color: #BD5A08 !important; } .highlight .nc, .highlight .nn { color: #0D7EAF !important; } .highlight .no { color: #2C7DA9 !important; } .highlight .nv { color: #B045CD !important; } .highlight .si { color: #3B7AB8 !important; } /* Side menu customization */ .wy-menu-vertical header, .wy-menu-vertical p.caption { color: #eee; } .wy-side-nav-search { background-color: #2473A7; } .wy-side-nav-search > div.version, .wy-side-nav-search > a.icon-home { color: white !important; } /* Footer */ footer { color: #404040 !important; } footer span.commit code { color: #404040 !important; } 
pg_auto_failover-1.6.3/docs/_static/css/pygments.css000066400000000000000000000100601414244367200226000ustar00rootroot00000000000000.highlight .hll { background-color: #fff } .highlight { background: #fff; } .highlight .c { color: #408090; font-style: italic } /* Comment */ .highlight .err { border: 1px solid #FF0000 } /* Error */ .highlight .k { color: #007020; font-weight: bold } /* Keyword */ .highlight .o { color: #666666 } /* Operator */ .highlight .ch { color: #408090; font-style: italic } /* Comment.Hashbang */ .highlight .cm { color: #408090; font-style: italic } /* Comment.Multiline */ .highlight .cp { color: #007020 } /* Comment.Preproc */ .highlight .cpf { color: #408090; font-style: italic } /* Comment.PreprocFile */ .highlight .c1 { color: #408090; font-style: italic } /* Comment.Single */ .highlight .cs { color: #408090; background-color: #fff0f0 } /* Comment.Special */ .highlight .gd { color: #A00000 } /* Generic.Deleted */ .highlight .ge { font-style: italic } /* Generic.Emph */ .highlight .gr { color: #FF0000 } /* Generic.Error */ .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ .highlight .gi { color: #00A000 } /* Generic.Inserted */ .highlight .go { color: #333333 } /* Generic.Output */ .highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */ .highlight .gs { font-weight: bold } /* Generic.Strong */ .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ .highlight .gt { color: #0044DD } /* Generic.Traceback */ .highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */ .highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */ .highlight .kn { color: #007020; font-weight: bold } /* Keyword.Namespace */ .highlight .kp { color: #007020 } /* Keyword.Pseudo */ .highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */ .highlight .kt { color: #902000 } /* Keyword.Type */ .highlight .m { color: #208050 } /* Literal.Number */ .highlight 
.s { color: #4070a0 } /* Literal.String */ .highlight .na { color: #4070a0 } /* Name.Attribute */ .highlight .nb { color: #007020 } /* Name.Builtin */ .highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */ .highlight .no { color: #60add5 } /* Name.Constant */ .highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */ .highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */ .highlight .ne { color: #007020 } /* Name.Exception */ .highlight .nf { color: #06287e } /* Name.Function */ .highlight .nl { color: #002070; font-weight: bold } /* Name.Label */ .highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */ .highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */ .highlight .nv { color: #bb60d5 } /* Name.Variable */ .highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */ .highlight .w { color: #bbbbbb } /* Text.Whitespace */ .highlight .mb { color: #208050 } /* Literal.Number.Bin */ .highlight .mf { color: #208050 } /* Literal.Number.Float */ .highlight .mh { color: #208050 } /* Literal.Number.Hex */ .highlight .mi { color: #208050 } /* Literal.Number.Integer */ .highlight .mo { color: #208050 } /* Literal.Number.Oct */ .highlight .sb { color: #4070a0 } /* Literal.String.Backtick */ .highlight .sc { color: #4070a0 } /* Literal.String.Char */ .highlight .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */ .highlight .s2 { color: #4070a0 } /* Literal.String.Double */ .highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */ .highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */ .highlight .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */ .highlight .sx { color: #c65d09 } /* Literal.String.Other */ .highlight .sr { color: #235388 } /* Literal.String.Regex */ .highlight .s1 { color: #4070a0 } /* Literal.String.Single */ .highlight .ss { color: #517918 } /* Literal.String.Symbol */ .highlight .bp { color: #007020 } /* 
Name.Builtin.Pseudo */ .highlight .vc { color: #bb60d5 } /* Name.Variable.Class */ .highlight .vg { color: #bb60d5 } /* Name.Variable.Global */ .highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */ .highlight .il { color: #208050 } /* Literal.Number.Integer.Long */ pg_auto_failover-1.6.3/docs/architecture-multi-standby.rst000066400000000000000000000371711414244367200240240ustar00rootroot00000000000000.. _multi_node_architecture: Multi-node Architectures ======================== Pg_auto_failover allows you to have more than one standby node, and offers advanced control over your production architecture characteristics. Architectures with two standby nodes ------------------------------------ When adding your second standby node with default settings, you get the following architecture: .. figure:: ./tikz/arch-multi-standby.svg :alt: pg_auto_failover architecture with two standby nodes pg_auto_failover architecture with two standby nodes In this case, three nodes get set up with the same characteristics, achieving HA for both the Postgres service and the production dataset. An important setting for this architecture is ``number_sync_standbys``. The replication setting ``number_sync_standbys`` sets how many standby nodes the primary should wait for when committing a transaction. In order to have a good availability in your system, pg_auto_failover requires ``number_sync_standbys + 1`` standby nodes participating in the replication quorum: this allows any standby node to fail without impact on the system's ability to respect the replication quorum. When only two nodes are registered in a group on the monitor we have a primary and a single secondary node. Then ``number_sync_standbys`` can only be set to zero. When adding a second standby node to a pg_auto_failover group, then the monitor automatically increments ``number_sync_standbys`` to one, as we see in the diagram above. 
When ``number_sync_standbys`` is set to zero then pg_auto_failover implements the *Business Continuity* setup as seen in :ref:`architecture_basics`: synchronous replication is then used as a way to guarantee that failover can be implemented without data loss. In more details: 1. With ``number_sync_standbys`` set to one, this architecture always maintains two copies of the dataset: one on the current primary node (node A in the previous diagram), and one on the standby that acknowledges the transaction first (either node B or node C in the diagram). When one of the standby nodes is unavailable, the second copy of the dataset can still be maintained thanks to the remaining standby. When both the standby nodes are unavailable, then it's no longer possible to guarantee the replication quorum, and thus writes on the primary are blocked. The Postgres primary node waits until at least one standby node acknowledges the transactions locally committed, thus degrading your Postgres service to read-only. 0. It is possible to manually set ``number_sync_standbys`` to zero when having registered two standby nodes to the monitor, overriding the default behavior. In that case, when the second standby node becomes unhealthy at the same time as the first standby node, the primary node is assigned the state :ref:`wait_primary`. In that state, synchronous replication is disabled on the primary by setting ``synchronous_standby_names`` to an empty string. Writes are allowed on the primary, even though there's no extra copy of the production dataset available at this time. Setting ``number_sync_standbys`` to zero allows data to be written even when both standby nodes are down. In this case, a single copy of the production data set is kept and, if the primary was then to fail, some data will be lost. How much depends on your backup and recovery mechanisms. .. 
_architecture_setup: Replication Settings and Postgres Architectures ----------------------------------------------- The entire flexibility of pg_auto_failover can be leveraged with the following three replication settings: - Number of sync stanbys - Replication quorum - Candidate priority .. _number_sync_standbys: Number Sync Standbys ^^^^^^^^^^^^^^^^^^^^ This parameter is used by Postgres in the `synchronous_standby_names`__ parameter: ``number_sync_standby`` is the number of synchronous standbys for whose replies transactions must wait. __ https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-SYNCHRONOUS-STANDBY-NAMES This parameter can be set at the *formation* level in pg_auto_failover, meaning that it applies to the current primary, and "follows" a failover to apply to any new primary that might replace the current one. To set this parameter to the value ````, use the following command:: pg_autoctl set formation number-sync-standbys The default value in pg_auto_failover is zero. When set to zero, the Postgres parameter ``synchronous_standby_names`` can be set to either ``'*'`` or to ``''``: - ``synchronous_standby_names = '*'`` means that any standby may participate in the replication quorum for transactions with ``synchronous_commit`` set to ``on`` or higher values. pg_autofailover uses ``synchronous_standby_names = '*'`` when there's at least one standby that is known to be healthy. - ``synchronous_standby_names = ''`` (empty string) disables synchrous commit and makes all your commits asynchronous, meaning that transaction commits will not wait for replication. In other words, a single copy of your production data is maintained when ``synchronous_standby_names`` is set that way. pg_autofailover uses ``synchronous_standby_names = ''`` only when number_sync_standbys is set to zero and there's no standby node known healthy by the monitor. 
In order to set ``number_sync_standbys`` to a non-zero value, pg_auto_failover requires that at least ``number_sync_standbys + 1`` standby nodes be registered in the system. When the first standby node is added to the pg_auto_failover monitor, the only acceptable value for ``number_sync_standbys`` is zero. When a second standby is added that participates in the replication quorum, then ``number_sync_standbys`` is automatically set to one. The command ``pg_autoctl set formation number-sync-standbys`` can be used to change the value of this parameter in a formation, even when all the nodes are already running in production. The pg_auto_failover monitor then sets a transition for the primary to update its local value of ``synchronous_standby_names``. Replication Quorum ^^^^^^^^^^^^^^^^^^ The replication quorum setting is a boolean and defaults to ``true``, and can be set per-node. Pg_auto_failover includes a given node in ``synchronous_standby_names`` only when the replication quorum parameter has been set to true. This means that asynchronous replication will be used for nodes where ``replication-quorum`` is set to ``false``. It is possible to force asynchronous replication globally by setting replication quorum to false on all the nodes in a formation. Remember that failovers will happen, and thus to set your replication settings on the current primary node too when needed: it is going to be a standby later. To set this parameter to either true or false, use one of the following commands:: pg_autoctl set node replication-quorum true pg_autoctl set node replication-quorum false .. _candidate_priority: Candidate Priority ^^^^^^^^^^^^^^^^^^ The candidate priority setting is an integer that can be set to any value between 0 (zero) and 100 (one hundred). The default value is 50. When the pg_auto_failover monitor decides to orchestrate a failover, it uses each node's candidate priority to pick the new primary node. 
When setting the candidate priority of a node down to zero, this node will never be selected to be promoted as the new primary when a failover is orchestrated by the monitor. The monitor will instead wait until another node registered is healthy and in a position to be promoted. To set this parameter to the value ````, use the following command:: pg_autoctl set node candidate-priority When nodes have the same candidate priority, the monitor then picks the standby with the most advanced LSN position published to the monitor. When more than one node has published the same LSN position, a random one is chosen. When the candidate for failover has not published the most advanced LSN position in the WAL, pg_auto_failover orchestrates an intermediate step in the failover mechanism. The candidate fetches the missing WAL bytes from one of the standby with the most advanced LSN position prior to being promoted. Postgres allows this operation thanks to cascading replication: any standby can be the upstream node for another standby. It is required at all times that at least two nodes have a non-zero candidate priority in any pg_auto_failover formation. Otherwise no failover is possible. Auditing replication settings ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The command ``pg_autoctl get formation settings`` (also known as ``pg_autoctl show settings``) can be used to obtain a summary of all the replication settings currently in effect in a formation. 
Still using the first diagram on this page, we get the following summary:: $ pg_autoctl get formation settings Context | Name | Setting | Value ----------+---------+---------------------------+------------------------------------------------------------- formation | default | number_sync_standbys | 1 primary | node_A | synchronous_standby_names | 'ANY 1 (pgautofailover_standby_3, pgautofailover_standby_2)' node | node_A | replication quorum | true node | node_B | replication quorum | true node | node_C | replication quorum | true node | node_A | candidate priority | 50 node | node_B | candidate priority | 50 node | node_C | candidate priority | 50 We can see that the ``number_sync_standbys`` has been used to compute the current value of the `synchronous_standby_names`__ setting on the primary. __ https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-SYNCHRONOUS-STANDBY-NAMES Because all the nodes in that example have the same default candidate priority (50), then pg_auto_failover is using the form ``ANY 1`` with the list of standby nodes that are currently participating in the replication quorum. The entries in the `synchronous_standby_names` list are meant to match the `application_name` connection setting used in the `primary_conninfo`, and the format used by pg_auto_failover there is the format string `"pgautofailover_standby_%d"` where `%d` is replaced by the node id. This allows keeping the same connection string to the primary when the node name is changed (using the command ``pg_autoctl set metadata --name``). 
Here we can see the node id of each registered Postgres node with the following command:: $ pg_autoctl show state Name | Node | Host:Port | LSN | Reachable | Current State | Assigned State -------+-------+----------------+-----------+-----------+---------------------+-------------------- node_A | 1 | localhost:5001 | 0/7002310 | yes | primary | primary node_B | 2 | localhost:5002 | 0/7002310 | yes | secondary | secondary node_C | 3 | localhost:5003 | 0/7002310 | yes | secondary | secondary When setting pg_auto_failover with per formation `number_sync_standby` and then per node replication quorum and candidate priority replication settings, those properties are then used to compute the ``synchronous_standby_names`` value on the primary node. This value is automatically maintained on the primary by pg_auto_failover, and is updated either when replication settings are changed or when a failover happens. The other situation when the pg_auto_failover replication settings are used is a candidate election when a failover happens and there is more than two nodes registered in a group. Then the node with the highest candidate priority is selected, as detailed above in the :ref:`candidate_priority` section. Sample architectures with three standby nodes --------------------------------------------- When setting the three parameters above, it's possible to design very different Postgres architectures for your production needs. .. figure:: ./tikz/arch-three-standby.svg :alt: pg_auto_failover architecture with three standby nodes pg_auto_failover architecture with three standby nodes In this case, the system is set up with three standby nodes all set the same way, with default parameters. The default parameters support setting ``number_sync_standbys = 2``. This means that Postgres will maintain three copies of the production data set at all times. 
On the other hand, if two standby nodes were to fail at the same time, despite the fact that two copies of the data are still maintained, the Postgres service would be degraded to read-only. With this architecture diagram, here's the summary that we obtain:: $ pg_autoctl show settings Context | Name | Setting | Value ----------+---------+---------------------------+--------------------------------------------------------------------------------------- formation | default | number_sync_standbys | 2 primary | node_A | synchronous_standby_names | 'ANY 2 (pgautofailover_standby_2, pgautofailover_standby_4, pgautofailover_standby_3)' node | node_A | replication quorum | true node | node_B | replication quorum | true node | node_C | replication quorum | true node | node_D | replication quorum | true node | node_A | candidate priority | 50 node | node_B | candidate priority | 50 node | node_C | candidate priority | 50 node | node_D | candidate priority | 50 Sample architecture with three standby nodes, one async ------------------------------------------------------- .. figure:: ./tikz/arch-three-standby-one-async.svg :alt: pg_auto_failover architecture with three standby nodes, one async pg_auto_failover architecture with three standby nodes, one async In this case, the system is set up with two standby nodes participating in the replication quorum, allowing for ``number_sync_standbys = 1``. The system always maintains at least two copies of the data set, one on the primary, another on either node B or node D. Whenever we lose one of those nodes, we can hold to the guarantee of having two copies of the data set. Additionally, we have the standby server C which has been set up to not participate in the replication quorum. Node C will not be found in the ``synchronous_standby_names`` list of nodes. Also, node C is set up to never be a candidate for failover, with ``candidate-priority = 0``. 
This architecture would fit a situation with nodes A, B, and D are deployed in the same data center or availability zone and node C in another one. Those three nodes are set up to support the main production traffic and implement high availability of both the Postgres service and the data set. Node C might be set up for Business Continuity in case the first data center is lost, or maybe for reporting needs on another application domain. With this architecture diagram, here's the summary that we obtain:: pg_autoctl show settings Context | Name | Setting | Value ----------+---------+---------------------------+------------------------------------------------------------- formation | default | number_sync_standbys | 1 primary | node_A | synchronous_standby_names | 'ANY 1 (pgautofailover_standby_4, pgautofailover_standby_2)' node | node_A | replication quorum | true node | node_B | replication quorum | true node | node_C | replication quorum | false node | node_D | replication quorum | true node | node_A | candidate priority | 50 node | node_B | candidate priority | 50 node | node_C | candidate priority | 0 node | node_D | candidate priority | 50 pg_auto_failover-1.6.3/docs/architecture.rst000066400000000000000000000320451414244367200212250ustar00rootroot00000000000000.. _architecture_basics: Architecture Basics =================== pg_auto_failover is designed as a simple and robust way to manage automated Postgres failover in production. On-top of robust operations, pg_auto_failover setup is flexible and allows either *Business Continuity* or *High Availability* configurations. pg_auto_failover design includes configuration changes in a live system without downtime. pg_auto_failover is designed to be able to handle a single PostgreSQL service using three nodes. In this setting, the system is resilient to losing any **one** of **three** nodes. .. 
figure:: ./tikz/arch-single-standby.svg :alt: pg_auto_failover Architecture for a standalone PostgreSQL service pg_auto_failover Architecture for a standalone PostgreSQL service It is important to understand that when using only two Postgres nodes then pg_auto_failover is optimized for *Business Continuity*. In the event of losing a single node, pg_auto_failover is capable of continuing the PostgreSQL service, and prevents any data loss when doing so, thanks to PostgreSQL *Synchronous Replication*. That said, there is a trade-off involved in this architecture. The business continuity bias relaxes replication guarantees for *asynchronous replication* in the event of a standby node failure. This allows the PostgreSQL service to accept writes when there's a single server available, and opens the service for potential data loss if the primary server were also to fail. The pg_auto_failover Monitor ---------------------------- Each PostgreSQL node in pg_auto_failover runs a Keeper process which informs a central Monitor node about notable local changes. Some changes require the Monitor to orchestrate a correction across the cluster: - New nodes At initialization time, it's necessary to prepare the configuration of each node for PostgreSQL streaming replication, and get the cluster to converge to the nominal state with both a primary and a secondary node in each group. The monitor determines each new node's role - Node failure The monitor orchestrates a failover when it detects an unhealthy node. The design of pg_auto_failover allows the monitor to shut down service to a previously designated primary node without causing a "split-brain" situation. The monitor is the authoritative node that manages global state and makes changes in the cluster by issuing commands to the nodes' keeper processes. A pg_auto_failover monitor node failure has limited impact on the system. While it prevents reacting to other nodes' failures, it does not affect replication. 
The PostgreSQL streaming replication setup installed by pg_auto_failover does not depend on having the monitor up and running. pg_auto_failover Glossary ------------------------- pg_auto_failover handles a single PostgreSQL service with the following concepts: Monitor ^^^^^^^ The pg_auto_failover monitor is a service that keeps track of one or several *formations* containing *groups* of *nodes*. The monitor is implemented as a PostgreSQL extension, so when you run the command ``pg_autoctl create monitor`` a PostgreSQL instance is initialized, configured with the extension, and started. The monitor service embeds a PostgreSQL instance. Formation ^^^^^^^^^ A formation is a logical set of PostgreSQL services that are managed together. It is possible to operate many formations with a single monitor instance. Each formation has a group of Postgres nodes and the FSM orchestration implemented by the monitor applies separately to each group. Group ^^^^^ A group of two PostgreSQL nodes work together to provide a single PostgreSQL service in a Highly Available fashion. A group consists of a PostgreSQL primary server and a secondary server setup with Hot Standby synchronous replication. Note that pg_auto_failover can orchestrate the whole setting-up of the replication for you. In pg_auto_failover versions up to 1.3, a single Postgres group can contain only two Postgres nodes. Starting with pg_auto_failover 1.4, there's no limit to the number of Postgres nodes in a single group. Note that each Postgres instance that belongs to the same group serves the same dataset in its data directory (PGDATA). .. note:: The notion of a formation that contains multiple groups in pg_auto_failover is useful when setting up and managing a whole Citus formation, where the coordinator nodes belong to group zero of the formation, and each Citus worker node becomes its own group and may have Postgres standby nodes. 
Keeper ^^^^^^ The pg_auto_failover *keeper* is an agent that must be running on the same server where your PostgreSQL nodes are running. The keeper controls the local PostgreSQL instance (using both the ``pg_ctl`` command-line tool and SQL queries), and communicates with the monitor: - it sends updated data about the local node, such as the WAL delta in between servers, measured via PostgreSQL statistics views. - it receives state assignments from the monitor. Also the keeper maintains local state that includes the most recent communication established with the monitor and the other PostgreSQL node of its group, enabling it to detect :ref:`network_partitions`. .. note:: In pg_auto_failover versions up to and including 1.3, the *keeper* process started with ``pg_autoctl run`` manages a separate Postgres instance, running as its own process tree. Starting in pg_auto_failover version 1.4, the *keeper* process (started with ``pg_autoctl run``) runs the Postgres instance as a sub-process of the main ``pg_autoctl`` process, allowing tighter control over the Postgres execution. Running the sub-process also makes the solution work better both in container environments (because it's now a single process tree) and with systemd, because it uses a specific cgroup per service unit. Node ^^^^ A node is a server (virtual or physical) that runs PostgreSQL instances and a keeper service. At any given time, any node might be a primary or a secondary Postgres instance. The whole point of pg_auto_failover is to decide this state. As a result, refrain from naming your nodes with the role you intend for them. Their roles can change. If they didn't, your system wouldn't need pg_auto_failover! State ^^^^^ A state is the representation of the per-instance and per-group situation. The monitor and the keeper implement a Finite State Machine to drive operations in the PostgreSQL groups; allowing pg_auto_failover to implement High Availability with the goal of zero data loss. 
The keeper main loop enforces the current expected state of the local PostgreSQL instance, and reports the current state and some more information to the monitor. The monitor uses this set of information and its own health-check information to drive the State Machine and assign a goal state to the keeper. The keeper implements the transitions between a current state and a monitor-assigned goal state. Client-side HA -------------- Implementing client-side High Availability is included in PostgreSQL's driver `libpq` from version 10 onward. Using this driver, it is possible to specify multiple host names or IP addresses in the same connection string:: $ psql -d "postgresql://host1,host2/dbname?target_session_attrs=read-write" $ psql -d "postgresql://host1:port2,host2:port2/dbname?target_session_attrs=read-write" $ psql -d "host=host1,host2 port=port1,port2 target_session_attrs=read-write" When using either of the syntax above, the `psql` application attempts to connect to `host1`, and when successfully connected, checks the *target_session_attrs* as per the PostgreSQL documentation of it: If this parameter is set to read-write, only a connection in which read-write transactions are accepted by default is considered acceptable. The query SHOW transaction_read_only will be sent upon any successful connection; if it returns on, the connection will be closed. If multiple hosts were specified in the connection string, any remaining servers will be tried just as if the connection attempt had failed. The default value of this parameter, any, regards all connections as acceptable. When the connection attempt to `host1` fails, or when the *target_session_attrs* can not be verified, then the ``psql`` application attempts to connect to `host2`. The behavior is implemented in the connection library `libpq`, so any application using it can benefit from this implementation, not just ``psql``. 
When using pg_auto_failover, configure your application connection string to use the primary and the secondary server host names, and set ``target_session_attrs=read-write`` too, so that your application automatically connects to the current primary, even after a failover occurred. Monitoring protocol ------------------- The monitor interacts with the data nodes in 2 ways: - Data nodes periodically connect and run `SELECT pgautofailover.node_active(...)` to communicate their current state and obtain their goal state. - The monitor periodically connects to all the data nodes to see if they are healthy, doing the equivalent of ``pg_isready``. When a data node calls `node_active`, the state of the node is stored in the `pgautofailover.node` table and the state machines of both nodes are progressed. The state machines are described later in this readme. The monitor typically only moves one state forward and waits for the node(s) to converge except in failure states. If a node is not communicating to the monitor, it will either cause a failover (if node is a primary), disabling synchronous replication (if node is a secondary), or cause the state machine to pause until the node comes back (other cases). In most cases, the latter is harmless, though in some cases it may cause downtime to last longer, e.g. if a standby goes down during a failover. To simplify operations, a node is only considered unhealthy if the monitor cannot connect *and* it hasn't reported its state through `node_active` for a while. This allows, for example, PostgreSQL to be restarted without causing a health check failure. Synchronous vs. asynchronous replication ---------------------------------------- By default, pg_auto_failover uses synchronous replication, which means all writes block until at least one standby node has reported receiving them. 
To handle cases in which the standby fails, the primary switches between two states called `wait_primary` and `primary` based on the health of standby nodes, and based on the replication setting ``number_sync_standby``. When in the `wait_primary` state, synchronous replication is disabled by automatically setting ``synchronous_standby_names = ''`` to allow writes to proceed. However doing so also disables failover, since the standby might get arbitrarily far behind. If the standby is responding to health checks and within 1 WAL segment of the primary (by default), synchronous replication is enabled again on the primary by setting ``synchronous_standby_names = '*'`` which may cause a short latency spike since writes will then block until the standby has caught up. When using several standby nodes with replication quorum enabled, the actual setting for ``synchronous_standby_names`` is set to a list of those standby nodes that are set to participate to the replication quorum. If you wish to disable synchronous replication, you need to add the following to ``postgresql.conf``:: synchronous_commit = 'local' This ensures that writes return as soon as they are committed on the primary -- under all circumstances. In that case, failover might lead to some data loss, but failover is not initiated if the secondary is more than 10 WAL segments (by default) behind on the primary. During a manual failover, the standby will continue accepting writes from the old primary. The standby will stop accepting writes only if it's fully caught up (most common), the primary fails, or it does not receive writes for 2 minutes. .. topic:: A note about performance In some cases the performance impact on write latency when setting synchronous replication makes the application fail to deliver expected performance. If testing or production feedback shows this to be the case, it is beneficial to switch to using asynchronous replication. 
The way to use asynchronous replication in pg_auto_failover is to change the ``synchronous_commit`` setting. This setting can be set per transaction, per session, or per user. It does not have to be set globally on your Postgres instance. One way to benefit from that would be:: alter role fast_and_loose set synchronous_commit to local; That way performance-critical parts of the application don't have to wait for the standby nodes. Only use this when you can also lower your data durability guarantees. Node recovery ------------- When bringing a node back after a failover, the keeper (``pg_autoctl run``) can simply be restarted. It will also restart postgres if needed and obtain its goal state from the monitor. If the failed node was a primary and was demoted, it will learn this from the monitor. Once the node reports, it is allowed to come back as a standby by running ``pg_rewind``. If it is too far behind, the node performs a new ``pg_basebackup``. pg_auto_failover-1.6.3/docs/conf.py000066400000000000000000000303051414244367200173050ustar00rootroot00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # pg_auto_failover documentation build configuration file, created by # sphinx-quickstart on Sat May 5 14:33:23 2018. # # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))

#
# https://stackoverflow.com/questions/9899283/how-do-you-change-the-code-example-font-size-in-latex-pdf-output-with-sphinx
#
from sphinx.highlighting import PygmentsBridge
from pygments.formatters.latex import LatexFormatter


class CustomLatexFormatter(LatexFormatter):
    """LaTeX code-block formatter that shrinks verbatim text to scriptsize.

    Wide command-line examples otherwise overflow the PDF page width.
    """

    def __init__(self, **options):
        # Python 3 zero-argument super() replaces the redundant
        # super(CustomLatexFormatter, self) spelling (file targets python3).
        super().__init__(**options)
        # Passed through to LaTeX fancyvrb's Verbatim environment: render
        # every code block in a smaller font.
        self.verboptions = r"formatcom=\scriptsize"


# Install our formatter as the one Sphinx uses for LaTeX output.
PygmentsBridge.latex_formatter = CustomLatexFormatter

# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ["sphinx.ext.githubpages"]

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = ".rst"

# The master toctree document.
master_doc = "index"

# General information about the project.
project = "pg_auto_failover"
copyright = "Copyright (c) Microsoft Corporation. All rights reserved."
author = "Microsoft"

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = "1.6"
# The full version, including alpha/beta/rc tags.
release = "1.6.3"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", ".venv"] # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = "sphinx_rtd_theme_citus" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # # html_theme_options = {} # Add our custom CSS def setup(app): if hasattr(app, "add_css_file"): app.add_css_file("css/citus.css") app.add_css_file("css/pygments.css") # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] # Custom sidebar templates, must be a dictionary that maps document names # to template names. # # This is required for the alabaster theme # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars html_sidebars = { "**": [ "relations.html", # needs 'show_related': True theme option to display "searchbox.html", ] } # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. htmlhelp_basename = "pg_auto_failoverdoc" # -- Options for LaTeX output --------------------------------------------- # Grouping the document tree into LaTeX files. 
# List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (
        master_doc,
        "pg_auto_failover.tex",
        # Raw string: "\_" is not a valid Python escape sequence, but LaTeX
        # needs the literal backslash-underscore in the title.
        r"pg\_auto\_failover Documentation",
        "Microsoft",
        "manual",
    ),
]

# for some reasons, doesn't work with the current setup.
# latex_engine = 'xelatex'

latex_elements = {
    "geometry": r"\usepackage[paperwidth=441pt,paperheight=666pt]{geometry}",
    "pointsize": "12pt",
    "fncychap": r"\usepackage[Sonny]{fncychap}",
    "extraclassoptions": "oneside",
    "sphinxsetup": "hmargin=1cm,vmargin=2cm,verbatimwithframe=false,VerbatimColor={rgb}{1.0, 0.97, 0.97}",
}

latex_show_urls = "footnote"

# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).

# Every pg_autoctl (sub-)command gets its own man page in section 1, with
# the command string serving as both name and description. The source path
# is derived from the command: spaces and dashes both map to underscores,
# e.g. "pg_autoctl show standby-names" is documented in
# "ref/pg_autoctl_show_standby_names".
_pg_autoctl_commands = [
    "pg_autoctl",
    "pg_autoctl create monitor",
    "pg_autoctl create postgres",
    "pg_autoctl create formation",
    "pg_autoctl drop monitor",
    "pg_autoctl drop node",
    "pg_autoctl drop formation",
    "pg_autoctl config get",
    "pg_autoctl config set",
    "pg_autoctl config check",
    "pg_autoctl show uri",
    "pg_autoctl show events",
    "pg_autoctl show state",
    "pg_autoctl show file",
    "pg_autoctl show settings",
    "pg_autoctl show standby-names",
    "pg_autoctl enable maintenance",
    "pg_autoctl enable secondary",
    "pg_autoctl enable ssl",
    "pg_autoctl enable monitor",
    "pg_autoctl disable maintenance",
    "pg_autoctl disable secondary",
    "pg_autoctl disable ssl",
    "pg_autoctl disable monitor",
    "pg_autoctl get formation settings",
    "pg_autoctl get formation number-sync-standbys",
    "pg_autoctl get node replication-quorum",
    "pg_autoctl get node candidate-priority",
    "pg_autoctl set formation number-sync-standbys",
    "pg_autoctl set node replication-quorum",
    "pg_autoctl set node candidate-priority",
    "pg_autoctl perform failover",
    "pg_autoctl perform switchover",
    "pg_autoctl perform promotion",
    "pg_autoctl run",
    "pg_autoctl watch",
    "pg_autoctl stop",
    "pg_autoctl status",
    "pg_autoctl reload",
]

man_pages = (
    # The main documentation entry point.
    [
        (
            master_doc,
            "pg_auto_failover",
            "pg_auto_failover Documentation",
            [author],
            1,
        )
    ]
    # One section-1 page per command, generated from the list above.
    + [
        (
            "ref/" + cmd.replace(" ", "_").replace("-", "_"),
            cmd,
            cmd,
            [author],
            1,
        )
        for cmd in _pg_autoctl_commands
    ]
    + [
        # ("ref/reference", "pg_autoctl", "pg_auto_failover agent", [author], 1),
        # Configuration files are documented in man section 5.
        (
            "ref/configuration",
            "pg_autoctl",
            "pg_auto_failover Configuration",
            [author],
            5,
        ),
    ]
)

# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files.
List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ ( master_doc, "pg_auto_failover", "pg_auto_failover Documentation", author, "pg_auto_failover", "One line description of project.", "Miscellaneous", ), ] pg_auto_failover-1.6.3/docs/failover-state-machine.rst000066400000000000000000000272371414244367200231010ustar00rootroot00000000000000Failover State Machine ====================== Introduction ------------ pg_auto_failover uses a state machine for highly controlled execution. As keepers inform the monitor about new events (or fail to contact it at all), the monitor assigns each node both a current state and a goal state. A node's current state is a strong guarantee of its capabilities. States themselves do not cause any actions; actions happen during state transitions. The assigned goal states inform keepers of what transitions to attempt. Example of state transitions in a new cluster --------------------------------------------- A good way to get acquainted with the states is by examining the transitions of a cluster from birth to high availability. After starting a monitor and running keeper init for the first data node ("node A"), the monitor registers the state of that node as "init" with a goal state of "single." The init state means the monitor knows nothing about the node other than its existence because the keeper is not yet continuously running there to report node health. Once the keeper runs and reports its health to the monitor, the monitor assigns it the state "single," meaning it is just an ordinary Postgres server with no failover. Because there are not yet other nodes in the cluster, the monitor also assigns node A the goal state of single -- there's nothing that node A's keeper needs to change. As soon as a new node ("node B") is initialized, the monitor assigns node A the goal state of "wait_primary." 
This means the node still has no failover, but there's hope for a secondary
to synchronize with it soon. To accomplish the transition from single to
wait_primary, node A's keeper adds node B's hostname to pg_hba.conf to allow
a hot standby replication connection.

At the same time, node B transitions into wait_standby with the goal
initially of staying in wait_standby. It can do nothing but wait until node
A gives it access to connect. Once node A has transitioned to wait_primary,
the monitor assigns B the goal of "catchingup," which gives B's keeper the
green light to make the transition from wait_standby to catchingup. This
transition involves running pg_basebackup, editing recovery.conf and
restarting PostgreSQL in Hot Standby mode.

Node B reports to the monitor when it's in hot standby mode and able to
connect to node A. The monitor then assigns node B the goal state of
"secondary" and A the goal of "primary." Postgres ships WAL logs from node A
and replays them on B. Finally B is caught up and tells the monitor
(specifically B reports its pg_stat_replication.sync_state and WAL replay
lag). At this glorious moment the monitor assigns A the state primary (goal:
primary) and B secondary (goal: secondary).

State reference
---------------

The following diagram shows the pg_auto_failover State Machine. It's missing
links to the ``single`` state, which can always be reached when removing all
the other nodes.

.. figure:: ./tikz/fsm.svg
   :alt: pg_auto_failover Finite State Machine diagram

   pg_auto_failover Finite State Machine diagram

In the previous diagram we can see that we have a list of six states where
the application can connect to a read-write Postgres service: ``single``,
``wait_primary``, ``join_primary``, ``primary``, ``prepare_maintenance``,
and ``apply_settings``.

Init
^^^^

A node is assigned the "init" state when it is first registered with the
monitor. Nothing is known about the node at this point beyond its existence.
If no other node has been registered with the monitor for the same formation and group ID then this node is assigned a goal state of "single." Otherwise the node has the goal state of "wait_standby." Single ^^^^^^ There is only one node in the group. It behaves as a regular PostgreSQL instance, with no high availability and no failover. If the administrator removes a node the other node will revert to the single state. .. _wait_primary: Wait_primary ^^^^^^^^^^^^ Applied to a node intended to be the primary but not yet in that position. The primary-to-be at this point knows the secondary's node name or IP address, and has granted the node hot standby access in the pg_hba.conf file. The wait_primary state may be caused either by a new potential secondary being registered with the monitor (good), or an existing secondary becoming unhealthy (bad). In the latter case, during the transition from primary to wait_primary, the primary node's keeper disables synchronous replication on the node. It also cancels currently blocked queries. Join_primary ^^^^^^^^^^^^ Applied to a primary node when another standby is joining the group. This allows the primary node to apply necessary changes to its HBA setup before allowing the new node joining the system to run the ``pg_basebackup`` command. .. important:: This state has been deprecated, and is no longer assigned to nodes. Any time we would have used ``join_primary`` before, we now use ``primary`` instead. Primary ^^^^^^^ A healthy secondary node exists and has caught up with WAL replication. Specifically, the keeper reports the primary state only when it has verified that the secondary is reported "sync" in pg_stat_replication.sync_state, and with a WAL lag of 0. The primary state is a strong assurance. It's the only state where we know we can fail over when required. During the transition from wait_primary to primary, the keeper also enables synchronous replication. 
This means that after a failover the secondary will be fully up to date.

Wait_standby
^^^^^^^^^^^^

Monitor decides this node is a standby. Node must wait until the primary has
authorized it to connect and setup hot standby replication.

Catchingup
^^^^^^^^^^

The monitor assigns catchingup to the standby node when the primary is ready
for a replication connection (pg_hba.conf has been properly edited,
connection role added, etc).

The standby node keeper runs pg_basebackup, connecting to the primary's
hostname and port. The keeper then edits recovery.conf and starts PostgreSQL
in hot standby mode.

Secondary
^^^^^^^^^

A node with this state is acting as a hot standby for the primary, and is up
to date with the WAL log there. In particular, it is within 16MB or 1 WAL
segment of the primary.

Maintenance
^^^^^^^^^^^

The cluster administrator can manually move a secondary into the maintenance
state to gracefully take it offline. The primary will then transition from
state primary to wait_primary, during which time the secondary will be
online to accept writes. When the old primary reaches the wait_primary state
then the secondary is safe to take offline with minimal consequences.

Prepare_maintenance
^^^^^^^^^^^^^^^^^^^

The cluster administrator can manually move a primary node into the
maintenance state to gracefully take it offline. The primary then
transitions to the prepare_maintenance state to make sure the secondary is
not missing any writes. In the prepare_maintenance state, the primary shuts
down.

Wait_maintenance
^^^^^^^^^^^^^^^^

The cluster administrator can manually move a secondary into the maintenance
state to gracefully take it offline. Before reaching the maintenance state
though, we want to switch the primary node to asynchronous replication, in
order to avoid writes being blocked. In the state wait_maintenance the
standby waits until the primary has reached wait_primary.
Draining ^^^^^^^^ A state between primary and demoted where replication buffers finish flushing. A draining node will not accept new client writes, but will continue to send existing data to the secondary. To implement that with Postgres we actually stop the service. When stopping, Postgres ensures that the current replication buffers are flushed correctly to synchronous standbys. Demoted ^^^^^^^ The primary keeper or its database were unresponsive past a certain threshold. The monitor assigns demoted state to the primary to avoid a split-brain scenario where there might be two nodes that don't communicate with each other and both accept client writes. In that state the keeper stops PostgreSQL and prevents it from running. Demote_timeout ^^^^^^^^^^^^^^ If the monitor assigns the primary a demoted goal state but the primary keeper doesn't acknowledge transitioning to that state within a timeout window, then the monitor assigns demote_timeout to the primary. Most commonly may happen when the primary machine goes silent. The keeper is not reporting to the monitor. Stop_replication ^^^^^^^^^^^^^^^^ The stop_replication state is meant to ensure that the primary goes to the demoted state before the standby goes to single and accepts writes (in case the primary can’t contact the monitor anymore). Before promoting the secondary node, the keeper stops PostgreSQL on the primary to avoid split-brain situations. For safety, when the primary fails to contact the monitor and fails to see the pg_auto_failover connection in pg_stat_replication, then it goes to the demoted state of its own accord. Prepare_promotion ^^^^^^^^^^^^^^^^^ The prepare_promotion state is meant to prepare the standby server to being promoted. This state allows synchronisation on the monitor, making sure that the primary has stopped Postgres before promoting the secondary, hence preventing split brain situations. 
Report_LSN ^^^^^^^^^^ The report_lsn state is assigned to standby nodes when a failover is orchestrated and there are several standby nodes. In order to pick the furthest standby in the replication, pg_auto_failover first needs a fresh report of the current LSN position reached on each standby node. When a node reaches the report_lsn state, the replication stream is stopped, by restarting Postgres without a ``primary_conninfo``. This allows the primary node to detect :ref:`network_partitions`, i.e. when the primary can't connect to the monitor and there's no standby listed in ``pg_stat_replication``. Fast_forward ^^^^^^^^^^^^ The fast_forward state is assigned to the selected promotion candidate during a failover when it won the election thanks to the candidate priority settings, but the selected node is not the most advanced standby node as reported in the report_lsn state. Missing WAL bytes are fetched from one of the most advanced standby nodes by using Postgres cascading replication features: it is possible to use any standby node in the ``primary_conninfo``. Dropped ^^^^^^^ The dropped state is assigned to a node when the ``pg_autoctl drop node`` command is used. This allows the node to implement specific local actions before being entirely removed from the monitor database. When a node reports reaching the dropped state, the monitor removes its entry. If a node is not reporting anymore, maybe because it's completely unavailable, then it's possible to run the ``pg_autoctl drop node --force`` command, and then the node entry is removed from the monitor. Failover logic -------------- This section needs to be expanded further, but below is the failover state machine for each node that is implemented by the monitor: .. figure:: ./fsm/node-state-machine.png :scale: 30% :alt: Node state machine Node state machine Since the state machines of the data nodes always move in tandem, a pair (group) of data nodes also implicitly has the following state machine: .. 
figure:: ./fsm/group-state-machine.png :scale: 40% :alt: Group state machine Group state machine .. raw:: latex \newpage .. _state_machine_diagram: pg_auto_failover keeper's State Machine --------------------------------------- When built in TEST mode, it is then possible to use the following command to get a visual representation of the Keeper's Finite State Machine:: $ PG_AUTOCTL_DEBUG=1 pg_autoctl do fsm gv | dot -Tsvg > fsm.svg The `dot` program is part of the Graphviz suite and produces the following output: .. figure:: ./fsm.png :scale: 35% :alt: Keeper state machine Keeper State Machine pg_auto_failover-1.6.3/docs/faq.rst000066400000000000000000000203771414244367200173170ustar00rootroot00000000000000Frequently Asked Questions ========================== Those questions have been asked in `GitHub issues`__ for the project by several people. If you have more questions, feel free to open a new issue, and your question and its answer might make it to this FAQ. __ https://github.com/citusdata/pg_auto_failover/issues_ I stopped the primary and no failover is happening for 20s to 30s, why? ----------------------------------------------------------------------- In order to avoid spurious failovers when the network connectivity is not stable, pg_auto_failover implements a timeout of 20s before acting on a node that is known unavailable. This needs to be added to the delay between health checks and the retry policy. See the :ref:`configuration` part for more information about how to setup the different delays and timeouts that are involved in the decision making. See also :ref:`pg_autoctl_watch` to have a dashboard that helps understanding the system and what's going on in the moment. The secondary is blocked in the CATCHING_UP state, what should I do? -------------------------------------------------------------------- In the pg_auto_failover design, the following two things are needed for the monitor to be able to orchestrate nodes integration completely: 1. 
Health Checks must be successful The monitor runs periodic health checks with all the nodes registered in the system. Those *health checks* are Postgres connections from the monitor to the registered Postgres nodes, and use the ``hostname`` and ``port`` as registered. The ``pg_autoctl show state`` commands column *Reachable* contains "yes" when the monitor could connect to a specific node, "no" when this connection failed, and "unknown" when no connection has been attempted yet, since the last startup time of the monitor. The *Reachable* column from ``pg_autoctl show state`` command output must show a "yes" entry before a new standby node can be orchestrated up to the "secondary" goal state. 2. pg_autoctl service must be running The pg_auto_failover monitor works by assigning goal states to individual Postgres nodes. The monitor will not assign a new goal state until the current one has been reached. To implement a transition from the current state to the goal state assigned by the monitor, the pg_autoctl service must be running on every node. When your new standby node stays in the "catchingup" state for a long time, please check that the node is reachable from the monitor given its ``hostname`` and ``port`` known on the monitor, and check that the ``pg_autoctl run`` command is running for this node. When things are not obvious, the next step is to go read the logs. Both the output of the ``pg_autoctl`` command and the Postgres logs are relevant. See the :ref:`logs` question for details. .. _logs: Should I read the logs? Where are the logs? ------------------------------------------- Yes. If anything seems strange to you, please do read the logs. As maintainers of the ``pg_autoctl`` tool, we can't foresee everything that may happen to your production environment. Still, a lot of efforts is spent on having a meaningful output. So when you're in a situation that's hard to understand, please make sure to read the ``pg_autoctl`` logs and the Postgres logs. 
When using systemd integration, the ``pg_autoctl`` logs are then handled entirely by the journal facility of systemd. Please then refer to ``journalctl`` for viewing the logs. The Postgres logs are to be found in the ``$PGDATA/log`` directory with the default configuration deployed by ``pg_autoctl create ...``. When a custom Postgres setup is used, please refer to your actual setup to find Postgres logs. The state of the system is blocked, what should I do? ----------------------------------------------------- This question is a general case situation that is similar in nature to the previous situation, reached when adding a new standby to a group of Postgres nodes. Please check the same two elements: the monitor health checks are successful, and the ``pg_autoctl run`` command is running. When things are not obvious, the next step is to go read the logs. Both the output of the ``pg_autoctl`` command and the Postgres logs are relevant. See the :ref:`logs` question for details. The monitor is a SPOF in pg_auto_failover design, how should we handle that? ---------------------------------------------------------------------------- When using pg_auto_failover, the monitor is needed to make decisions and orchestrate changes in all the registered Postgres groups. Decisions are transmitted to the Postgres nodes by the monitor assigning nodes a goal state which is different from their current state. Consequences of the monitor being unavailable ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Nodes contact the monitor each second and call the ``node_active`` stored procedure, which returns a goal state that is possibly different from the current state. The monitor only assigns Postgres nodes with a new goal state when a cluster wide operation is needed. 
In practice, only the following operations require the monitor to assign a new goal state to a Postgres node: - a new node is registered - a failover needs to happen, either triggered automatically or manually - a node is being put to maintenance - a node replication setting is being changed. When the monitor node is not available, the ``pg_autoctl`` processes on the Postgres nodes will fail to contact the monitor every second, and log about this failure. Adding to that, no orchestration is possible. The Postgres streaming replication does not need the monitor to be available in order to deliver its service guarantees to your application, so your Postgres service is still available when the monitor is not available. To repair your installation after having lost a monitor, the following scenarios are to be considered. The monitor node can be brought up again without data having been lost ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This is typically the case in Cloud Native environments such as Kubernetes, where you could have a service migrated to another pod and re-attached to its disk volume. This scenario is well supported by pg_auto_failover, and no intervention is needed. It is also possible to use synchronous archiving with the monitor so that it's possible to recover from the current archives and continue operating without intervention on the Postgres nodes, except for updating their monitor URI. This requires an archiving setup that uses synchronous replication so that any transaction committed on the monitor is known to have been replicated in your WAL archive. At the moment, you have to take care of that setup yourself. Here's a quick summary of what needs to be done: 1. Schedule base backups Use ``pg_basebackup`` every once in a while to have a full copy of the monitor Postgres database available. 2. 
Archive WAL files in a synchronous fashion Use ``pg_receivewal --sync ...`` as a service to keep a WAL archive in sync with the monitor Postgres instance at all time. 3. Prepare a recovery tool on top of your archiving strategy Write a utility that knows how to create a new monitor node from your most recent pg_basebackup copy and the WAL files copy. Bonus points if that tool/script is tested at least once a day, so that you avoid surprises on the unfortunate day that you actually need to use it in production. A future version of pg_auto_failover will include this facility, but the current versions don't. The monitor node can only be built from scratch again ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If you don't have synchronous archiving for the monitor set-up, then you might not be able to restore a monitor database with the expected up-to-date node metadata. Specifically we need the nodes state to be in sync with what each ``pg_autoctl`` process has received the last time they could contact the monitor, before it has been unavailable. It is possible to register nodes that are currently running to a new monitor without restarting Postgres on the primary. For that, the procedure mentionned in :ref:`replacing_monitor_online` must be followed, using the following commands:: $ pg_autoctl disable monitor $ pg_autoctl enable monitor pg_auto_failover-1.6.3/docs/fault-tolerance.rst000066400000000000000000000201251414244367200216240ustar00rootroot00000000000000Failover and Fault Tolerance ============================ At the heart of the pg_auto_failover implementation is a State Machine. The state machine is driven by the monitor, and its transitions are implemented in the keeper service, which then reports success to the monitor. The keeper is allowed to retry transitions as many times as needed until they succeed, and reports also failures to reach the assigned state to the monitor node. 
The monitor also implements frequent health-checks targeting the registered PostgreSQL nodes. When the monitor detects something is not as expected, it takes action by assigning a new goal state to the keeper, that is responsible for implementing the transition to this new state, and then reporting. Unhealthy Nodes --------------- The pg_auto_failover monitor is responsible for running regular health-checks with every PostgreSQL node it manages. A health-check is successful when it is able to connect to the PostgreSQL node using the PostgreSQL protocol (libpq), imitating the ``pg_isready`` command. How frequent those health checks are (20s by default), the PostgreSQL connection timeout in use (5s by default), and how many times to retry in case of a failure before marking the node unhealthy (2 by default) are GUC variables that you can set on the Monitor node itself. Remember, the monitor is implemented as a PostgreSQL extension, so the setup is a set of PostgreSQL configuration settings:: SELECT name, setting FROM pg_settings WHERE name ~ 'pgautofailover\.health'; name | setting -----------------------------------------+--------- pgautofailover.health_check_max_retries | 2 pgautofailover.health_check_period | 20000 pgautofailover.health_check_retry_delay | 2000 pgautofailover.health_check_timeout | 5000 (4 rows) The pg_auto_failover keeper also reports if PostgreSQL is running as expected. This is useful for situations where the PostgreSQL server / OS is running fine and the keeper (``pg_autoctl run``) is still active, but PostgreSQL has failed. Situations might include *File System is Full* on the WAL disk, some file system level corruption, missing files, etc. 
Here's what happens to your PostgreSQL service in case of any single-node failure is observed: - Primary node is monitored unhealthy When the primary node is unhealthy, and only when the secondary node is itself in good health, then the primary node is asked to transition to the DRAINING state, and the attached secondary is asked to transition to the state PREPARE_PROMOTION. In this state, the secondary is asked to catch-up with the WAL traffic from the primary, and then report success. The monitor then continues orchestrating the promotion of the standby: it stops the primary (implementing STONITH in order to prevent any data loss), and promotes the secondary into being a primary now. Depending on the exact situation that triggered the primary unhealthy, it's possible that the secondary fails to catch-up with WAL from it, in that case after the PREPARE\_PROMOTION\_CATCHUP\_TIMEOUT the standby reports success anyway, and the failover sequence continues from the monitor. - Secondary node is monitored unhealthy When the secondary node is unhealthy, the monitor assigns to it the state CATCHINGUP, and assigns the state WAIT\_PRIMARY to the primary node. When implementing the transition from PRIMARY to WAIT\_PRIMARY, the keeper disables synchronous replication. When the keeper reports an acceptable WAL difference in the two nodes again, then the replication is upgraded back to being synchronous. While a secondary node is not in the SECONDARY state, secondary promotion is disabled. - Monitor node has failed Then the primary and secondary node just work as if you didn't have setup pg_auto_failover in the first place, as the keeper fails to report local state from the nodes. Also, health checks are not performed. It means that no automated failover may happen, even if needed. .. _network_partitions: Network Partitions ------------------ Adding to those simple situations, pg_auto_failover is also resilient to Network Partitions. 
Here's the list of situation that have an impact to pg_auto_failover behavior, and the actions taken to ensure High Availability of your PostgreSQL service: - Primary can't connect to Monitor Then it could be that either the primary is alone on its side of a network split, or that the monitor has failed. The keeper decides depending on whether the secondary node is still connected to the replication slot, and if we have a secondary, continues to serve PostgreSQL queries. Otherwise, when the secondary isn't connected, and after the NETWORK\_PARTITION\_TIMEOUT has elapsed, the primary considers it might be alone in a network partition: that's a potential split brain situation and with only one way to prevent it. The primary stops, and reports a new state of DEMOTE\_TIMEOUT. The network\_partition\_timeout can be setup in the keeper's configuration and defaults to 20s. - Monitor can't connect to Primary Once all the retries have been done and the timeouts are elapsed, then the primary node is considered unhealthy, and the monitor begins the failover routine. This routine has several steps, each of them allows to control our expectations and step back if needed. For the failover to happen, the secondary node needs to be healthy and caught-up with the primary. Only if we timeout while waiting for the WAL delta to resorb (30s by default) then the secondary can be promoted with uncertainty about the data durability in the group. - Monitor can't connect to Secondary As soon as the secondary is considered unhealthy then the monitor changes the replication setting to asynchronous on the primary, by assigning it the WAIT\_PRIMARY state. Also the secondary is assigned the state CATCHINGUP, which means it can't be promoted in case of primary failure. 
As the monitor tracks the WAL delta between the two servers, and they both report it independently, the standby is eligible to promotion again as soon as it's caught-up with the primary again, and at this time it is assigned the SECONDARY state, and the replication will be switched back to synchronous. Failure handling and network partition detection ------------------------------------------------ If a node cannot communicate to the monitor, either because the monitor is down or because there is a problem with the network, it will simply remain in the same state until the monitor comes back. If there is a network partition, it might be that the monitor and secondary can still communicate and the monitor decides to promote the secondary since the primary is no longer responsive. Meanwhile, the primary is still up-and-running on the other side of the network partition. If a primary cannot communicate to the monitor it starts checking whether the secondary is still connected. In PostgreSQL, the secondary connection automatically times out after 30 seconds. If last contact with the monitor and the last time a connection from the secondary was observed are both more than 30 seconds in the past, the primary concludes it is on the losing side of a network partition and shuts itself down. It may be that the secondary and the monitor were actually down and the primary was the only node that was alive, but we currently do not have a way to distinguish such a situation. As with consensus algorithms, availability can only be correctly preserved if at least 2 out of 3 nodes are up. In asymmetric network partitions, the primary might still be able to talk to the secondary, while unable to talk to the monitor. During failover, the monitor therefore assigns the secondary the `stop_replication` state, which will cause it to disconnect from the primary. After that, the primary is expected to shut down after at least 30 and at most 60 seconds. 
To factor in worst-case scenarios, the monitor waits for 90 seconds before promoting the secondary to become the new primary. pg_auto_failover-1.6.3/docs/fsm.png000066400000000000000000005124711414244367200173120ustar00rootroot00000000000000PNG  IHDRbKGD IDATxWp\}?9{A @XEWK)Җ(L7ܼe&I2I7m˒eI$K@ $(ݳ{{G (8[b={~4M@!B!B_B!B!E!B!BHB!B!QD!B!R("B!BrRB!q_B!Ke !BV%dc M^z p5|d0/8 !R4B!$i" @NMv@U[B!EE!BJl|'9dBm9n>qw$`iiK!B!d%# DU?wc5$VqG!%EB!rzglb`8|%!B*F!x 'Nnh0`j&|db4 V6 "2 ۦ<xG,d`?V< n!B=!RQn=@ \TUE<c6 X%p vC$Ȳ Ӊx<UU!<4MC"`@,cαX f8 4 󪚦}LuU@,H@Q=@*$IEQH$etl>׼n(fx'欽 0LPUh4"f!Jv#BEv]?XK,鄢(rp\0L0 p"_Bg+8d痂ZPZS)I6bPxU ``AB*fC.cSElfS$f3p8xrp8X!@BHlӯW3dM%JRl `&niοfcg}C~B"b x ^ x:W3;~,L&h4wJOw>J`0JZ$dBH;z(OH&p8HRp\H$x~?U("*L&YL2$IfEXV(–'NӰl$ 6 . f߀RvQB(D"F쳾ݽH_U`)!,+&-[lPJ$I0L}>PSSCMz !RRuuu{ۦi!L86= W"F֢5554P!0!!N# CiȲ YYŌKyyL&r9>eIg%dRR-dYF(B(B8F*b=oiƦ{<ף#R("TUeW,\+g;UЊ BJd0==@ TUMMM|hlld! BȒdT5 LR08$f3$IbgdY(kf7_ByBI$F0D<,0LlZbAKK i4RrBJ?DQ,la~yt: WPU@+?BHB+"czzH/WWSkd("6 6uۖd`mimBbBHiDQLMM!krq _ђJE!e@Qqb1!ypUU* &n[D!"QD!+btt@8Յv:iKE!EpPPuX/15#("D0>>@ EQ .3ĪUhq B0_FX^ލ^n^^-1I!T !:I "fhllDOOr( "" q&wc0X G8! BYyEFFFdqE`nGoo/K=LR X/Wp ~?|!("/bdd@`uj*PA("KJ$B!Jq,ѿx477ӊTBHBȣHRF4e4MCww7J82P Y AQhƖ 8NֲbE!T !$I~:FGGYXnz{{i 2 U:yLX(hjjBSS(!BV0 !,Y144[nՕ^/m^U- *D. &''VF:Ge?z%9!B* a۱w^ձ˂`266΂mo6^}oᅬ^zM!R)?P|o؝;w#h4@ iV$I@SS2B!npp$ `߾}Ø˗<… bؿ?fff qQWq)$ v֭[p\P=~ӟnpMMM@ww7!Ie\t @}C-4n߾ HYa2 IL&dYhD]]ZZZn:E!rp\8< r6mUUYi166+W<=077M Irz{{qEI( ^/.^ȦPC!+hO<'x|3顡!;vt_~%DQDMM ۷bUE4;;+W NCQ6gl6Vb)0 !B@BY,cTٳgŽUTFqʕjMyfG!("Rajj MMMؽ{w,rq\v (yDkk+x lRB!B!jΝgyGX1IQHQ\~7oބilvl6jhB!+UBY$N8Ӊ~ǕzHE7bAwwwU>B! B!+]4ʼn'҂={z8Ed(tXeO[[6nX%UB! 
B!d8N<Ç)@… <-[wO!B@BHǏy:tCyda7oތbB!, !;|xa2J= *k.E!R"~ӟj//t:K=$B!ldYx饗zK=r_P&g}\.Fܹ#p8!B)2U0;zrlsSߎ5[a0V (BH;xK=fT*ǏK=B!Rd"cn2“#fuL W OSׁ5["BN TL kz,B!d ܼfCwl;q늩8fo `墎B47oիWK=fB˺B!TÇc{ Ozpe}}V> S!mVpbMwޘnܸ/#"BKCC?aRRfB!+(H$wlWUx|DEb1,*@eAUՂ(qL&Ed,#"LBQD"͜s4MC?N8s:옞 !K@YQ ڛ1|sX $EGil\eQ40p8qEA:F]]L&8N'<$8D&A6$Ir0 0 0zZlp0p݋!prTYB!h2LA L,cIT*um=xG"ix AGG2l^fF0;;H$HPU[Td2 aǁy\ݎd2 ǃh4 χx<˅t: Պ\.  7!i/!B!SoϩT $E$dY8qx^$ 6F( , AF4ba!_6e ìCEAMM R\.f34MdYpEA`0`4Jz YJ Y EQf:f|z೐(\.vŶU#zO?*AL&H$DXJ  q04 \VUvl6l6󡹹NbV槍ݾ}El6>ׇ̭A$SPUe o0Uی r6oN¿˿`Ϟ=ضm#``nb=:5fݳA4 ǑH$Fsd`4p8XW/)ª}+ ÁH$9A$ ի?|>U(M:F6em,ci\.R)XV(UU2>]. 3$JUIy @BN`h4zHĦ& lvT n4 @eOщD"ZHhhh G&a)Nz%^=`4X%f p\,(̇-& FUky:6%FNK ǜ?U>< GMlPp Mt*{0Lp8aJL&k׮arrz_^)d(kHgECE`~JSPF` y~WL&ȲN' 3.փ#nI~_s1G7zU#!Ll6- l6l6;SU^}&dY8FFF L;[VZօ93Yʲmc +j[q?,w^088\.* n݊%cah X+>E2˱%M>IB`b1b1VEzd~5o׫^/jZpBF%)BH88;08``0ydYV5O(ffNMllz,Nt^ za4]G z|}Yr98R)l6LV1Fhp8p8H$,N.c^**xiQnM槁y^Vܼɲ˗/c||UX,,{ .]ߏ{_>G IDAT}h}/d/_?ۍ[Vja,hT 333f,֫u}d f>Oojj*g~Xb4k^oP[[{xr7PD!KHI]êXwH{}U =$ 65lyZhUUԃƦl6x<6-hX,ƚ qVͣW1 HY$I6^`~IL&uGR dzr* BBKob1$ L&x<԰rB&@ׯ_G2dr=֯__=ǏcK2N4G0W~?nZ)}GFoTOQLY:PD!BO{zY,,?vv&R?3_7-AݾWAʟJF PGֿ7lM؁&u܌V:s̲, 5kS skp||}}}O>Y3333nݎ}Up8!{^lڴ lU꽼TU-7cW+Kc |LL6=* VϷ9$ .BPDztHU+-܉ӫV ARߩӗﭓL&ٴ'QYsP I4Mc$IqlYcY< h~V#2033M=8Cmm-[<z_YA466U3M(cvvfa۶m)Њw^|ey`_}<:T H777 .555عsg]q  zڵknݺ{VFe2LOO#d}>}WLUUv477 555ЩʦFQD"~G*iCBVJS("Txhhh@<40kު_W_n 4UhPa-th}ڰcǎeAXcRkI)ܸq}_Ȳ̦AvbE#2 8CSSQ__AD D dV+6)_S~%)*RV|hkkKH%< BH-@\ѫln7I  _/cۓ$099 I pSF;Unff}}}4 k֬VHׇ);v+'$N>t: Ʉ}zXFQ att|oߏm۶-I3 E `2y(hooGSSS4M0==\.( k0 ͨG}}=bd2B4eX ?sV6R !DQVDXȓH$r%z;,^գWPBVD" 4A4L&ա wTii/rىVeyL|Ȳ? 
v*3q5۷C0p?Xvms_~Q4 @ccc$.\d2ɶٳ,fUǣ'iT,r w EQ O0 hmmEKKKJMNNEz;=\A[[ۢ!լ !Bq $ LNN"#N[tլ~?X0FE pg͆&Z?ȚȢSv;:;;N;I$/N3Ϭ*Zq?Ο?@ cǎz(DQ9sPa׮]+b2 DwOss3+D4$&'' SMMMe_G< BV;D"HRlfDKK jjjvZYMMM%~Rh)ܺu X 7vUz$ eO=fhllDOOϊA^<'|ߏ'|r_^ن,H.~, ;Mpp;N)&EQ000p8 `}a׮]}Y_]]]hiigi |DsssE Ø@ `Sxde< B*T<G  $F#+We4TUEmm-jjjkkk+zzz*rF6&&&b%I} EQ0667oޱ lFgg'VZU1r9|ᇰZ8rLCEQ[oa(p(QQWW;w)T ΝC8Xz5nZϋ(d2 $F,XApc͚5ߣ'ۋDǡWJsRV*)O2pEu4MC2d Hl̗[]]]XH4 y&kڝt U122p8 Yޤ0.^cǎݔrt%8β^yK'6l(P8qv܉R, chhb޽0ͥVȲ WDaܼy4 驊)ryӧ)Z_ɲ< B^.<77+K|>QGQ\r쾌F#ۋt:~z/-bjj cccNϪEww}W9q?`5Vlذa3lڴsSSSB&)3oZݍ^qC|ӻ`ԀسgU=w}/BE>7n ɰkMrljĻI@0dL&z{{޾$quڭi0 FOOTFK%)0h_|?xit0 B8 t6z5\T*I f3[R?x6 Py<ϖ|0jff׮]c=UL&;|PFׯ_gKC{^d2lݺ[lCUU! !NppT [laӱn7.(l[l~+LOOrnn݂g~_?!oW^'`1~~\|_}عs' o>= `0X,cff`NشiSgo?AU5 ^̕+WpUlܸꦾ(8w8f3:TmKMUUܺu CCCf߼y}Pn ۦU񠶶^^appU 'ʶn@I<V9UUGAG>Yx/h\[[$?G\xM|xhӑ|װZxgi-쥣=vVO\zH&bxpMV#"$I(zx"dY GVZ۷oC4\p-UU PSSQىw}/xԄ!8p5%Eb͚5}6xG6vZCe\t 7o^tMMMl{g={@$r9ŋ?8uopyk/444>tCCC8rfs1~H&,ǧ?.\0:;;}vE_Mގ1\rA$Ȳ|G^t~mG?ZʗZɭ_ׯǕ+Wo`uAYV8p `U9a<;0X:qp:ؼyܧNRA`qNJzŽy"_]L8tPT*AL&T*<+; TTfggqq3UUϚ\.޽6ӧOczz֭ƍb144Q6ڊk(رz;_AB׋у }/b, _CТZqle)ׯ_,^{}s±/|~799k׮U\3GW]VD[ _5FFFSO,DDT .]B4emZZZye9QH$0;;P(H$gUUESS6o\U7n1l6l߾Lҩ<*uܾ}$#IN:h4 m۶+JO?wC˶Wh4/"JT+s$```yy[bLNNR022=zC)SNGnppW^Ů]QᔵX,SNA:thE7~>~d2aӦM%[mfff4 f+YtbOڵV"PTn޼q<䓥Jd28}44L&vH) Ν;#G,)T ӟ:0?ol$IB?B ڰqF&HV>>k׮a||Uujj-e˖^'"N@QȲ{z(KJ|J*>sHgy*BÍ7pEcΝNUPU׮](ۏy'jժeK6ʼn'dpѲ 777ϳUXwMTJS("JE7iZQ/P6m[+(\Qa0vZ_~I~ ]ƚ.fڵ gI~!ՂaG}V\ N<ΒUh4?غukSG}x"&&&WL޽{Y7EG}vG3r* 26l؀5k֔zX*i_*`޽`!i|ge֭+Ҫ(O>,xꩧRY>aÆ%;S5== .eR[ZZe˖^ڝa(x7ϢYVo&^{R]<+5 N:ݎÇS"Ξ=˖ߺukEo? xDd1/BPXK#2{J R>sdY EO4Es099 zef|oߏPB/pƍضm[RN8EQSa|p-!EQت%$^}U8%Y2<IqD&ơC~v' jEss3nJ! !ˈaE"|WEcWM~W^)0N(‡~!˗qe9rN,X,ӧO#ȑ#E5"V^W_}(ICE:u tG)i S-!d?χ~NYjnn̒4>~ a:u `/쳵^/{9HO? 
`EʸvZ׿l6gE.G}łgyNJ;PT*irF㏱aÆę3gJʈ^aqz*nܸ`y#G2iZPT Ǐf=<_!]vw^zR,=~:~_W^ ݻde׮]æMHJz "4ø>:djCqqR]!466`0 ==ݮwJO<0& ^l3Rijg`ZYox\.(r.CZZR)Ξ=ӧOS$55 矱sN;ۤUCzz:QRR2t"f s.۷ DlsN@0L8}jjjBCCH 'b6chZH$z`1 : 墨 \1&F1QL'M{]o`Ā[4M`~~~طoN'&&8{,N:*ooo9swޅT*%D"`x"?>cW~?Ǐ'ϙU .Jd2BLL̢gϐ>M}}=zzz;wśjkkaZ[ڭm4XLC7*nL~ %׳'ͩmE_;>sl!j"==[lq ٦ b'OO?'OPPPwŋ8z(r>>>ǏlsGŝ;w\&d`_Dp) n߾?|Q+QE ~~~hnnFww7qWs{oAQ05((PaZ1JfռdYzXz}B ?<Ho?OVBCCo>$''6ťpwwǗ_~gѣD6HHH@HH~G?~B& ظ;/xŞ={?0+~0??O4:|vj^`ttׯ_Gtt4°yfTUU=[TUUA EQǞ={p}Bbdd}}㹦AQFFF鐙 ˡRpܿ111hhhJž={P(pDDDc~!%%_@4"##P(p@"%%2 EA,_IFΔ ''y"=Kis8:u gϞůk@F8 Ê"0= 6mBttW(++Î;H%BbAZZC&B4*++ߏr_ +Rl2&5WլjTVl6 @RBFb<cccrDX,d 0V`asfp8j(z:|>F\.Zl6r>ãC0(MGJSf޸~px`bj,V+lZ;yKb͚5$ "..Xf r94 T*r9 ىFX,P___455j),,X,FO ** Ri 0k`4! Hae_i ˅dT*EPPX;aZ܌͛7O]"v rR)d2߆ٌd455axxEEE^?777|gz*9☛œ'OP]]M*.#l!`J0 s dJ+ʕ+HMMEll삎onnFGG>L*.V f ++kɽьF#wk0@jNXl‹^dEQjF@ MӰX,R+pjxzzN4M ?___VwwwDOBՂ(FD"$`ll 矑w 07taM'ӢHukoLޠc=o~Dcc, ?aaaD"!y\2imaB8}mmxyyyt>B@``fK[iصk׬fT\BkV9W,r T杢(ܸq8z[ TTT`0wv{la0PTT*W26/'KɢV!jt._t:bh4;h64h4l7X>>>rKWMatt˅hd?7 00.p\x{{!!!񁷷pQ7 {?tD"_ D!ЫG@Y-s 9! ӷB :BBBPX >hllZM' @HH Lrď"lLJͶtNᶽ{Ωl3S;81\htwwg`֭Ƌ/ls Fˎ˗/#==}Aq=|CNHRxB;wtXqpmP}-EQPt0 PlhEQX,`=]+%f 6Cib U ;1 P*o`0`0~~~CRRia2 2 SWǃ/BBBqWqx5ɀ1{FЩaԩ%wO|#7>@'׮]éSuwwC*b^!VGll,x<0 Т( <$@@@FGGJ'??gϞũS\b^l߾?Fkk+m@Xy&m۶Ҙr%lىzY.ICUUR)>SluX,6ZZFaCl/춼.z`n޼ __e@@ \2 ,>}y?V.\@~~>y`ځQܻw 8~CnmmūW,lz=M2:::x#nKFjK+ +N^ CR+<VuR~P#99hrF#FGG188Iaex CFFƲ]ojjB[fjy/hzK!W(pTECG"##q @.CP@R8PȆEEEϧY( CCCJe_'XV64)##%'sLLίp H HʮZFEE 99۷7 =oO?? 
8zݔvG%ٳ/p'zqEdff"22riO쿰xMj۷y&hf=qIY___bƍӶo2ߏJX,)bڵ$i5 Nl3V۶mÅ dN:gWKayC 0 u\GG^xj,[=hZ\rGPPCB@qq[f  -<1 )!006lX8 &|tuuATə F#Xdee`0liv=lg~~~H$HOO_wcc#:::}Law۷og;89ׯСC=@;J;mHNN!P]]z q8$$$ 11qﺇnjGEl8HLLP(V%#լ ѣGqM*D \"B@SSӼƟ={Պ?|,[={5558s̴UA!ܿζ8lBjwpk s;RՅI0l|w2Qy=vl!g7o^"RDuu5J%qرp7Ν;VQUU5)4;;z7oڵk {IP^^F@) CLĽJ%0888)ߓP(Dll,bcc]Nd%88444D"DFFiii6@ \kfMXQP[n/q7oĚ5kdݕihhF3g{`Z_R #b IKKmXؒʂOG\\6o޼켾fCT fn4!@4[|.vT4gϞahh!!!ؿ]DN?GKHH`wMBB!8W^… 8xಬ86)ǝ^G}}=J%7233V7L{`2m\~C\-[rvb͸~:IX8@D"8_~e^եhQXX%luPWW>PVV={ $$d tww\r| ,, dz ( ߿GGGF$O>( ]2LaLcZ0pL.\حm異ΝCXXؔ6n܈uH$> P(t-z=jkk!{.,*֯_OnZՅm"Ԇ 4VѨ#8p._jaB SDffsPs$ ͡nxN:5kZ۷o144ľX؄OOO`׮].WUy(J6@ CHHhDOOz{{' U|>lt'[F<}* 8rȒSQQ۷cǎʕ+ӆ|=z]]]QTT{B糿 ݻ( ۷o[~7>kbڵf{8aX -DooÒd8RRRHn%`7Dp8===(jda6qy8q% ;mmm0ضm,--ETTԤANbk׮%hoo^M]v #6jdd\.wHV@ڶmFbΝKTXT[qwwGjj*?iCll,n޼ X\{PBCCE稪07<<}O1400Ǐ^F^^^HKKChh 8m۶D|26mڴ,CT kA C1+^Dž pIa c``(**rHVϟGxx8޿Tʊ YAX8J---ʊ›vر* HR9wleKfGFF.[j344j dee!55!JLLիWT*7>~)O?a. !<;v`ollDUU|>v޽@ר0~gt*dCqq1n߾:P._'Ni_V7o/thujţGp%͛7FA}}=󑘘%0Nf( /X4Mէ7hT*EOOϤ6<==\>y 88rx ۷o:t%%%8yI$9sUb IDATO<Mӓ¬\Td2h4Ӗ_JD"є=ZW^A.h4($$$ 55s|KȦM2#0!@ ,"Ƴgϰ}9uV(--ӧI"_;qE`---ARR6oތ:|7v0=j/_ NMfk6ǖDi0`6l |pwwG~~$GcXpO)>Ĺsw9__xxx`޽ƅ˗/#00С&" ɓ'h40p8HOOGllBOOy^Bq5"ّ\v v)!!( h91͸~:Μ9C;Q]]4P2 Nņ &$KR|G]t_(JC\.Ӗr^JhF?:;;h`R>'DGG#;;x(jERRҒ%t===Ф9'H$8}4nݺHd׊ffݺuXn ]HO>fM X,hjj«W7 'HK,Vξp8CP@ 8"ByyB(BII N:E&vBB&ͻZD(bCrrrhooGOO|8J%a0mHIIq2lFHHRSS0Lb1۷/^M`޹s'Ξ=;B~m[8t?cǎrY,UByy9s YYY<q>>>#;k׮СC6@ (D",97nܘ=0'N)L07JKKwCQUU.Y'q===^T^ EQhkk8 "jJ]]]dELL vAplUx<mXč7P\\22>j5 !p={,Icccx-JZ6@YTZ f;vX!1555vb^986ȑ#pwwm&''J_D(g``$APݻt`k֬AZZ-\=) ųgܻwa] ʂ@%aTUUԩS֭[عs%y Ǐ1/_۷oQ\\ T*Q^^>AMM , \.i*ł^BV+BXV<366Jz/kmnNNΟ?q,)) |2oߎLww8ƃ>fܿVqqqxz=P]]̹9777YfvT\~ĥKtC;q/G (((KRDKK ! łDر;&AQ044///,30K򐟟Uɓ'c NɹvE;v ʥK7s" 3gp9W0h4sE2]1^vNdkxdff nnnؾ};Wcgz=LLL T\@ &BK?ZݻwS^ wn HNNu(@&aķ~oQUU@# E{ALL ׿B("44###H$cxٳtEQ7@xx8߿444@RaϞ=P(r ?}|}}? 
-- )))_ B^///cddZ)))d( b}}}hc&ٙ d2ahhEJJ t:޽;i%z6hT*۷oa6p0 \."""F&ԄR[[~|tG!77w CmmrkիWwׇ߳1$$r?;~Fv*))ANN^~ 'O 22ccc B__!ۋ={Çl;޿dqqqŖ-[ˎA>"ܸq~)F#d2;vOgөSPYY?|$lcv\\Nss3!ر/`h4(N:5ek66G!EQCoo/Ann.BCCZŤ YYY6eEk׮@ ";w|FAkk]V jܹs8rD"є >} ^ϖp"VYr* rd%닦&XVHll,zzz0 R JP(ш\v F @,#44mmm(/MHHH@OO\.L&R)NQ^^Պfl޼y ';|2OOOǣ 3 ۷l)u}B^^IHmmmx, o߾b;‹O?Eiioܸ޸t9:8q= e#0 HV J$J͛lUa&ۘžV+hfjžEEt:x<j;ٮi`` sxdQSS@kN/$@ ,0m5Zd2He7E``ଫ4M??W_Vr??bѣGtSDaNy:l/"6?q{yy9s1m+)lW(x(Bzz:.](lݺuG [k׮100XV$$$ %%&LxTUU!<<|Bh4qn:tLt4}8'|_/ǏdLvMc}Mw}fw'lذ91|c޽rT1W r)˞>{ ɫ"/)+]KD+& ~;v?KD]]G2 sرc싁Zƃp{nx{{ϹO@3~uşҳs9 Yݍ/_ œlX,hii;PZZVڵkzvall O>0 ¡CVZUUe63E @"'OD]]{Ibtjkgӕo;옩u֡ LmLc}Ǯod2A&}TzPTؽ{l"33m @p!D+-cA,;ȪՇNxs>|^^^۷o8@&shZ_ħZP*_F#Q\\b_ d2ӧ0 ޽{].Brx222֖-[eTVVPN}66mڄWN+/_frOOO6(ܹ^0رf|||Ri}l$@ "Fww7"##g> DZVDee%n oooF/(..jj`hhMMMX,i^^^ؾ};///ddd ==VV*4M <suZ[[>QRRbFvv6f3nݺrbVEEnݺhߟ{Aa˖-aA`_Dϟ?ɓ'gjv}aYE8[XP9sfU 7FGG(M Ann. 886m"߿Evv⫅z.yyyDvvtwwǡCjq-4"ssss;۔9!>>ޮFEE!** իW󑗗G1VmƊD ,r!@X}ތ,hrΫWa3III~0 ={Ɗpׯ_lj'\ct:tuuwR5wwwb޽hhh\.\.ǃa9zjZغu,ii!nO޽?Ν;}hPZZ ٳeŻJ^~ڡx<=Rp $&&bӦMc@ӴMXf@ESZZ:늚hDww7<@V7M V+{mG`@]]p憤YK_ݻw'֭:lMHMM% T*vrY/D"9r& ?(ݻBT.k;߾}!e 4McǎV\Ƣg8DXrD|2>sZErv Wp۶6۶mϬd2ւ(@rr2! N_`͛jȟgϞ5LؒCS@V#++ aaaKBHIIAccM[[[ ^Bg’$4gϞahh^^^سgϊaܾ}@dDX>ĉ'f ۶msJ'+p&@-w񧧧rHJJRWWZZZلKww7:;;NBp5***<_\ӧO'cuP]]hV%bx&---E\\nlS"''066Ǐh4"22rVoVW 2sayA ‚yɆ GCB@PPД>d'#jDGGȑ#=ŋƍ 6ؽ{-rMVp7n܈/_:%w:002X,߿+ܼyEEEN >>>سgŋ0 d`gu r0o޼ƍljwEsl6ݻX,O>ڊvp8ddd,Uq۷x ABB}IOObccmڵkq%& CXXz=@K 9 Svŋ8z͘DHH ӓ B6ۛ7opEx<۷o||+Nܻw~)"""m]r0Q}mzddd ..-ף@DD7 à۶ms9ٳi]]]]8pD"ьjܻwnnnصkh%PZ( X,lڴ$r^"KDqIͺun:L&TTT`ll +2t ܸqQQQols I_YYf3r #B ¼{.gM^Ip>z~Z%66]]]_IlooGcc#}X,>f3:h4ŋP*ϑl3p`4mƴتwPVVՊe]}zܽ{<ZRaF#>} JR?FFF\&?@ 4Da aJ#X,(++D"hnftvv@R 'F/tӒds%!! 
X,x1Z-BNÃ`2pU|l艢_^^޲[n $Dnݚ,GpIZD wwi?p8Zs"**B@]]Z-|||?a06cFbccqe-psscJ;wia۶mPe2<8g?q_\^Ȳ76я(TWWcpp]vy<@ sFmyee% p" ÀN҂xLFF둞$6YV\r7nS .`ΝӖ'T*˗/p{N"8Ǐ#//f 0z6|}}QTTaXСxxx 22rV`.jia|BO0kDž 3l6ÇHLL\vI#<=]'[߄6@ VD"̙۷oXyjB&.oּѨ[H*ӊ7M’AQjjj k֬la"a'@CCÒ KMxx8ֆk׮իƿ˿Y V'8tÀU0sŀd\-~o- >}1#??^^^;UD"Ç0 PYY ___;݃j۶m~:叒@ !aNT*ϟ+sYCKR_HNN-5550 s{.6l؀0@ hMM (Mزe˴UG&-+/ [2$%%!)) 2P~k\f ƽ r]pbH;;KWUUd2#''>>>Kp;v˕(deeA"86ɴ8 WF$·@9q?~|ϴZ-<<S凫` Z4kMxg{cэˁML˷ZJ'%%)))P*}6z= V됾V:$@ טiʓ'O?gH$N'',866izmعs_`4MR}it҂ \>e[[[:;;3ZHrrRtz{{a4gݧtOgLm_~}Qm|rQϗ.ܻweee>mۆcǎȑ#s;\p{2Ʉ:8qbg}}}x=R)޽ @Q|}}? -- )))_ PPt:dff"%%AQݍ7zr( cƍP*W_ΖS~!FFFjL?Ooz4M#<<111AAAX,FNN^|"ݻw(..Ƌ/  vZ| txDP(Dhh(FFF@4(©SPRR?_pA__B!^_͉o ///|(//EQP^Fj`=Yl娩0::ۮۋ^9rVǶCQr9d2;QEQ(,,/! hpy;\ł~ ϟht\v$x̗ կ鐝;y^3/\)(σRAhp#^ԿlG|<|րk>gCX \.999+`0`) EEE(,,d(H> qYv^`U*rrrnwoBPロ2_ B_|Ŵs DDDL:m7/%%%H$4oDcpp0֭[$@ LD{Jގ 68"\1 ZH$T*W>[n!++k^ .L+`"""Ӄ\yX,(,,X,FOO+1 O.0xsP"00݇ixxxuuuHMM0/ֆHʕ+AXV a^ٌdh4HRbؠ T*vJ8p*aXmtl[RbuuuxX,X,hjjjE`` BCC:쨯Gkk+`2`XֆBtttv޽{VvN~l0hkk0b1^z^Ӊ18vؤk%a4 . 
ɄW^n4 +Jbӏ=0X,Djľ]]2GbDwaR@4`6188 `2ؿ=`0`ZR  ///zi4 ʶ@QV+QSS!ttt`T6YmC7Z<&1lIIˆRCp.MӨRDTT6olكGa׮]춸8"554M?\>y;oחm&`6/QQQJs\7CZZd2||| H6eA4Îe9NlFss3{ ?]"Lʚ4ohjjbWx 222~_ʅ0 s |ζ0Od2R)nݺ$j<~/OpM6͹ >>O__58ˑ7Ll_|QQQl~|Og:&a4MS.](dddP(Ľ{l>/شiݧN^}MDBV jjjA4㑖I0r^ә3)l}L'nUy!6oތ٠h`ZRV!ˡh0:: ^R H777~6lbV+x<ޤEg }q^^^0LD?{oE뢊QnPooūmٍ'b6bcgcc&7;DLwtKH-x" r%TP'ueUPeAq|_o~G~3Eぢ(F>>> 0 H$pww0!!H D1r^#A٬_vGY+}Hv{< X$< ( Nu;r l5RVn5.G?5w"YxV.Bais}n`c 鰭$&&B&ͰT|K̴(??YYYv<~SAYF_Xz-2 pqŋطoߤ6y&X,\F-29/^V J188`0ٌa0 R @J ^D777 CC&a``"b0L7b1 a0/Ut5RՀA5 bH0`${oL&A#11 ]0n߾MNk׮~Ddp/ .Fnnn;wvvbӦM,a*Ti_vh0 q̙)}ƍy8bAuu50 $ 6mڄ͛7Z48 {Ti:JhllV@ 6nnnD􄛛$ D"e񁇇Tae-=GJV łAF&!!![8i=Gt w8 @_bKw>/_ʕ+Ν;~~~SvA_y>K \ Qr޽̘ane|pq -FR H,1 sPWW#͛7屓0P0 jh4\lwwwL&4 D>Pwww.Zz=b1f3<<<X ///D"C&ARapp+0 xpss/:;;p/z!`28bڠV188wA ^^^J%K3cPED#?yLH]?`I?x|@=TKGK&݈"G.Z7oBarLOOGnn.QMO.Da ]f3FZZ,KDmN8+WḦ)PZZJ\84MիW,+6n܈k׺Z4bڒ:.usnC2 2 v!h4BB@ hhEQ\p FV HoooD"BH15ڡV9%y8\ _H0#`"#X k0LQkqO TCiRRRj^RAcxxfw\6D2oooH$x{{#00aaaN͖fRV@ Ҙ[[H$xzzB&ח(% LOOO򚶌'_4 `ĊǢ|Մ%KB_?"5e5bG"9sơK9a`J Ʈ]&5!!!C__gZ\@Xa ח6_~%"6)LBEEm6NqhF=hYb )j5Z-t:a-RbBxMO`` g$xX,h4 r\lT*(J(JP@`9J @$޽{,Rsj?~b& ?4.7;Tz ϿAXXv#** 'NŋqW|˗/ˡhpuz߿B8 byy$ ZZZkw_uu5G85at.]&:ujuj?EGCCC``CCCF5B:>>>Ppww,U0882jM3m2^^^L&CLL{Yw8BVs 7EQ\:vkUް5>>>Xt)/_>#N֭˞7(k(Xk709喻;R)Ŧ7֭[4ܥafAX.XI_ ŰX,_ƍXd $ >C 99ariC__6mڄǜ>}ϟǹsz.q=lٲbyYU 69"ֆuͲD000'NŴ_/IzrrCCC6ZxWx8v>|19ZTVV"**\pssÙ3g?dbL S(Fѡ9iuu5Q#J(lܸ8vؔU*@F#޾}vF΂q.HNN ;03Y 9U8YL&|JN0VFӴͱRRpRmppv!::+W\PzzzHyLUU233*ܹvZ~b `ݺu6A}lق"33ӕM 5k`͚5hmmE^^c5gϞŅ paP@ Q8ܹ]vG]]]HMMuZ}ׯ_GVVz{{QVVpأީmmmhll?,XH9EQ\&BFcX L???$&&Ϊuwv*ev?keMK>f~v]"""OOO5L&Ӽu]#ZvFAA8R၄y~>jLɓ'\@6C5zӦMƍq)OQZZpj;sOOOdeed20ؽ{7R)RRR|rbʕ68G@  Q9yܗ>M 7Ft\p.3e-_YYYN?kky,Xd ֭[000o&;d2|}}dɒ97nZF]]77@*bժU$,188`Eg`2l^>)֯_O"-- O<0b]֦ݻC{X>|0hƝ;ww^xyy!;;555Cvv1޽4WA 1Ʌ`a~M"?6ԋ/3i?|8s̸M(// 8k"..yyyؼy3]-ꬡhj1<(xl,jzCaY8#`ݻwp?ٳgGK  ((AAAXlyi%^zf.w\\ܬv"LvQ lKܺu ;vÇeYȚǏsظq#BBB¼G$a0q|||k.tuuŋ8uԂv @ A@G1 f3$ɂ.4BCCQYYi7T),,ٳg'UV*b޽˳.( a2[&Aܺu k׮㞦P(PSSYc͐H$Gooǖ EPPI9VEee%@Q∫<&|f6ZFaNQ>R{yyEDDlBYYYq°jժia"q1(J\v 
/gW6SBl@]h_ŋœv!3{S^^۷O& (,,0 ]}x9AӡfZ& 2 <4MC.KtR\rotvve@zz:Qc+k.S[[W1k(\J帩pb˖-x3իqeDDD@*ڭƝ;w ӧOQTT //a۷u@ yZ{cEWyd紏(jZfܺu ٗ ?iiiosV:’%Ky̻wc0 dee-*KJee%RSS]-hoo_PFDFF(ݻ7n#P `$晽Ǐǟg_z֭[6?gϞ%k;.HIIk@Q޼ye˖Z,>K$WB DqdQ^^NRLCzٸ& 7 IDATa㣏>v0L(++î]P(Ӯ3t:(BQQ>3bAQ=z^ۮ|dxx)ł=6T1b|9ro޽{@";;{=aj|;v ϟ?Gnn.~_Z$QZZm۶Z @ R1Xf͘$M<III>ҥKN PfZrJ=z999sΘDsOt-OԏQݎ)G}>Uyc}3WwiLމqsA.;o2!HE3!t]T.((={lwà( <YW\6l؀1wQ477f````Bƛ=>LZ<8b>TN}t qɊ9g~:C  q#PUUÇc`` ϟ?G@@nݺ!cҥ>P\\ P(鉥KBT `R]]]F2ry* `@TTWf())իkjPK.q_EhFxx8<`?(B__~ŋ APPB!T*R၈tuuq2tvvHIIi&P\MII ~ JCCCdXlP(_}? hjjr؟Z)))EQ6}ZWWիW#((uuuS2~$3ǨƺuGAaa2q'[ۜ+EQc xxx_}vlڴ )))l2>SRqgʕɓ'g,;/q=V~1Q__(..FNNBCCQRR Rޞ?Fׯ_ȑ#! ۋ\t:dddfN@TiP>}E֭[kZ?bn._rܼ EQPՐH$/^@kk+>#`0]]] E@@B!8kŜ`P("##O&&]r ^h4E!88rf999H$ͅ`ׯ_m֢qljo(jR_~2{޽{\///tuuG[/&Ý;w`텅v=#$$}}} χbqXԩS|2Μ93a{Amm-o>#.4M/?1<<ͅ?c:{vnn]'+_||n:9;P($n E ػw/ ܠkM]]L&NшvH$HR|qqq\y`夣4M#00+c0eSUT__CP@VC.`0`0 ..Ϟ=0<..rR4M|||p-JD"`Jdee0 8y$#0\ohX, RF\\J%֭[ɶ}\v- WƺO߼y8puNXlM6Aq lONlƓ'Opi|`aaa} Fݻ7qHLLaztvv_dǙ-ˮ.cŎ/;OvZ4 Twaxx*zJA`` vsl6sۣiMuKRTUUqsZ`_/93nKJJDLL n޼mj"z,1dVcװ6[W:9[6 czn à^N0_x.L&0 FMke8zƛ`Î>xR oooF ^C,[&'>콧 QQQ61m}QQQ\߱fu=}e:noi|s8{d070}}}c1wܙkz=SRR_ZZʨqwޔֈa&''>k:u4C{Ɏdd3QH\랽:Xeo(bmrfpppRm+~v ޿AiDydaq.g*7<@bb"d2 Kz{O}Ib-R 01˗vy)4ͬ* 22\oR >wʕ+pssGjj*.]jslvv6 ߿b\L777lذk#5y 6JJ⠽\Çjժ1(,,DLL >1٘D}Tp5 F31OիWqԩIرc0000c.̈́ɣGݍÇn6xqq1?N7obϞ=@ ,PhV>20? 
(,,i׮]CTTo>b l2t:TVVBՂiA&A"6l7233UV +V^LL&T*IJJJJBttKI***  HTWWO:`\ѣG X|Eq:?FWW;g4/^@gg_bk׮ţGyIsN4443(!՘fAb۶m'RB}}}c%ظ@ ZTUU9YÇزe}lkv<|J;xzzb׮]UUUjP*/xzz"22"7oݻW^ 99iii6uΝ;\ u???lذaҮ#>>~A8&|ܹ\բ8/_زe 6mdLii)RX4=/Ƣ tl\Z- <== ];\EQx"3f'O 55b@Q^|1;t:ė/_"!!FH |$ELLWչ΋/؈D/}}}(++Czz:œ.Crr2^|ei૯'|2cǏ#''F``Lp14M=^hii&@"d<+Νs$%%Szx,YK,OPZZ۷o#>>wޅlx<<|2  jQ]]ťxظq#ya&:/_ƩSHENWW״8zA pk=4 J'8|Bkk+>} `񐐐 -\f ш2 #11aPYNx{{ϨAtt4&FرcS:X,Ƨ~ׯ#66 BkmzSSSq9ŲejQ(cJ"Vs7ob߾}_N=z,+{޽{q`Ϟ=jhnnF__ZZZ &&˗/G||PWWY0 Çvc7y6;wDYY٬{w|>{h~_|> DZFrCRKbƍr͍sYpssC[[ݻX.hxx8V^(,a?~ ٌ#GZAΈ0Aww7wmyzzBTBVcΝsnjrł6bqC|>F(SL&X,|DFFbǎ\^ܽ{4Ms&`500Сrh \.ѣG_?B&᫯™3gZDfRϟCgHR … 8~ Zdޚ+WlPXX}^WWF3F1$H0<<<[ݑ1f;MPT0 Ajj꘹ӃF455 e˖!;;111cիWix jjj`lڴiVng[naҥ .KaT*`GEED"֮]uy>XB!7ㆆP[[˭Ohoo@ @HHBCC2'V{޽{vƍ7&sظq]KZ[Ձi`ǎ./΅4HKK#1¬B@deeoٳgg%.1޽d2v[j^xA^17F޵k~g܌:ۿ:;;!H  ӧOc6- ڊz|dO~ \A~~>6l؀pWCCNbRpmĉ6nܸww#( x ))+ގXr%(**>F.\@vvPf }y)Z[[qq4msƳﴶ|>[lAZZE*kDJJ \R:twӝ(↢(B8}4d2JKKk. <( ֭V```EEE8{,Qᔗ<^zQUUG92hiiALL ]-δy5-[fwT2رc駟NXf py={vNGz==z^DsXTTT̺i\r?яfB@ qssùsĉ|!#:uf[ww7&-X`ڵNCpHKKí[q}g8 /w7X,FC -;hAQܽe!02 |o ^_~577]Ǐ/X8씏2s'O8; 7ǡCYf ?pA9 Ǡ( H׮]bEEE$~@3,ܻa 4Mwk|>222N03رcǴDQQ}QZZ/_n !ollDKKM{們yyyڵk i#嵦Fڱdnrzxx7 ӓ(%',3`PAOb², t* kGwS IKL'bAmm &˗K>{5oƫ>>>c?mtt|^^r{vWB0aBB|eҥ8q.\Jz] 0kPPPdddȑ#8rȘN[l0?!e Mtt4p 0 8UѴXc^FyD"All,ꐒb/&& ϟ?Ǿ}PSS\! ]]](..EQP(\`?eeeP*BBBݻg0YSSw}?HHH0FFFJ%r9ߏ^_ZEQt(NCFF@Q<`$+???P>?w}tףRJ"""ŵtDXt)/K.EII S !## 닯 HII'4p? 
V(HRtuuMi_nǠJwk;M,cFWbMrRoڵxFٌ!J%a4144s2[;..߇FQQQ6ԠSb1 0466BVsDZkuÇ5Ǻ9>`R֢SرcDeeS), =z˗/hhh}LZE۷oa2@ f 0LQQ>jYSذaä8_DXl\z͒?`8sM|aXV#>{efrSq}^B1Go>{m*..!Hc/v>^{˗سgM{e˿iXwqv\QQM=֮\70BTrX$/)8Ğ$BǑ#G&]>//ϡ[ĝ;wn:d[9\q͏w# o?яmdh=n<{tKW\'վBV`ӡϞ=޽{V'KCCZZZ*hmmEKK x<(† &~3\L4.\`ݓZ2<@bb⢸y̧$.`!,, |^z˗/cٲeXfŚL1zځ,\r4Z>hgO ]Gk%KٳgL&K1f4mq(C;l~ĸ(2;ߞv$^2Vjk}rM`֭kI=x;:+ .̙3o{9ZOOS۫׺7o`ٲevdhjjrERRԩS.͎T*czX,###elϟB@0ìX+V@kk+._XZ9MEEmf_4aܸqC;߿yyy6 :{D$ : CQUa^1n^$XfVC(" S~fh4ݯP(L\ΥKpȑ1ӧOqԩIJPTq ׯǥK+8qrrrpa:~GPϟCTB!ٳ(|իWpG;@p&DDbbbׯ_#//AAAHOO%BGfl ٌ7oN0}x<I)M$G@X<0ЪӚ*,Q [os¾ ͇ׯ_G__bbb[nF2uLxܵSr%|wضmۢp%xŘuW^aӦMNɓtҴ&B,O?ױlٲ؈&H$† %l68}E!aVC\\r9 @4<8 Ƙwww#""bιl2L&p1gs+VÚ5k,5[ddd!(0C})@Dz4| P$F܆ !sNgMoIQApTT8n7ϋ[]u_tW{(GP@8 (" L"c0HHȴÐ@$;!Se{^og׼O-x8x ͜YMׂ((J1vl,Jnٳgƴ8GRÇ-jjkk.Q^^nx@pYTWWclllfhllH$RDXXFQRR>i1aMpd2$&&LJizUPUMTT8m{6FFF5'tOR hii<wwwBY$ r|YZpuuEdd$_i=JW^addthDGGE_WUUEm$ffftRR?ZmH,/_&gEtuu!<< HJJǏ75vb444`||憓'O2[y緮@ (bggGGLonnFUUpi\NMMZtlo]...GOOiJBCCF Wݷtϩ)ttt* VVV Cddױlz_SS:#88.(@SSB!p)ϣ"r111tY颫P(_6<}_|7ɘ'D~~> +?ZJ?vN۰x8;;yyyLb4r9pUE˗q/wEZZ,g=R)(BRR &?4 ƈ@QY6x...EQBUUx<RSSitcc#ٳX{{w[nFrbɈD"B"bСC&*eeeH$7 QZZ OOu3HZq}1]C\\jjjЀSN pLt(((+N8{XJz$ (ѣG״&!]6kRP(*.|wFFZZ bofѣGByy9;T䶶d~@CCd~~~oD"ڵKo"&>`Z8;;C(2-`ペXA?~DQQX,<___+wч LNܹs999f ә7Xn޼ݻwg?՝pcddx8qF177{!,, 111K055{"==, JB}}=***RpqbfbTVVEQ(**BBB¶^ ??-׵cLNN< ġCp-lڽyefwww?~|,66op eiD"p8 n0J d5gAc~~i1Lhh(BCC,\hllD"AZZٚNLL$L#"")))MI( EQBff榃crr$cH6wVRrܼy999Z'nnnx"QXX$''l6,Y}-ZZZ ˑli-oNMMeu/Z; P'N>% ˗?\ mmmD"J9psHH<<}HmaTTT //U Ǐzm/tww뭾ɓ'Q^^jVg겷Ǒ#GZ*ō7PRR mx<! 
¶ G𫪪H$9b b"I&djbnn ыr@GGEDZcǴ ?7'ODYYIXEPPP]];wNckkk\x߿ǽ{p…5vQ__>LcvvvtבAR!99C0>>tf| q VW@`QaO!y\YY+>ϺىQܹsw۷x1 l,,,,`U=4dkaXZ4+++:>C HxH900eɻ!,, FRRҶU1))ɠq?(BAAָVxbp[[[\|+rCII XoggK.K@,cxX,N8aV.ۉW^ 2VZ8{,Ӣl^tww "8<Ν;zkH8a)//RA@.A{ 44hMMMf `1Guu5glD"Aeݣ۷oܹsZxصkRSS7LAaa!0bٳؚY 8@ 6 Q+رc?`1;כ7o0>>J0cٱ߿_o` GGG9s+!bx(Es'Mʄ 0Ez>|]]]())g}LTTlΖ@ҥK0k.ڵ (((@xx&M6$eee={'N0Ggg'D"$ BCCqQZe]xViǎQZZrۿYXZ"BLA [(,LT^ۋRڪ㰵tMMMHIIYvl;r2XT|x C___l`bbd(~l6~~~8r#\"ݍ ]gܻwl@bgggʕ+H͡Q\./^ķ~k׮mJ@tvvGxxl2qH*FaQz я~Ĵ8x@QBCCjP7 l6d2 H'''D"p8&bQ k9MZ^^^8}4n޼53NBB={n"//oŋ7EDD ""GwP&-SP& ꐛk1{{{ooo2&!H,F₰ )ӧ1772?&s4bYLَf $$$drE鴸J޽C]]R)ܐ`Jr* ___2-Q7==޼yлfILMMY/{"""p޽m@g{9H]յnnnG}}2<ػw^47\.[k422h/0z}333wA(o>$$$E1} _S᳒AXʍ舘"11Qk*PeFN(R(R1 `5 ޽{gJ_Ǽg>M憩)ŰH|}}x9s>|@qq1JJJJŴ#Sʍp JпwttǏ>+cJGAAAppprKINNFcc#J䋊ȲVILL営>Rt333:߯]@=+ !//?2 hllģGPZZRܿ{6\x- 222py?9bŋ "##AQfggGܹszVYYY#Nw}LpU痶uwwR$b1>AT/_D@@fff@ Dzz:? &&~R__G_䄡!TUUahh?R\ՖknK5E\QTFWPc72 c ) lX,l111e|}}62șǏ_WܸqcUBRʊ &R3s%%,Z}<GCCCīWYC666ⷿ-ܹ$qiw}l|Vי_~%ܠT*1667s=#++ SSS\sDNN1ӧ۷o! ܻwh\\\??sOFU]]mA ]! fhiiwWjooD"#RRR 6ٳgzp8:tCC\ OJ6lrP\WWWDDD@,#,N]\\PUU^?~sss`T*JBXX?~ BA+ v>d2ڍmΝG||3&A. q=d2#!!˞!EQhllP(D__R?wBAק-uei`a,ݻxT(sjX, vʊ B J%A( y ]BBDnwDAb1u2##_}lmm{nEQ D__ߪ~l6 X  F| vvvt,z\.//M:<=> 33y<X JlCkk+\\\srssX{QT;R)Q\\ T h:::pQKac냇Ǫ9ʱk||;v0ƫB ,lx<}}}[J[TTD0RDYYF^MiiThhh'<///e , 컹Ϋkci&:MY4TN}Lsbb;v-=ꫯ_BZ[[{Ԋ$??W^o^ʽ{p9Zd2d]홯|nkHMM:x&˛m,ѭAAAFsA{{;Ο?oF?!/fXĞ={E)RB*ሊ22ʽ6>}eZ03r}7‡/p13u>@DH [[mwmec 7mHOee%bbb謆Ns^nr1Y[?1gX{YKrgm\vMC@uu5꺡,s|KGOY6RׯQ[[ G\\КϞ=3b'&&%%%:+/ޝ5SSTﶩ_s)2ݻ~ K466"..N8{,6\8vR + J!wqq˗qm}6(v&''nP93\~}ˮ}555Ο?ohiiR)qaQk_rw7,Ƕ7\ԡT*g!@&t~2*dr9l7fm`[{.xpr,˦Q'EEEx"b& Qꊩ)1-E3;;]vmcǎEii)R)x<Ҵ 455m; ݻV988@*KUy%d2~oJgϞaffz nb1^~ h݈($''/5efff´:ccc v߃p.l.p 7 Aee媠` LA߿Ahh(Ӣ& QfR(.kBHR$''[\cR[[k4%krr2el/6 B1ѣGQSSǏoPL&""@uu52331g188.N?~/|Z^|gΜ1̊mmmŐJpwwGbbxoYI4775Ȭ#$ɦuDď@?=%! ȑ#xRRR"0;;;;U*ZZZD"!<+HHH`Z-qiAR!88srrܜYL{ #ke@{5 cM JJJaaaq~Fgbb"jkkUZqC144a[xx8⛽{>}:iˡ4%;ؿ?٥bBn@NNݻUq\~r,Q1k.Ǐ'q! 
2ۻL*fkDOn[[[QTT677WB!coooY Ӡ( :&WOOϖWPPЖ"!!` tu9旒ݘzjjAqq1l|  R7` vf Q2 ޭ*+޽{w^T*466VVVHOO7[!c$U0ƢŬj"$$zOnkk LfTb!99bܻwGckSA[{deemiܿ[Ju *GFF B`1CvehJ嶋OP(hY}fT*^/a55GR)^z9ҥKLD 5g#",,  cZdq-fӁҶEOO,Y|ݻeۡmQ 7  IDAT333xĉr Kf< 000@F0lll ˙c]`1`ii)$ ;xYҹq"''g˱jjj??uj C6LfVSgΜAii)lmmM>'a5D`mm09sl"hd߾}Ȉxxxi1􎳳3R+ ))"&o޼AFF2ÇD#斚: PTx%ꊣGĄ: ~ӟnPTZ3۷tJm;;;lk$Sע,bΟ?۷o23hT*cll , HKK#uA+ɨ1頣 s>,l6\[[:9rDysHX, f`1pÇT*xy*Jܽ{{Ann933wŋ1>e9BxF999LaT^o$@a)j`ll `HLL$ (ZCMM T*2~晛[ܬ6z2Ѐ 3*eL=MEcaggLf-WWW:,FFFw(^zNaϞ=Z>#ܸqz(//R)BBBp)21B0 {Ɔbδ8T*eZ bxx vC@aMN:2dff2-ʶ`dd>>>La2$$$ >>Źs8DWW7(2lMr;;;Ft\(z_HHnݺT Ck,/^@.g?ٖP(>6C*&FEE=ʴ`mmcǎǎcZp8B0 PVV___\|iq:l`x<$1Ȣ˜aX8~8$ JKKX2&păLF222fV'@(++\. N8w[jXXX@AA94b޽6T*8pAAAp8tli1;;K),???}"d72||>&w@ Q̙3(,,KlKi.\@{{;;dggu;wi1hU|>aaaZ@&Q"C<P^^J+++$$$W?`ǎˮSe)PbKQ3a3?H A_ee޶h|vpIY455A  իy #ֆh`Li1LYѣkG +DDXݻw{eZm;&&&V-,(J!mJPeR'$VÇN1pssٳg"wÇg>y 3~L@![{W}#05q`O{L cr 7]ǻwHJJZ3`۷o vMX\ti1 E@X !@ZhiiA[[]f2Aw{A~~>ٷo޾}Kg˲TZYkb6mM[jv('''LH?o_"3m*E)!dvn>vxy#h:uWX(ۦQQQ6.ە#p}ƍLC 6 Qt&++ >dZm ,b HMM_yo>1-Qzr<]f#EG}~u{=c fM=K#<<]]]LA Ejj*@ l"范'E ;::;E__ vy~+Pyܸq}R-L P(׵]0>> V@;v젃 [q jV&z7RxFuvuua/k4+iGMIIɺתRz.{?~Rg- DKsL3a= '?Y񫯾Bqq16-hci߰>GK@ l7 aC:tnΝ;eۢ^ a``}}}ӧO~twwcvvJϟ?$rss111ܹP({. 
Xtv -+![KBTA(Jtvv盛'QQQA[466b||O=www777H$>|SSSP(DWWKW8ֆD͛kuyuR_~%ܠT*1667y>sekmmaݻ t[UU!HR ahhz{{6M[:l ɓ'1M_aee*ixׯ_˗tDEEm?88eV7{`h ~V|PTӟt-[(JC("%%O>OS puuŮ]Յ |pqqg?HOOGgg' --mUo?=&vN>M\ :VKCC\?O8cA*$|xxUUUx8R)!J!Jfd2R)fffz`rrQQQ@XX]78T*D"X, ؈N#!!BPIMM [jåij?9 L_ 155 m) H%ܹ\.Erގ08;;mO:;;Up3( LMMo߾ŋ/Úרji`J8 Lr(W,B"r %>h^;"֞{{{[xvS+zoA]JRO%Jo> U[x oJX\zp\ ohjjL&U]ni_>KX,=bpinvL[:C ,]5uuumspp ٩g||o׌W% q9p8 -z>R`ggǣ0LNN">>SMvttWcoQ(zI+&|>#o޼AbۍykS((//GffNMԴ2ꟚUOaa!?/~HMM]3>|Wk 8>G]oc+PRbA׳X,MUmn˃j`f'Ç8s̚looAHHȦaNc4&&&099*vVñlX[[<)KsX]p(6s;h[mlCj]=q]v|`U>5oJӹh}KW+﷖|i]\*{]|gHW:iN1mYqq16o#$$˷ hooǕ+WZz;MwMujV-jDEEuybNSǜ%q3c(b8>>~-`kkˈ |||-gmmM?օl#e?5MתgL]/[ɩS-˗/Znn,}zVP+DS۬um齍mwT@(B.J.wqq;BCCq-=V2g& MA)agўV߿e?u!1Nsf xຯZ8??uhs֣F~KW+^;]d]EX,㆕?Խc3M2muʪ] ;;;ݻHHH@PP"4 ZK_D"0:: HD+zmllP(`eeJ{{{ ""zc-8N.L($  xv R,`8V6wt= .:nnn$ #dgg3-S$%%mb=ID ,P(#YI 2c{E}}=:ĘP(NeCR¥K3#>>iRcc/$  ~(ߏ1Zf HLLc֘2v( QN! 0?; Rw+EmpӢ|ؽ{Cu|V:KC,iS?ۙYrH)=IEX u|).˴(磼ׯ_'V@Fg3i ݻwnb׮](**" 9q޼y/^ --iq bAUEYCCêX/Ƃ磷^xX,yT*xzz"((qqqfWbu}qWxa~B=֎ZN;Gg8{ZĈbp܂899!==]'8?Ӡ(#//ou0>E1o߾@ cE!F(S|\viQL||<#9siqL>l :Aww77ٙV`Ϟ=foJ9Y9-i]1ϭGxx81;;2H$;vl] 6 R uN311WWW+wΟ?尥d*4|}}͛m1~jr8}4Ӣ#B@f .`jX,N8gϞĉLm EOOvŴ(fCxx88JJJg1&YŋLar bppJ@ݹs'~_Rp9PJLLL 66˧r S^^N6VPYYMymeeBk䄹90=@@Ȍ105Eww7I 'ۇ"28o@֭[~/H4f%b1xzz2-ƖQT*H$x fi7!uz5]Ԋ!HR,v| ,+r199-͙>6X«Wr8nlY&!??[dP%(SqS۷o <iq?{oTa_T0( . 3tOUWTMLuMM?=zz{]P ȮAD`HXlc InΫn97'{l`kaaa>tOFFFjRURR8e|2jcb) ݻwS8w_h:C \.? Enn. 555ZImaΝ8pR0qM1-@ 0q0E>7n/ 8-f1<<'OD]];UƍqUܻwnYFX [[[ZM:׮.===qSSSxxx ,,[ZZ`ii;v100`4 ƴܷoۇVp\۷w^2IdddpvvfZ1 󝝝H r`T*.ͪ bbbN̢nnn.航*-(Z">>͛7aaaubZJJJ 6\.Gss3z{{077ǁ뻶 ë`4aq<==W^!)) OFPP刱ڊXe0NYYFFFp ZMX;2ÈgZ @{@XLƍ၊ :ui9zF888@$i@BBq-/{Ћ8XhmmR4R)<== [X]fwcXP*:gJvJ… HII1mBII N8 Ɛd((($?@%4P($H>@! 
F_aZ^ַHgΜAff&4h`nn@T KKKHR(Jd2eaaa #9XO ,F^^v///=>ތ ?LXCDDDe<.&2J e lQ\\I MqppآYKgÆ 8~8Ҙf\.^b1>}4TTT~:vڥahh@@hlldZ^!JbJ077ǝ;wb ˑof2b{jj wŞ={nps +TΟH!+f򑚚ٳg N׊oU[[[doX)CCC֟&-CCCjX=ӳmUm.VB0,F x}CoFoCuuk"99m-2@&i<ĥK:;;v쀷Akrr2>) rQVV\z!!!ʢϛݧ|F:f'Rj >Y @08pU-Zm=6bDDD [l O%??Vx)nݺ'''̌M)5&DN;;;랗/_/3UϾW%''/yX`5yb(..ƿ˿oG[[ oߎ7O~~غu+<CCCt͛׿/#D"-AP(_|4`cc۷o=B}}=nܸH>X[[Ņ믿Ԣطo׿Fmm-;w.@묯& -)Hׇ>{ <==AQːb HIIA\\ CCCEhh( ޽{D@@ ;w"??/^(BCC! f[B|ppp_J sBD"bB dѕA{{;ǡP( B~"`ِ`err|>?֦?K.add###\P`ϧK}bQWWc8=Hܿ׮]͜6_|Aݚn#8;;~VXd2؜ݻh˃BwU;"::DzBk>}o޼>}`2.D"un~:4BCCQ[[e=ưaafHQݻ0-@ h /|WHMMů9Ϧ}vVロ3w|mxq~GTss3!  ׷Oxxx}NgT}O HHsT˗ KKKP]v LOO㯻;]v_1uÆ ~˒$9mwnMj\R/fEj }ߞAՖ6(j7{5mG`` R)***;%5|fgg1}48nÆ ᷰ~m<@TTqzT*EQG]]phgSHP{9:L&ǣ5~~dddPEQr\uR*jVrǗ*3l(B<1999Z4BXTJts~ŴRbx6lUe)Jj~K}w󿫥MOO_OZ뙢"mMe>}:gTZZڪtuuQԳgϨY͘1!(*33sT*zrԣG%-6/uhUEkPRzي32JLL>|H}i9zI{{;UUUsIIITggkslZEQԓ'OVT49G$&&RL0j˩ae} zN@SeE}\Hfe5QPPSNo d2g0222pU{;e gqjE8rrr;޽ XVV֪WnnnZRXtLJٳhNedd@(ؽ{7r͛: رCv>^2-YEggg5ƍu2"ǐڒ?dff6~P( v l> Zhk}z5A ŋ666Ҋi$''#66vJ%󑜜M6!66 ?-_DbPh4ž={ 0-GoEHHNm>|N"Οyd2TVV:$77^^^C >{sN210 9|= 6ϟ3-Ecx<>|X6ͱ}vtuuize~ƺ`mmB2 <hcR$''ׯ_7 JR3ZXXxAĎ",MPPΞ=?Oҩ7(JlݺUg6srrpYlڴIg6 R#::i)STTݻwI SvBoo/ZĘ@}!11mĉ(--567T߿2_$ܹsG5%3<~3'CYhbbBgKJJH$bZ֑077gZVcZ dddҥK:WQQwwwF23{.nܸa }OOOFuぞYoܸ###Lj!cs bcc7'ӆAQNxdddz Z& &J999Fpp0*-ٜ4'Ԅ3-Iki}#ݻ?O׵ gϞ]tšijjY0zΝ;j5Gkk+˘6LNNXK tqyF"""fr 25oCC|||tf@uul'lق{tsmmm111Rwg}oUU\.pU}0<ٳg#-- hjjZVB}ghh۷ogZV9x N:?LK9cccNbtwwȑ#:(JbӦM1~a pFʹa18APPA,텟Nmzzz"33"7o֩޽{1<<7o`-`R?Ɲ;wdz=)LDnn.n߾ ˦7Fˋ {{{/F*"8{,lmmu#pOX[[c ꂫ+ӒF__LMMk. 
077תCi 166cݓ'OʊaeJnx!_N1ᠻnnnL!V5?8޾}3lڴ :>V^^͛7uuuf] Zdwvv 9&&&3p8u!,ܜ[d2166pnڱטtQYY |QT*QUU&''}L( 3r:t>ą  "@0\8كR"48rȜ`BŐH$ٳgvd<~ǡCPXXhogΜg:y<q`||111F bffPr<} bZb///deeʕ+LK8:݂T]]ӧO̞.ikkCcc#JD&-W[c$@:sZXX`zz)-زe .]Wٳ(((@JJ 4jG"FusΡ^vN,NNNؾ};jkkGzzχR͛q=ڵk|ô f#** vƒT*jO.H_8<`mmah*YAQ˰M1227n珁s:3aB6@LL zzzcD"]RRWo4o޼4J?Oattt#į~+cbbbhnH$۷Og6?`E@@RԔ^MLLϟ׉=@  ºɓŋ/ٳOOb4-.\@aa!qa8p`2JO@ B$իҕl2i nݺ{A.k vء wwwtuufc}rr8{l* blڴIg6uAWWJu 8vNl)J$''#::Z'XDb., 8~8ۻz$ aeecǎ!''i)F E)v܇bzlCpà ɥ;^xi$''7ba[!!!< r-퍖?~L$(Brr2bbbtj<\ZLX߿Nl 66d":eF~-شiPTTUOJKKv쌉 x3tjVLczbmUdO쁩0lLnr1np>}UUUzۿ +++\r VP(`ff }v:Ӥc\zUb6lЩMm(e>2>>hoo8 =bB T㎫+l6m ~FG__CX+hccc֭[Յݻwѐ7g,Eii)=zyyahhEtt,Dl6U_|?tL+um78MII7&''ϟkBB?Nl 3###pttsx~ 󃯯/  .v}}=Ȉf{mhh@ll, =hkkRΝ;??$%%Y ~_bjj JP(P( ǏnX,osZVBB=zz@\\\_‚޽vn/^`tt'ήC5F>|@޿kkk۷CCC냵5݋x{{#<<122B779cH$o~'?3ƨfP( kzAXXA vжm]k!) A&attD"D"{7#ccc%?cfffA}GmFHR\xhkk=QLNNbaff2.?[˗/|Nk.ڮu|ZoKK m{nuqA~XE KCb~}"m=06!ƀ@qHR,eT.\Ԅ7n1nss3!  ig?$ɂN5NޞbbCCCH$pttá˪Xnh B@XLD"All,FGGÇi=kި[ЮxP(EQ;meXtr89ʖmgkݶsU&1C[I IDATǀ 055ŘG `mmmsL*͛7=lmm1889\j,Q]0::˗/crrذa|> //9}C9U`Æ R3/S'{{{xxx̹/)JBϩ:Tclmmlll x=mKCVJ Tn٤j4XbtttNNNZE K`QYYI9YYY|cEq f |Q1 EPWVub"88_(b˖-o_ΩO]/ߵi4?~k׮DGG/[V_)++ݜUTAӪ^;_^q`ϭ7$$YYY :ZZZ`iilKk96,[ߓ₣G9GXK혘߲###cYV۶hW@mOOO +h fyfffp 4j$uYDQ}YjcǎeX,֢עNPbsGg~Rc|߲1q)Եb~-K ԵW_}|\._o3111*-` Io#:LLL`fA,GXXrss5Vrj˖*7ׯرc Q7.Rc:;;WTXmj]ͷ x}2Q lll3::bd5ص8 S>[IZjtL\ umyWjǔ XuqիWn@ V M 7o<`lI& *++lxc>Vo,߳phƈ)ك7o0-سg޾}rJrEq Wy3ϟ?㫔rss.QAAr#?b׺ ?AoXdH ]N0-E#9rvvv(**bZ^P(P]] .'O}^} ?u>p΁){:6c 0X~nQFJTTҌ.T*ZTe8H$d2 !ˑ\RR'O2-C̙3L  !!A_kk N,HPQQ) 44tU+"V EQذa!I |cΝL1lll011J%xzbݻwhnn_|JFOkq]EX"447oֺo vvvZE Ÿ_Z"//oA 3gΠ ,u222ǴOFT (..ɓ'si30<\6+!!!(** R  ׫Wb{255>ݻofaa!eî]QqKCQ>|Y5\.GCC a1tm޼BPWZ@F]F . >|`@155E||<+嬉^dff"%%õkp%nRl'␔Ĵ ԩS^ѹz5O׮]C~~>bUipb0-eEbdffɓ'Add$bccm6ꘝI1<<:^077Ǖ+WcZ3gΠ^oMuww#%%qqqpuueZ n[AX!B>DBBlmmubz@ ԱӺB ###!y&*M'. 
\t ь޸q#|̾dnnNVUj 6 iiiLK1(2 ]vA$d r<HHHPipEeh\dwيW'aJWr:P;݋0zM"r`وŋ_x" TdرBYYRY p1c0(q%W2@ 5lllq ea||qqq:@ :1:: -޳,_FNNU[[[g?cLG{{;>Dر:JJJpyehwÃiEx%q;H'''קw/a1YZZ8@֞|>(WTV*"''~ő#GfO\v8'3|||C-?[)p\9.Htص%%%LldFYEKK ϴ } Tՠ*A(|HRtѾ777+baddd?_0٩(--Í7 ?Ftt42 q055iLOOcpp'"@KB!v-[`۶mxr ?~wޭ|WW\.^z(DEE-[,k%((MMMױ%PXXv vBoww7pM˗/U o޼ARRӆ@~~>N>q>uDYYkɓ'z@ Ա'NAA4Rב#G:n޼!deeLKK 1::XhYpuddd@"0-ŨaXvQYYɴ |>s>GΰIdeeazz7nƍfK`ƍpqqaZVinn^w!wށ111033cTOss3:ĸ@X +-,@0VD"mۦXk>}ĉ8~8ݻH>. \ׯ:V<, 7oǏ!˙c9s;wģG J(vvvjŽy (-yyyŋ=g||6lиm,O6˗/!ϴ333y9r9Z[[*@ Ŋ@BSS6zGaa!Μ9:l6PVVz[ۨEAA}\.###šCT}ۇE?755Ell,߿`%PAAVd'##cWchh+{u+M4Jr޽111CGGuo߾Ş={5r9\.ۇ'N,ynEEN8.Kkmm>U\.N>7n,j>Z۳YklooP(\SVLa%(J磤z3==l"͊!--ظqVs8`xx6mxFXX>|, ,3\rUn߾{زe /U1rssQSS( C$all pqqA__x<ۋ^DGG` MNNl۶ 666033C`` v܉Z#P>|BX"֭[Յ`mm GGGD"ݻ n߾BD"BTw&'': (--fff|2ֆuF٘qqq+ +J! >|P(5/Azz:q1ݻHII7&''ϟ׸L&m޽  w}B68::k<W_~~~5(A$!<<666LKûwDVfU89&_pygϞi~}b||%%%T*SVPZZfT*EQhmmŇ`oo~XZZB"`ll JxD"All,FGGj)JrrH$lݺ, 333W 6lK$p89Z[[@???ډ5==115T:TqQTd2WWWف(H$X[[ӿwvvȑ#ˮr ɓ'EQF{/..Fgg'>9J%l6p ftwwC @(~!H ggg D?Q^^!;U}˗HD۞Uu]vE,k<mmmp88z(=~} ueZoTUU8pbccCQqI "OF Z^YY wwwl߾]+$++ /_fZQ{ҢUG(Jbjj tz山1dffo_q<"66&&&011Yp>wuՕ[ R9g5+2dرc+֥l)›7o`aaWWUf)-- 2>lf "++ Ν[U6jp88::|u˗hiiAdd$\?_(+]տ=o_~񣦦E! `Ema,bTVVܹs+kb5LA˪LS[[ WWWlݺi)Z|к-]SUU$;v QQQs7l؀> ""SP]]>(UW@wΟU|牪:rZfsΟʩ>[v+SPP"""rظ۷oSTTJ|gNÇU=ξ6;::rakk-8GﳏYURW/~++unruuux!\]]sjql6d22@IOXzJ.^{:MQUUe8qHNNΝ;#bթmllp tuum]da;;;ǣIIIMc,ۿc׮]kc-[B!***舸5 \.Nٳd"xUŨFXXغhoCwd2deea\OSS<JKKuV\rev #::La C 166J~QTTs1-C`tiTUUi<56:m6 EEE'Nqww;PQQh y)TVVyáŖbtt{Ç5̙3dtvK.^ hkk3X'GXX'kז=O 1 Agg'guF8 (--B@ddD"YW@ ;-`cfճX{{{iC (Jprrb3g׿w1I gj'NéS4R>i&OҒىTX,>icccAeZcͯ19U,a}3Ƴq";;[(BAALLL].]GRR= 777aKKKLOOJ#mىLgfffCqU 55666 5at_%[ni޺: JJJ022ooo\~]c6t#((i)z!Q*tpssVzM؈Çk`booveZcMǏ`1226Mgja ر_Ɓղ---hjjBDDKŋw.]di9Ϟ=Bk @gϞ(puuոƍXld2k̩܌7oСCZ ;g~388jHR?7oָM&@ZZ""" *6. 
+G rh\.GWW׊LNN2-@kr9::8FógpMewww/T*%~~~Cyy9JJJpejՠZmm6gkjuD")ۧNFCQQQH$x9d2N>mH]cpp>>>Z[]0== LOOSSSprrիWbi؈[nݖFM`(NQ&hhhwqF楎T_M, Ji: Rbn擶m`c ҫk9..zjH IDAT-c:N> LPK.R {{{i2,--釅:GaXiiiX62 EEE j~BoEHH㝶ͅg􍩩uM())\.ljnJӽ$L`s3E Q"%JV%Kcz;3U[NNo櫺ex=%QT % &Q9IEsy~U.S@9ӧ~nٲ* !88n2KA$alllUdϲ@YYY .0ls`>7 j)))ŧ~ʪyyyŵk1Z*(,,d-d=̵B@uu5r9III/'ӑVܼy|>999pwwg[ڼp8 {荸)ixxxĉP(,,ݻȶE&ÃmC4*++188jo7ogšY\N@kePVVF@~}ؖ1+{.Yd(..ƞ={l& OFOO ?b[H;vLyM&RcaaaذaِjDGG#::4MիWH$޽{m44gϞA" 88ǏH?xT*8233z˞MMMӧWM9X^@Vc׮]kݻػw/26-=$dE320"JhO?\|}UoJKK!Hpy\BCCN 667nd[ւdgg֭[6-1c MhooǣG\8 \.9R߇RDZZ֯_ϲﳭ*{ͨENNEP[[ . >VmhDqq1l֑c[UQx%R)֔D"#|||ؖB eE `PDFF}T͛7rJ&o~\p ǏW_}e:}}}Mӈ1{B@YYLGsU{o+nFY>0ϟ?L&+vر& a߿gϲ-@ ,Ί@  QUU^suuELL ***,=667nѣvxTWWڵkDXX۲fe׮]~:N<ɶ ˗V@@PP̲Tfb}}=^| XhHKKo[@xxnB"((G1[ uuu888 ++kO>]?߇xa. ieeeMسg]=07زe 26ZR*LeW6. m?@X 7IHHGׇ`ڊR?~M7娨@jjM,Nhh(zzzʶT!Z***h 7A@@W]D۷u **jQCtt4_"}tttj^^^f4FQ`bee.rxzz"QRR\2_&EP}}=:::pyfR H.`Yx<(I>s"6`^88ڊlsi",;wСCl˰;\|/^'Nu˗/V䧮MMMذaٖ3+W# ` CVix<Fo>HRhkkC}}= y+uPzdCP+$P+e)j>^U/"@:!tO \<(x54 uLL̂:C,1h4@BBY6׮]ɓ'WEiprrž={ؖcܽ{`cc#6mZʝ;w p,Gmm-<==W兩xMy}Y{/_u˗B Xj$''̙3r Ν;hFAAlv9IJJBRRmj֭[QZZp+qqqsV D.MMMI4c||'m.(^GYY$ R7B#J6 %v;dO%s|G8z@_:ttt&sey&ccccqS'[CCCpI-"'A||]e<[v߿nUim 0KnP(8`Z-p1,8bjqu;vlnMm:::PPP}BCCQZZ5c (+[d2`ǛsR///xzzj&S!Z!{q/j~jz7.Fzz>(py"+1>X `bF~;|L2rS[ZuL>T]...xaR͛7hZ PvqssB`[ű 7ae%k. 
//lX1>>>ӧOcǎ%/Jqm|ǫ`)l򿎎ظq#ʐ9/iiipuuEBBœ O1}-**bL6r&_]]]_QkV0"""VB BbF#y5n޼?`GT>OwAU?N v: 444 ::&oLKKKqe,W^ rV%斁)Jx<[n%Fˠa[ ӟ???@PeN2y5.ѣf;}.Lxc?~|m=_no& R޾Jfdd\tFtvvٳVPgj())\.޽{e5 qqq(..\.Q(dddz{{bΝpqqcH$!..F"}}}J  rRFͅD"0#|g/ z=ӇRDJJ @QFxx8D"A~~}r9Nh3SG*pqqABBۋ^?~|IAP6qtt2";;17A*wV u;4 B F'>س+x?"ExH`4";}#~xg>E}s'޵0xy@4+F L3;M1Ƥ ر%#kkܪFkk+uVjX_Ӄ*z8::bpuueU=c0؈'OaQi|70S$ϟGee%~=?~M6!%%.\JBVV(0d2o̳Ls#LOOODEE###/ၿ/L_5rss<//^\.wrH$0hdKBiiih4hjjb0MzLw S0=U3F3J%YKKK F|dP 0/K-j zF  0( H : h FڈQj٧!~~Hcc9Y\l؈ԩSޘpttdOkS__bGE~~>9B?+֭[ Nlooo@&СC 3$zhnnFxx8әyIDD`49;yihH!!::7of4͕NM xYqܸqY{*7O}zZ0y8oi,mGFF3}u"==} ߿;vXׇ Z' vͻηde6싉?Kh ,mILDnx{eŒ#!!lYSa35K kh4p8l޼AAAw LtdM1TWW#''g5/1a*~0ٮ_whӷtw=EX{OjiiVERR$̃=hl Q__\X| 'Dmm- vejEbUUU7  jjj4ietj#ЌIVHVIAL Bgp_(8\M;Pec۷o144 dz-gMR]]uY,сzh4#''sk pڵ)_dMXVݵE^?,334PLL I:4Mٖbx<>\r1\.JJJ0a0ai:O>D"֭[-#00raiD"т" 'TAb*04J* Zp8BWO8y"`pm03FV;vm)B4JJJ0::LlܸmIkWWWT*T*<6m"cy&x yLHgmcЦ`|2N ,ĭ[fݯvpI| yrl!̲2TTT 88x]+T;^ChJFQJӘƕ.]!tn^p c_e \V&rOE0¶mۦdC]]Ƚ aMC4l: a޽{噻iaQ""""m)V֭Cyy9~_-gMLƍxعs*^%** \.8~]]`؄i%X .OO0s  2h؞p i'Ef^Bvv6|O{MCC#G Ǭ&с:t:p8dff"##B AѠ}b ft RVĶSXXHX^o&>rtt%T*Eooݬ47pqq~?ؖf A4^z4:22^^^pN817a*k)4.u nsjW`KOOj5SeMS ő6b * 4M#** G!zƍsf<iiiJKc͍m)\n"HhĽ{pYBSS… 8<\.۶mѣGtYQ^?`f'&&mmm2:D׬Ԡؿ]=)$RRhݺuطoMeΫWo?` aA+eǏlKYH$x{OEF8ܸqV`f `^E^^ HPPcT*DEE-#G;\v "s"Z3002T*l߾ ,V'O@T,4_J֭[g}naKi͛7׳-gU322bWEw,je`JKK&}j5JJJfQXX'NLF|z{{Q\\qٳ>>>KnoÆ ݻwjʝؖ@`w"77תOZ->} JY}GZFMM T*/Kxxx-D~;wnIXq`SSjjjpСeL&޽%%%ؿ P(v>w9F[~ݺuؾ};nܸA&li|DػwisqQb\v qqql4Q[[++cݺuV鯼pppΝ;ORPQQBgggdff28 !Þ={fdd)))VRDg088̋hDYYz{{>mIkhW,rqqaRf Kѣy | :֭Cvv6 \.۷o0~}rlٲeIf8uq5$''#66B틘\~MMM0 HLLh?}}}JBVVd#ɲ2냛ۂ7 Err2JJJl.D4d<℥aѼd!))ɒ0W\aܓ-JKKeo~~~͛7q1 #,OOO4gEE!7>>񨯯GAAÉQ4&d}}}^*@kk+>l ={Z 9r"CGGZ[[^^^^=zU(BIIɢ qttVe^6ttth4(&c<""XR9۵¢("#//oMLuvvB-9[n%A &==( _\]]g888,bb"Յx{{c׮]kvuV\zu2 O>5{;EQxd2H/`0bwLL ۷oիWqQe+d0:: . @h@,XӓΛ7of5[`0NZ~~~ A 1j5={T ___رcpcxx7n܀ٳ&IIIFjj*RBVƍOfgg'jjjrC Jo޼`MFFև3T*tˑFu,˝yc T*L)veZ p80 J峖<_ TTT ==BpmpΝ;gWk@榴p8ضmۼ{]%B?CUMӸt~pZ? 
IDATh؈wށc߾}$eQUUT `_!,, \244?& C,Idd$( W^Ŷmjֶ add^^^lXV 1޽O>ݻaB4 qyBX,6';1y`gؾ}A@Ghh(6ol-V111(++Cff&rfi\pǚB@YY8( MMMhooÁhFjj*|}}Y&imruʀ`h6k{Eoř3gC;ׯg[ƒݞd25"b.]'OL[nYܽ+N:/ܹsNIIIAJJ KT۶m;upp@^^n݂^G`` )*lذϟ?Gcc#yoPo8uԒ'H$®]HʈbA 8|ݥӧHOOR%sU;wv-?ڬ'..L ]RdddX,쌳gҥK8}41z <[n:((AAA&|>|%Weoߎ[n ~~~l!,._j(--L&CjjMRHR899AV;w\ >eR&@χ`0CǏ#;;̷Wja<fZ-e~py\xǎH$h<˗/!H9 CXXnA||~K.?\Ae{(G}ﮣx8pl[f sx<`( 455n#((bxU\!&&+5y7o()h&ZZZV۷Z͛8rU)p\$''AAAB.HLLDxx8^~J"88xN___!..r/^@KK dIII(,,D@@=[ZZO?h4B.2<<<%2<>A"qqq08x .^<{>>|L4 {|r>.\/ՠ( _ H):/_'Oy+H$lٲ۷oa\՘2{:::hL<,áCRF[[ۼ7q0KA4x{{͍y4=~#8;;'TVVbxxyyy())kPBF\.۷oGPP"##122Lsssׇj (#77ؾ};]m۶1ʌ_~%i&0 !Ja4vpebxxHIIAkk묟_~%sco:FsBCC .@R!++ e|LB۷ѣG_\x_|Ex ={x{{C"L9}}}h43}F$%%A*2ۇ`0{6oތN߿?߿ \7_z>j4ypx"9??9vd2<==3ŋ 3ݟgP>T )ǽ?fγ:+2im`:ϼ7nӱ;{,Μ.]bEQhnnƷ~;w+J8N3B!J%###ZxrrrXx Z[[Y`B!ك|߿eee(((۷P(|󑗗2ܺu o߶sèE[[RXahhprrBSSRpjPHDGGD"T*mz= P1 C$1Sw^h4"88CCC0h4h4FMM S}ϟ5P(p]ܹs8qwWzx߽{LV󑟟'N 11qwGG2=zL>їd8tB!B1PףV!dTVVj chhnnn;   hT)lllDtt4iH twwC@.3H$Btt4ә"<&}4MחMzd2f4h4 yM}H$&$ }x777fgI34 GGGh49Ťt/^`ll BgӤi3}vH^f{^Ϝ/+󚚚?廏gtK24~r9}4`H4D"ht}h4pvvqDss3çgu7W0e8yP]]=<>MaW|*++!ɘߡhjjB``ି}ԗ{ƍWtNh4١^ǽ{HF3p:tmfg||7nܰH+{`Fjy)\]]ƪujx9T*m۶ `@VV֒ٳgp8k*kd5h4:v܌Yϻ|hsm:sm?yەrعs 9vK<{{)繘|w} H3g{K=.ә[OMkaNX]z\~}Of[v؁j;vK3o{K=.ә[֥Ӷ +3}ɓ'Lb~8}[RRbShnnݻw-`ejkkqm4@l޼yN3h>,X,ƽ{%AHJJihh(PPP۷#44mIkXriϞ=0222ζD(E7o&E8v޽{_s*%,!%%e4msGmVBZXa2W\ǭjxl 3)Bll,㣏>v;$''#99088~EbӦM*NgŸ>@4i&/GGG;wO>Euu5>֦b| 4 ?Dӓmy6\.G]]R) 9B-H: h4O?HHۄIPP"ś7ou]ɲQZZ<&m weT6JJJX3~^@8p_3OfiFUU*++̜w== ֆbYذaՃ0;v8 ky{޽{EZZnݺ@}`0cp8HNNfR #** K.3MX>/_~3 \.E-k_Z] ;;m˂P@@^xA@ܾ}7o/Rl.)3gҥK8y$ \.6oތ͛7T*T*VLl#** QQQ̿MBҫ...زeUnqi}W^E^^-ZDPΝ; DOOZ->CeYLwA*2eZ-w^QQQIJF\rgϞxbM a2VeVXeMKKCyyÇqF-&xfLF(O>7%l)A~^w˗&`!$J h4FBZ4qF$%%ǐd8xݮE={W^!226( 444Ǒ299x%`pU=ztF*KfY/:P(6_v>X EFF(%%% Zcdd֭c[Ƣp88q~Gd2$%%-`2e| jkkk&!99y^eOO)Z-PZZ `L@@RRR̺l`Ϟ=t~Ⳳzzz? 
/~aדthmmE?Z-:E||<ɂ#Hu{nAp"X{!L˗vu7$2AII ölX2yyy(--mrHKKCZZPUUjkkm:gƍ#l2AOL`)22,/9rJEEEx%AK"҂>t:u8@QԪ8_m;w`V]oZF ,2`8\~"#hllĉ'ؖ,Ԅbj% |P\Z Cрi!##<40eiuuuűcǠ#T*ڵ joc0}\.կ2Gբ===L;X߿m 2wC||U$ {W30Q^lK!XRp\lܸm)6Ȉݛ`.\sY),X&#t:899A "" ^|D@UUUh4 K8pF#>}Q2FkAnnn8|]d~d2C*2DOll,OOOl۶mkz {.`˅+?eLmףfJƑ?6l"ӸT*Czxyya֭N"fj4;w} GERRҲM)‹/044hr ??jɛgttj|>̴Zh"336lR잂lٲŦ 322͛U@`~D" 2,0888=bS\vV mmm UQAR-y cطorYqttu`@gg'JJJ%DX~=SIl#jjxyy!226m"akД]PH@NS @b>|g|W_0˽JhhhXޟ.H)AY^ IDAT&?׮]# ;qeX|g(((Ν;n:%V HHH}{:nnn̲+Y'jߋQqa<۶mcL [frPH^\}Ctm)B ,ۅ@X "hY% RUUǏ-ld{ ===Ԥ dJ4M/ԯGYaT*ϻ]WW\\\)} xm!Lm.zjh6p88uܹx_mIMr9@XX6l؀=OKk#+e1xNk{FxG&x@u4&LVVm8ݝ}K4ӧO-eIC.EEx0&,Vxl311W^% ;`0av_>DAAr9Rr3ٳg ֭[  Zsy<~}}}@GG$ ٳgqT*dee |2Ξ= x"󚷷7(w1Mi磿IIIJd͝3T*!, Ç1>>$vŋ/[~R~j[142וP6IlEEm# %ޏHpqvȨ>`34hlT>ո ::@"S0a BGkw{kBq1*0xNDz:wAtt]yÇ$@ ߿۴ ]ff&^~Ͷ <TWWBr4M`0@T"<<* Zuuuf=h4 SL0G$app}}}Sh4bhhnnnjdfHļԄh_WWהMm2ϗ*PTxR4~ףEEEHܾzz{z  C#R88n1B Fhl24:)Nmr\V:;w!~l$7 ##.?YF h`Xumz2.] 86`o߾],(x<BW_}?|QlmN煶\ͷf=L޾}aٳm)h4T*R)3\&]V( DۛPgm{i4`1 0f|f."lQq;z*@nc==O?!==djd%`l߿ϓ'O]fb߾}x)vɶ$zzzE `<“%&o_bmw{k?W_$X<6n܈6ܽ{T LA(Jp\p\ p8xH;1: (yoi1㦣n^h^g-~atB9!Ur,ņgs ],:x[|6JEEpyb[0+###X,NacD@@lbyA n| ý-P B{iBp|7!l~@ ,LJwp4a41ӆG!9. n AoxPRHWpш"!??m9fg΃H* (T[A/PQZccv;ލl"~vbfvb#;=;=;3v !( gqEPwerz^UYO~2+O=Z-aZ$&&2ƪÇf:7<Vzz:t(A*ر0V!++ }N:t(vC&O&tww#7703Ξ=/ٳ 2 FWWZ-݂l6l6 DZVMMMGjj*rrF,h4ܹs!!!ؾ};aF|gL||| bfbΝL*bl޼pl\.Gii)Ξ=t(x8w._3gάJĊj@GG Ŵ\.k.ƺgث0tvvk*aPUUJ͛7[Eg.NNNCSS[8qb]$eJ%ʐaP(\.;ӡJKKqAXuVD|7fu) ܽ{D~/̙3r%{' lCCCqu`CLBII VCV#Glgll,p8p+ #??fػw/ܹC&jAQ<=mSK@JJ͏$R7o/`:"Haðj.\KpiZc& Vd l66mڄ2)T, E1A,bl6@ `:UϣR[n][~LNNq0h4(2Aذw0V Auu5W011B|d߮F1p88<.]3gΐ1V\.166Ep;wxl XZDǯUvv6S:>} D3gΐ`LTVVbLB*H$ӡ LB?|.^"\p$VF_,ٳgqe;wn*d0ׇ>(ijݵtaǎA PUUR WWW:tnIv܉oߢ{] ؿ%P(aJ***j[Myxx@RAѐaW`ŋg_}@'Ο?eL&۷o˗/… 6߂`0ᅦX,^_p8"Pqt(hmmE{{;pҍw Q^^XM6}p92 ħ~ھ"00PXw'cYup5@׮]É'HSU҂hðYNNN((([Bh4=faaa̴$@Bt B@LL IVL\|~ɄǏCO?q>Ǝ;p "܌p=`[czLWLطoB!ӡؼ>l߾0l@ Sp%\Tr$C{ɓ'سgӡ6x5\.:D&CHyyy(--ѣGW^%?}$???ߟP@zg:FD($$555HJJa:P^^͛7cƍLb7)!a+WܹsL3'\BL^@$1aD"***ȔG6Q( CAA! 
ggg%uFUURSSɏ+d׮]oq)C!e/ FII aXj"<:Ÿuf{{{QRR۷owޅRDFFݲؽ{7IV'11---dӧ?sRGLcV@ lJłNCUUzzzjQPP@{522PH jGFF0::JwBJhiiAxx8Lvt1~zudP(D__}ZVdee<222UFFFVY SGɄii'VZS?nwj{!<<|{[TիWP*H$tٖmx1g*f],>kz PlCrr2ա <BpֵeqiR8$ ORcףR(ʶߩyyގTa>ѣGE?LJ"+s5ax=c-wm6HHH`:Uw;vlI*J30HNN^g@CCd2V鉔n>Jؾk׮رcs;99^[LYY222DƇ=4WݲP}aR)-;5Z.Z,ǹe\Klٟ۷o6RRǏ!J 6ew}zܲw3Ko9ל>u\y~wpssEZٯXWl2v•+WpYCa̓'OGfE#T*EMM z=f3100 .0AX(DEE$"mR#݋h s,.t`$ &&| I:T\\}0e طo@xT*g͌ݻwe:KuX>J/_BR\aaa@ww7=u ɬBqq14 on^k0>>ǏC!11d:$T*;H$hZp8DFFѣrqq^^f|8s ӡk6۷ˈ/^ 11BPQj b! 999+~:::"..^Ž;Vlg!!! ynyfl۶ҫW ! MZ.h4ү#11$Lvv6nݺSN1 Aإ˗/#''\a7{رc(//GVVӡajl޼P)F#iZ* ^BHJJBjjꪯ722n݂R$SX۶m>(**`lݻPTHHH I066zˏdZBrD/]w a&шQx{{3Ί2Lx!d23ƺىFla݌ǓW VYhh(=3hDuu5jjjرc#hZC!K# P$ɦ|͍qM01RI2StuuZ~~~8q!b!-- UUUعs']rHKK=66GAӁf#))  Fi!fo>CZ\Nbbxׯ_3Whllۙ l^}}=L&b vTWW#%%PVDcc#lWWWC!011OOO`۷o;|depp0^~ F'''!9m& DLL i@,ݻtHZFWW`0fXjk2(\v$b@&aLbU.7o"&&...LQT*z{{51ibbǞ{=_ hq(**B~~>ӡspp@bb"ݥd2wܙLll݌#hsd2#773<< M|ْƁf>]_ߏJddd0 Aؤ>СCLbu.'Ow}gz R>ѸTADDiY\.6oތ&2ASllٲ[l_hjjBmm-۱qF&\1="""`2Y\]]~'IA*bppn5ÁP(mFf{!֥ 6zަY`^~Ǐ3U˫&Fll,jkkt8˗)6BӧOh pq>&pUlٲpXkVESS^zEb6i&lٲ<o|^ryN @)~_7 ~_ iiik2+jH$(0ٲ`0'V(//ѣG lpICZv&gLqlbuc~( d20 aL&>} T 777:tȦ\(..LA|>1PYY V 6 6pjEBQt:ݼ*HC%fBG2''.^kԽ#]mrÐH$P(( , El6d26l v. T ///!'H3Hv}>q b|q9ܽ{P;2<</^(ddd@$1ҪpvvH$X,ƦM 0kzXшNd2l6l%LQ ?pppZ)lAҏJLe2QP%P%.NszloȊcccT*l6l6ϙ Dضm 摕BJI"޽{pӡX=Nq\DGGL$/_DZZUw!d144 6 //.]v8wӡJ\."""1uٌ_W+WP_[~|gZ2Lt2hNn^_ؒ>P( JpS xyy!!!v3vA6ttt`LCVidd?&ɟb غu+]m۶3΂4 FFF~,bCee%X,ӑtHknx t(A! .?!D"+ Zgeqx*3ã^䓳LFdA~Gi4KK777AAAeqpp`.ٶ">>ׯ_' CCCxΟ?t(6@[nԩSLb8q0ՅZ8;;#77׮B˗/Rt8AÉ'==&b]} ׁ19=Au;ypsE(\`LPku0L?ouD"0C,Cӑ:bU=b=Acc#I0d3H466bL3^ ;*сk5?~}'L:j P* `0eF#LT* 6 Bp\r8;;rNNNl^JE:T*zeKV@SOQV :JhjjBss3*++LKs;޶#cg^տl ۰W"F *5L&30HQ0)oJaOx<t:龷NŋHJJ{2! t# 9s0b2#( $''#>>͛7LCKbIj:1/˅hEQl6&iοf3 \]]Va4! 
hnG, t\pppfBpֆ2NX4|>, EAVBd2AрfFFE 6l˅Z@ H$Bjj*~"((fq<>ӹf-Y-\?w+Ozd&!b^~eee{e:Xך0::Ç3M" )N8u˗v9 :f3?~1dff1"K.aͤ Z-T* 4 j5&&&6Mw-<,{tk_-I KC&a||PՐPTH$t, zl6d2ぢ(:Ap  www",,lZri.F###D"G(Nz h4B!.."| ]rc]PɇS)\bܝ4GWϯAco0<cr?LVK 551cIְX,EQ0+X"zŐd`t.vlk`XpM0 A1"J>̈́utrM7Q|>Hw1;Zv>Lkd5@@s8q2e`KXV444 == xxx ((xh4T$B^P"44sPa^Q b``~}ꃐ`@PPTbXp 0(ȆaЩ\tz=f wE͕p8 EhhץR)1>>N+ O0z9rDZcǘ GQ^,lذpÍa֭FJJ cq㌭^* =<|'Od:j鄀\.EQ`XtDdaXpss7"##M T TJę+5777puu]hW`@oo/gu2ppp@@@ÑP+/luo{@q Y&^^^NGcc#RSS&qz0%pttEQh4G^®+UI#::~-W(puu%b6#(J.:<@]]M=LHR(swaظq#BBBzNHRLLLcӻ *. ///x{{ojdL&Vr DJJ&V;d2c\]]{i)J455J Vd,!;;dknuwwӧ8<.,ĉvΞ=.++C~~^o߾ś7op!r󾊶n݊[nLR.]$eM6[la,NKjf K7+N777xxx <<|]$֒`@WW:;;rGDDI,SRR^zf5 B2 ሌd(JbHȺXfBTդaw pCK$6={Ç8pwtt"\m^2ٌb^ktΜ9c hX,BBBpС5kih0<=t(j$ >j-[jl6de~X,VcbϞ=d\Uގ6UlFPP<#rDGG@squu޽{OLLիWtB icH0땕G!33PbŘf! 4nXg2$&& ZǏcϞ=+^.HY H$я~"+'11_^$dBuu5Lk׮%'cr9:::MlpqF.YkHTݻwH$jr@vv6yrŚ6c=qqq8=Ν;\Vt^P(LA++''Q2۫2H\.K0*˩d"c 2xr9l6RRRlcicXyf׋P$$$E rrrRd:uefBHR)d2!##-&aik5(򵎑2xѣGq޽ѣGdgb.(i999x"rrrl?007oޠؿ?嵆J.0L!IbEFh!h4b|>~~~HHHb8ʵe06@ @T{uOVvBPP@@^^^hnn֭[?FCFHa˽WըEAA*ED|g(--ƍшz 3o#00{&H"#GȀĒp\z8::2Mrƭ[`0={à b<ҏBNӧXVܭ~d477c||>fw0T"{o߾=7oZn>?W3_^ijv@kk+|5̷ ScY3Flذw999ENND"޿6yt3e }gm3} IDATgorg-n޼9;|>(--wʜ>,/^XֺZjKٟ ^|rr2dCAA>[mÇQTTܾ}{>7ߵcÅۇ\ 3.KFFAaa!\\\G?VZ|  L燾>!##eee( bE!22RrHMMEpp0?@ @$ ?q%|\r^^^( 0 D"\.Gmm-~_PTعs'jjjj#** fvB[[0888lϏ&GA*ݝm!!!r( ---Egg'R) Ξ=KBLٌ~dff"88ꫯWbݳ <{ beeeݻθpX,](B,/N ;::"11\Y… v޽{P*(>ҥK/QQQT|0LxOz@"\.bǥKm6d2BVo墿.K.ߓ={_E`0ꂻ;FGGW_ݝ7xirp F3gbxc{ŦMLP5ku:ގ@  ""\.è˙oggg/ : 22>KL&l߾}Z.\7|011ӟ>b1c믿Fff&^zH,\.3ʵ|= {@ff&N=,uc/AWW!HT*m+Dx#M]jž={lj`vVfHOOݻwAQBCCAQoZvsڵkA}}=~ɓӯ < lsOLbbbfo5ڲ]RvDD>|m۶MKjJOOǫWcC!Onn.IX9hl߿ZlS_B!P!}˶1LDt\x-VA7^VbA`0Y(,?JKKq ŋCnn.rssqaD";Ξ=r:tǎqddhii~PD LkDz,_TիWt.۲߹JsY3c)kw9oD/J1<< X<{}& z,ǃe_hZ888B^GG駟I@}}=$ɬOLL,ǡeBRNgc^tO=,ud2addg־D"βΩǢe}Ysqr?xӾiǀ%I<8ץ?zZq)ȵAAAbbbPRR۷ofZ]z(JH$>-ms:MZ \N0W]?sUUUηhKY3~vF]&X+EEEՏ aȄ+ٳgذa"""|OOAs;A%rY-,wO~z@ѩy&2&oX,ւ$jbeٌom.oL7 -7|:j5b`lٲe㿈bTTTbܹs .;WBBtub\Z{X[}P3=yEMkM0EQMRb[Be,\b_J߾yr붩ϵ-m0w2W,K1:PZZq 
ZgϞAрaV76;wpQeX0L``s`=  ?,;s3k?t=kɓ'$1`fiC?6$VإK?"8"?~/DTسJ$$$uZZZfFΥT*GXt((( CE۷oGHHVjZ& FE̅(C.CR!--mD md;wt(l=hPVV%7l t|^'Μ97o̙3KZ~ >MP(n<pUK(JA.CLL̴.jFGGQRR|!lƘARRA?LT"Xl6F0;V?6$VŮ]PYYyFF0޽͛\$׎hD[[:;;j;vi$ݑ&|7ƍ,b&?ӡ64^\.}/GQ^|Q888 ==B [62Xxv؁={0Hh@3=GYFOX{hhhNC\\lth"66/_ǏqQGy7&/t(p8޽ߒAppp@ff&5111(,,$ bU ܿnnn(((`:b *ٷo.^9[X%le ƅ %E,Mgg':::ػw/x<ÑM7qܻwzYYYb fa֭RZPZZJVhj2HTppp`8BVL̛ VZuu5zzzCC@O?׮]秽NQ9 FPPaX%Zׯ_C&7** [H$X,6ѣGa2PYYa9rnnn DjZ-nܸ\5p6\{`0`͈e8:ڥ iiiLBؐ!Wm[\PAmEYdQ}evPP@Q Oa'Of<|*Ν3 FQQt:!ɰk.2mĊÇO!p82AYY`hhPT(((@pp#8#AAAu^Ǐ?V3g 5&88AAAx%>6mfgd)OZ0_Fgg'hkAA|||lu'Op8())ף LBXm ac2N@DDz=?'O 탫UhHd:AR94vsЀa>|d6 իWF2ѧ*RZd2ldff[a8HR<;vhc۷~:x<Kˤ|>ql6>|RdDBAAAV)x{)'۷hooGvv666d#N<}ހA*ަ8,.Dݻ)utttMHH@BBJ%>|BDlٲe u~(7m6E;E(2foߢ\͈BBBe&̌ 8Ո]A<6bɓo:ulLLL ޿O@sh4x1>|[=4YHOOGXX.\={,( mR*++T*{n;1L˗/̙3Hplv؁dgg * O<RRRV$tułV%Ȅ%Q(w<==BZ777 !22lHXXm@kb޽IA@@>3֢vRDEE!** zhhhΝ;F;èǗ_~IDw Cxx8hQ^^www[V:'??ϟ?Ǯ]m A( =H$G}aT KC@6&22J###9bm]EMM t:I8"CÇAZZUF~~>0"3(##cYF끖H$@ 8,IIIHJJ\.g/C!mAŻwpq{Cpp۷/^ĩS?a# Q__WWW:tRZ/_ĵk PPPvX(477E!==7oH$Bmm-$aA222L66L:&<V&8'D/0V //#8l ={,.\/¾FlF{a 8v؆;k4( A"ZVY, 999`p.UTTb YӰ9o|7غu!ׯat[ۋrxzzΖl l۶ަLgg'^~[f²! t\.p]F(&؄W^ iii&:3i`gggT*P%jxb1 <==k.TŽ{ Fcos%rh4$J qqq ݻZAFtuuիWHJJ"R C@6dnpDEE۷oe[\OOO{& d#<< ޽{cf'|>FAuu5JHh܌&! 
E⧟~c")) o޼AVVM*O>EHHy)ّx衡!TWWD 11&rP*`HJJBJJu:z-aq r"X8Q J@ p$/// RhZ'\]]QZZJ"C2܃tZE9.nBDRyf,;19@cc#I`5سg.]`xxxi&Z۷Ǐ"""a>Zsf#** qqqpww_LOOOFL0d۷e,q4Y}D"@5A+:fOMx"(-6 #"22)'`8x n߾ *NhU :1-B&]Ӣa;4" *pl!xMpl,}gggƍDD࣏>۷oq%c+Kq}m-/VN\J~ݿscJYgm˲m)JBnl׮][aֆJFG399&6$FT*BqS"Nqss^pR)*++q,=QXX*t: C!11SSSxPSS/~ .ҥKNCii)4 ZZZ 055\r_}.^\\455A*_}1993gΠ  ɘ!8)4 >|D6ZT^v D"4<<<'?A#""AAArBlݺB:|>R+' z?f|;2Mt:=|Y"‚1Aw"Â1<&BK&04:9xXYi ؚJQY,Z X3m!`{ZSS/ipܸqcv#t:\.8Fqf?Ri T*T*Z|twwC  ++ Jؖqt">& J32ZA7RFtD(roL IDAT&ajFh{pmTTT hkkC__B!>|`3&WbTWW,;x b1\]]̴[BfS^7777::l;KvB2 , * Ϟ=J%N>)`b8vL(ڵkL[D.t) KmH$ӧOYMm]C,ԑquJP(4bꀞ:RA&f_޺iOc񽔒|6YD"h4#sM~ffL 3`p]x|>^M1}MNN2߼yib|xn*apuuEQ eCDX0p8hV#؄Yܼyϟ?GVVΟ?{E؀($f*++QVV}.\3gΘM\[[4kHzwő#GL߾}{2{BQaXo_D?XG0GFFb j̅A3x_s]vEڵkmc.e3T*ATT>9YH7C0w-Q pCZx߼U RJ7T*Bwww<M<듇"77wY>+**pKv~KQUUBx3^fd6.BŶ[̵126|9/Vׯ#77Kno|<`W/t-Uos`(^Zo|Χ7w O}=BbΝfϝv,><D# <, f#؞iARdЄА)`6`4>?OEΝ;$ԉhkkz$%%Y#RZ-\#(('Je>x>>N6(\xǎXGpV(;wV5(,,\U?uڊvKRnfOS\xpYֱZ-peGX=4 wDHHHXUyyys|||,wqq@ 06mڴZS 6 55<0qA($Jq5|VBX[d2144V 뙩=E1EvvE62 mmmc!//϶Y\w?ѓZ|gtϖC0D555ahhEa&Y;x<RSSKJ˗/Mfh[RRҺ%#\uR)&''M"?G"+D$" lk !駟~@nn.ki \UqqqCmm-{ ~</nkr4^PThkkB@jj*N>n%b jjjdw J1::yCGXX233m:bJGPXX.3/@ll/Cqq1È͛7T*~[8>>>fxR===&SݡT*d-ҎkQH+c:\|NCSTLQ``ຊAkk+fffڵ}@@L֭[( عs纉 cèdjkKWWq9->i{.xj5G X rR8Ν;/DĀ8{qO?8:Ν;HJJ͛8 1qHd2xyy!66{{k^V&uNCvvZ űcǠcHR$''#%%eM뻠sHbpp<==(7ƬFژΣ|hZTVVB LQ4FV32hX,[JVD"@V3/33\Wݛ6mbt:|/^EQFFFLZ-wE`` >s{(Duu\.%%%$) aC@N >S\t 'N`棇СCǏ)+T*ɲtUTT ;;{fSTL,Bh>eE6mݻr0Dy444@V ++k8|B;3{0EEE6us\Sh0?%zCVZr裏022+W ::۷o7$!H0>>/_BVIt EQ&@``  3>>/^@0D۷!#fԄ[">>ΖY[b֭2PZf0LKKKs*'JByy9۷bc322h4ڵ 6@丸p1ZϟǍ7τճV#8عstRlX, ǡ@QX,ꊈ9`!k(J|߿aT@||RD`` lXe#, ٳgq 3S|---~:;T#0Ff#EҥK())h4DRDll,222jhnnH$^G||>Zb݃BCC_ 455*Yt:S8("""Ltp zYm/޾}˨bjPĖzC$EEE3@,L]s*++m3/{vv7oDff&lb aqh1a#(,,ʋ(ɓ'R'Ř>h4&;z...@dd亍8PLo777ڵkݞX fQRRsA B2dDGGq___j4vqvv/^T*ec6mȁR Hst:F 4d CCCH$&S6{6mZUDk299Lu&ԃ}EQرcDz.j BpnA5z{{9ttt0WWW4_xϪZ-2 bhX,p^:MMMDpqqΝ;3 SJVÇp8ؿUTAoo/MNE[/"88xEɇPWWh 55u"'e2:;;166D)xxx ++k?VB!B!FGG"f₰0# `M >}'O!W__$ue``HLLDzz͏)...>G?^| L۷i}8_[T0 
)Pӱe`X(..v)3녨(<{l/˗/fîR,BLLCh^ϟcxxz{Enn"WWW`rrW^EDDĺ/ܖ&r"22 t:CkQRRߏ{1Z;, \. @ XtX,D"X,HMMl -[,}x*DfH={׮]CVVֲE%ɺpOTT^zo}DZ[[VWeoq뙑455Aѐ)l쯱uLpp0>3ܺu HNNfֱl>|ѡpssCqq%)r|Ueܾ}VYtĻwH.`;::Dww7\.v܉<{Dp@pi صk] ٚ166̇VEdd$RSSRCi.Z!@stLL&,COOOnnnPTE@@c||0_)44111~r\={ǁ-[ %%GvvאJN9tuu 1::fz6@p*hbp1444޽{&~adѣիW={Jhh(Ƭf֭[XqYţR\T*>>7}ن"** }Μ9c"gZZ- ?#x<mo>rܽ{zV}Lrh4*ɓ'  t6)BDDzBTR)))8}M"p .]ѣGI$!55utׯMYhꊄ$$$[755wޙW/~e7pyTTT ??j{:u*eKoo/^~ `B:hq͡C0>>r̛;v P*رcǢ$&&2fffѱ޽{DQQC=ELL QSS@@VRzCmmm3RhO" B,9TXXغu+\\\bL@n!'O]u&z\Aף`X#NaqGs@HHN> P+W0`R8qB{TބŒsL&3Ḯ5777Ӌ#ovݭ[-zۚܺʞnc-9-}ŋѣGQ\\7o2뺻;l[n12^Oc,kj>sH$&s'|?O8p޼yN Bs]3TD?@1d2lݺ7|Fss3sq DFFbxxJÌ----կ~EQ(--.]Dk|׈e?ԩSO/_,BBBhPUUţG8^._f7O?DT NƟ'zDDD`xx:>>>hllX,Dl޼^0`=,4 fgg4[ X޽{BCC?ꚚDGGŋLm SKR`vvHIIH$8"++ [nŅ TӧOi={/ FGGL[RZZ˗/eeehnnruVsۼ>|L[<55x= H@Qq|wLݝ?y70OLL0m(<<<ȴOLL0rq TUU...ƚܹj---EQxɁ{(J|aݕ+W;GFF Jd\RRܟ&H>֭[}=kr122”o|?={/^ĉ'V?ѣP(ą D7nW_}eLJ9o.x0>sD6Ba޽6@ؐ :|&B Lx^:@111 UVVSSS(,,dIRիWF\\6mڄx<(JDGGc|| AJJ .J=Df @ |>z111EQP*7:;;1kRvKK Zkxx<6嵵9J%3I7}Z">>BYYYP*zc[b1c۶m`꺩 gϞ_ݫW000PXزm6FwyyB!#:RW,bꁾ6g΋>h@ {D.|>|>s<ڮvfIz( 6mb]T">>Y.H011OZ|\.N:!ۜ 7tQgb۶mFMM ?WwAKK r<X WWW2m7Ǚvun[!Mv|N?t-bR3 0`jj)ohh<L'>~mL-)\6^[1pss3y'^(}/ׁ\.gqIhOfC"' }PVVGb޽c"n޼8krU3d|aQL_i\ ՗fw|Ν;<L///fPVVMMM&;ݗ1~${'seϜBQPQQ tuuaϞ=(++lj@#,8ꌍattԬ怣Ԅob߾} hB:HRaDܺu G0߹3Xv۷!}{[[ۢL)EQ52 B###}EȴN737^PYEڵkmKmKmC/;xLg{Xr˱sz{ob,:N,# `}99g۷oȑ#U݆>}#c|t: ݻwP*&Cll,M"zjsۙ{zBaa!5׎,,d^Guu5SBXX[hwu̷K|M] 4 29'9;[˭KLLL%%%7nѣGp@QSbK6fgfϜ8;qvvćdDvkEl  L%LNNٳg î]CKK a Z1X, ***L^G]]ljc!(Sd2^gqSUVcxxCCCRdY,bbbCڢ Fee%MIK@sQ(x9sB*XZDEE!""bm)Bee}+g^ľ>AP(ݻwCd$ 3|->&!77UUUɓ'- c"Q޽{7nb!11kmӒ۷o#""bюH$Bxx8Myy9qX^tttᠰ60J777{P钿{~=tj$ ?\@`#PnE8 yiyzC!%=r|||099Z(DEE0pΝQsܻwϬϋ/b zzzJu"4abb7n܀+v޽촎>>>8|0ÏUl)Nî]:$k.!88xjׯAAA())!\0LQQ=z D%zC`LԐVEDDIgvLOO/ej-b2nt:466B,f###Ffx<д12ettz=PVVF8"88'NNCCCe ? 
Vm\OI&jLʓ'OpY,НaQcǎ:DYNQQ߿.8{CphJa!RP7FLOO p8f9EQhiiAss3 pBBm:x jjjE)޽CWWt:X,Q\\LE8V ??sx=z$''#((I 455ܰ}vP[^WWFX5Z-DRR*`X Bvv6lٲfNg777j&Zpb:&>(--%q___f7nFvv6WT&1 ᥳAT N7;;wޝ/>|'O:ǡo߾˅ZFTT;fo6(ґ]&'Oŋqԩ AD \.2Zof5Wo6Gr%%%IkC#++kͳٳO>Eaa!kz\{#+466(P 4aM@FFCg:Nrd2<{ `Xxرcip8t:xyyA.d:RTxv~LLLŋd|uL ,3go??g`SJCuu5X,Zgr~"55a0 Y`bB,,DZZՏeOFGGd2gkǎL<@ ,qlBU}xyyd,E{{;GaA|||Bnn3Jxxxhh4477 –-[%7ݻ P={KaqA0Lʚ@ F&8<1ޝG}wRn! !@`m3 c3q%ff2ny7[Jf+UT%c{d\6c{l\X؜F`d@PK}[}}h@WQ2vqjzϟ? Zf(>GRR҄8|ϟgd֮]Nv/_޽{ٲe /vNQnܸA]]=h4OPZZq:!d" \rssΝ;Guu5Ve˖=?±cLjF~zzz(++c֬Yu[oS7$$$p1***4actvvRSS( ]Vb̛7/ٳgK1!=lO ;Puu5_|:Dw!~Ǐrb͚5rGF)--.]&!!̙3ef!ĨW ˗/Wo{<Ξ=CQ(--U F#//ϴpׯSYY^MȠ|L[޼;PMM͘77]w ʗ-quzYIH?1/_?ooxwO .;fsqzr6ўg5\8c9EٽEΝ;\vH$.6mSv"!S|@kk+---~nsE~ө,^xħW[nq "zӧO Jӎg ~D(طd0aN$9-̂yŏn+%ŕ+Wpz O9-vcǎ`2ǿta~ E((!] q&1dbÚd`w2guY(ƍܾ}[}= M… Ǽ#1cqo9WBkri^LxYYY\x<qpB222X:9sőϫr~z=}G .;J42zI}ًDBx:ZtboŜ%=῵E۷ocĖz)++R31ٽKٳW_}U(BKqqqv)zܸq_GɖMw{]|e((=z=t{57[OoNƒU@rJ&YNJSk]nwNcG ڰ7\h!њNrj6Ӌ7}]̙o>7B@ @bRJMMg}`ڵk8LNNawJKKc]pN=O-e'L8؃ބބ I٤:zmmm466bzStl20S`l6HJJ:Jrr:ke.ߟ?yc+9dyhtR눪ppЎ"v2q $$`N%ђFt @(Q:Znw;{\DrldMg[=.py~ p͛$B&))%--+W{\zP(`PgD"$33g'tgΜOOpx.{#evuyHKQyy `VZ̘1e˖=R!Ƌwygy$ܹsl۶SXXgپ}l;nޮ%..ٳg{nk_/>vf1#GxXjUs39FDQJJJ\~y }RRYYY}^K_>te4^9͖g]KĄx˛;½ D }>4NGr4] {Jd]99:VuEQxvg+_Ⱥ2}aeBJ @BArr2-bѢс餾^+Ŝ:uXtakn_:F4NE4ef~]n/_77?NG83,yb.}x+wK26φ[ww7uuu̜9CvD"|78N^/l6"ͬ_gRUUEEEӧOɓ455uuua6 i&>#|>崷SRRnq7mDuu5v7//?֬YCyy9W"`4լǏ$,XnFnJii)v'r}v*++),,~ 1ɤX*F^$`0l}Ε"_]va6yp:?o~^`4)--}|?kv$nvinnߦ??U_?1bڵ~Gt:پ};G7j IDATEs f3FQ}7;wT߃>$%%vT;++;wL:x; O/"فoS[֫8@Ey<=xN73rAQ;;I713?WEB^Jqa> (%IbύRpk\?;+A{o32oLJ r^&?I!# !!>>Azܹ+VU/ nO0U76{ٙi.,fz\!g@*`Ek`hS?|(B!iiieӦMܺuB8>}:.\ Ljj*B!v;6mŋ3k,PE'Iuu5bŊ>w7l؀dRS|Á zzzh4 پ}:jhAFF;bbGjvC^/ɸn E=WsvSXX{ܹC||ѣlݺU](^۱j(bǎW#@@=FlJ4Um~?PH$b@}F"N'96B(" `(..l>9Yڜ \qt:PK1‘ID4V;Hk7{a8!pJvg'ϯ\sRb&'q{|(Qk7ojDQRdf! 
!ĘYF&:ZnQC׉ *Q^ݴN.S/0%EjcNd i$&fЈwwL4UEnCa?tf͚5L3~^ܻ`cjjjHKK`+bxb͟?_}cSP^W1Gyx^a{=q{q{n Xlِ+`=W޿9q?XXU.`'vf0wY _Lٓ%LqwZN!Jv$3K-LLdHQJrj6n瀃ƾl{t:똝 =Sl0w\YLz~Ͻ?@Z7~ `wߘ^xݿn̜9˗/Kh8N Nq~`Tg`﹘?RRj_'=n𰶍=C[?l(zLf+d Fm9 q$ 6J^W=RH+BgRb%Y3["]?nNE10[3IJfFJ/_|l_ W`E!b6;w.3gzz}z@II Zz2](B`4aDLqf2ρ齷%݁ي&&yF`dN&B9sj) 7H-"=_.;oĎ795"b$9|0 lzE@4W۰n):$sg3bg,pEt$$$`4Yhr>pBA#4HNN:ƤFN>KPSNǜ{BA?]FWE|tśILNŚ=Ǜ.~~?L>˗墪 ϧv[`EEEZ= 1Iy^ul1r !BM @B0_Myy9˗/:ΈX,XϲN>|0ORRf-$|>B!HHtiN';w:ʨ#//O ҥKj4HLLrq)~?0m4-[`! H @C) !BL~Rb{nVZ@K"..z2ɓ' (BBB+W$33Ç|aΝKYYFDFeVČ31i B!/$cjiǐu4֯_piЂ X`ݻEQ0TTTL.\~#cLJPIB!  !CMM [ZGF#eeejKEQz*z̛7|;F d2QQQ2b_z6 6_פOSPPaR!&ztvvu !B@ @B ŋD"f.6Fvx"|gtuuskR,c!B1  !&6mڤu)Iӱxb/^ g߾}_zj[!qtQE2B!) 1D`#GD(ٱc;v젾C@zz::e˖uL!ƭ.RRR!B!ƀ/m۶iC ??ԩS8^~e]Fuu5XB;>hTB!ĸ"L7o2g㵎"j*^/}۷oWgl>|P(hgŔPE@^B!ĸ" !ܹsZC̛o̦MHOO';;78u@@EEVUBd^CGA]]>1B!Tz1]|ׯs%n߾ nW߷o{w(_o#GpU??? ػw{sr֭!废nWϾ}\?ϣ^8x\$%%a6o /sسgǏt9?}}=w~DZo߾>#}:D"^yn߾Y|M455a6tDBB1D"Y~>gQ7;C||}z\ op:tuua6yسgZh4; Xnw{ dvD"oίkJKKl?;wH$֭[-PXXɓ'ijj??gEaӦMkNgg'L>`0{m6v ~Jww7999B!>L$!;;ȷ~˼yPժضΝѣGq:\."/xF̘1&"={á>nεkU_s)F_~~>555t:l6v'r9v__IOO'))7o Hiii}l߾x={Ǐ `ill[RZZ G}磼\}ݸnyTVV>9r>ܟѣzN8lh4kΝ;￯>nۭ~?s_!b@B h4rx饗ޫeee?~J(͆匿f,X项J}}=)))={6l؀d21|{vb8/\Gq]]]HIIСCt:vj.ժ/Fݶh4>ױrI{{;'NPwCCzbY윉F@LΫj%p:444F#sQjU}Džl6^>R?CoNggz5kEQM.ϑ׹?S4%zf˱}~_WWGOOfy΢B!;JKK eeeZG[a޽~~Fzm7Y]GQEρl@޻],w-ìYFAߠ?ؿ}]~{aaxHIIaCaQ5w縿cGQt:݀Njp8w{t{ckj1V/-[ v=(`{>/^/a{}=qy$/^c!B<-[_carݣu }*p8' 2o<(T%%%޽[ @PSSöm۴!BPRbL&˖-c޽466:fΜu _D"֪L'&&( /&;;[BcEEESXXuqbZ!B1() 1L1L_t e~ Bܸq3gޙV\IZZVQZt)w9r_|QB!ôh"|M{= i?Au !..|>uYbb"eee2&$UPP  tuJ!|cb~5Y:Sgqw^a3OrGiaBX,Xϲ>LaΝ˒%KԖqUUU,]~á:8!}hsO xX"O?g}&8|0?1B!D @B p00㥚Lf+$&[d5((..ƍ̝;W(㎢(B!"B1$R>Hz6]NEc(BOE.J5$ZGxB9s0g!h%zAQ禳\!))%'jbɒ%P?[V^u !B!Znw1=vOO7g $ZұO#oScC~2ގV&qF%.wJBrscJy/^uq#EFFQB!L/ܮVMjY<9VuH_.{Ӑ%gZq\}9r={`ݦ嵵:tׯcǨ֭[]Luuu}rߛi߾}}ֽ屸Mtٛ5{i H_gs~O#w())ʕ+ZW~߳vZc!B bΜ+/TpO/d+_7GqUeR{Kr O/"o=l6GiJYt)UUU[N]lڵ'|˅"_]va6yp:vn޼I$ǃh>oί~+"FsY?Nww7III,XNcc#lݺR>#|>D"v;nf~m*++?SN'1b2=z^ω'0F?;wrIxIOO')) 
Mee]YYYܹsg>'Win(pUMl|ΰ䉹ܪo۬./I7z\];x~U%E#['NPQQuͅB!zzzZZGB!i$))ȵ x}ݴ;H2'DN)`iL"ɜ \@CSy2~.pDG4$''vX X,466(),,(qqqa=Z>u?@Bl6^NGOOUUU444ؾ};0k,, ǵlX,(hxIHHs:eFa^/]]]l6V+B!:;;p`@ Tbnnnf Dț IDATQl&Sn "dA}c+Nds~O0ۯl'b}2ޱ9eggcZGٰa1B!M趢VZZZ(++:ݫqܭҺQEAӡ( n6PRTGQA:42bHN8Aqq1999@y {sPᆱg0f͚5x^233=Ö{LF;@vMAA˖-M8[s/YP<+8qH^/Sd &3^!=֩J6oެu~;ƍ"tttp5VZu)! r>-ʇK#J(qz.'Km3ILĒCd~߲ccLz-tx;DOˑdi$$U8S֑&J.\HnnQ4W^yQz !ƇH>ɓ'7oC/=B?d`mtv__H(0jt$$`fMzlzèo[r%G\GYzlf tw]ȸ3VS:ɺux״2:;;IMMYBq#  si l6ycB~$ZRI, ꡫ.ގv6ǟ5glEJv>i{bhf̘ӬXB8S)!dh pۛ]t{<](xSIf`f=q`bh.]ʙ3gX|Q_Ͷm۴!/Zǘ2^Jqq1x,RbƸx2/FFݥp;O~{;[SM X *)i$!)Fv*Q5g~,Yd:Θ}63fXB1>Lyy9ZG.]aĄ' !AדSHZN᰷}1{6dǎȏ񬼼W~^~eLiI)$dkC Ö-[طoߔi˗~!ĸqJJJ(,,:ʔ( w_:Mbc>Cvq`-[L(:d/je}TTThC!ڵ24FB~![lĤ  !Ɛd8{,7n:rrrؾ};ʕ+ZbXt)/^SOWWyyyZGBLqxקLcr'ou!F/Hss3gΜ:غu+`wDQ# 1!lݺhc8p_|QB)O?«*P9y$oNL RB#=߿_(,Y† ϥ5C`6ͥV(#ݻ̜98P\19ׯ__gܹZGO1ͼ2t$-Zʕ+裏|Zbh7 6MHBk9sfҵx駵!bnݺŋٶmk֬:ҔF9x };v젤DHB w2k1PVVu1EQv͒%KdPqÏ~##ĸv9z(7o:ʈ8~8s̑c̙âE3eDQk</eTw^zQ=#Hz=}7oG?֑[P(đ#Gzlڴ ٬u,!Jff&uuuvzRB;wPSSNcٲeki顲@ uHII:cB @B#+Wnyfӵ$$zƍA>L$aÆ R+WO>a̙z}DBI'Oٲe֑0L[NZw)G2&(߿l4B!*++橧b̙ZGb\Bٳ۷kTUUMQQQHss3(BJJ  cM`Gce.`BkHqHӱyfn߾ͧ~ʦMZZv;</^$--UVlbJcҥ8q nK]1"\۷ॗ^BwJ(ԩStvvb4yHJJ:XQQff֮]u$q%KdZ[[ٳgzrMu4!41{l]F{{z|嗼ZBLPmmm;wH$N_~YXSJ ɓ~"V"++KXB+lI01R8uW&??_8!.\@]]ɬ]vB"ģy7'D*={Q륦TTT1v9u@D***uK&&- fΜ̙39}4Ge˖-$''kKR< dddrJ/͛7sqa٤⡚!O?-`0ٳgq:X,{99B `VXSO=IJJbݺu TdXԫ<.Cݍjeڵ,&Trss|2O0ׯ_WILL$11uI  ܾ}[=Hz S 11IQ&]ho!%%k83gfqFv'nsi6o.ĞsEQHLLdժU㺹w>RTT$3 1E"]Ɲ;we&ŋa) ryہމQ/^LAAFtZBL`l۶N , ׯ7x<455oٵk |dddDشi~!$%%OUU8qV, f;w_s~QTTŋ,\wR__O4U577;ҥKo7ov`W"`41 X,~7otԤj4d(”ɓ|>t:eee2ƓT^}U_zk\ulnn&KG)$ֲ͛Az=㥗^hmmڵkDQ~?:e˖i:URSSٶm]]]۷h4Jyy9yyy̙3+W߳fV+h^jىngӦM@z%%%ד޽{Yr%P!`6l{gdc{=~t:VMjrU6o͛7pxbE!??zRSSIHHpBͤÇ~Ybb"ׯ ?p9t:IIIXBZ _gϞ=ZGzkǎcΝZGBV^Eף( F i,Ǝ㡦~[ztbHQ&]Vhii!%% [>vKQ~}X({޽~Asf͚5ǫbw8dff|Ќ~tA3L>ӧOfٲe%ILH7oޤUVi?7bX"xLnk׮ֆNSKL> `68%p 0~@ @RRO=ZӌtZ(Zww7UUU8NL&=VUXl6._#3b8~83f`֬Yex"pKjA1|D"(… Gd,A1|`Zԋo0c .\(fG @BkrYIl68un^ܹs? 
Peggpp1z8/^Lnn)իOդpz7]ф}\.nݺEkk,b0ͥ\EkQ[[KKKzN" ̝;_~YB!@LZ`4)++/&jsk̪UHMM:@7 <x7`BL5.۷oܯg6tR[[Ph  HCC@u8)++}H Q&-d{.uuuAuy1k,r8AB!nܸASS`PŋINN:˙6mڈ[Q>v9b"rm~?q a, EEE̜9S kB{ <4Z(z*jW#NǬY(..fҥK~(z)((]v}vGtwfÆ XݯM :;; B 0DQE!--3f'5t>EF1HHhM @L @BDhhh͛ ̚5 G ijj͛tuu]|>,ZH~~E">SxJziҘ7oވO-Z촴"z=999撓#]ǘ吝>p8` 0 呗GVVֱ$&Q& !'ve,e^q0pu^0k,i櫯W_}}ݼyV***F #+p8l`rrccc8~8ZV:W_~u=XVypɲ|ɿgRAtQHa[bxx ָ]varr> [ֲwq?ߘȵZ Z-]<\\:T*VصkbYFeY%I^Z;wJXDD3"p0^yٳg+-bzR00ZrUabbbN« IrTU0ُǞ}Y}:tk_ڊVyh4iX. 8TvRAR 70z1u+y޼oz<0o((عs'YKD8lq .055ZE|T*(warrr2a+$S? @?~B M_G?>y~eQT <`9aA\.reyG>a۶h4.ǁ8L&An]E1 0P.a cD)`k V "ZuQQPa$o45~|b1 "`dvbbߣ{YrZ'\h. xV AF\q-zzE-䔥ddZD.o /PP8MlYZ,˂eY{mE1 i QrӪnq4IH'LE$餫%IT.C>( ԈV  5@+qғ+ #,yEEAIDsEXD>iinffӘׇ$?ŊRJRi5zu6}o& 56 "Z54y&q4M4Ml6vaNl6^ifh6(, }UUP(d`$IjVW(LLO/TA]uQ(`" À+Rq.qϪiL·MW.qZAgd5VB6GP1y @لeY m yE,jng2£X,b`` }Gɴ$sgQt vYt:eA@UUB4aLD>Gi> Ei( ((J6""S*V,INvfzt:},ULV $!4M4rh40 #chzRV6m4( $ ,#@De2a-LP%]죰mih`zzmu]H6Ž,C4`fv("ɤäD40}ǎ×e}u;v% YݴF/u:mxvH5&6am fI+%|ߟ7թ~UUq&׷fqv+s+ h4qZ ̽}GTYEQfcpr9]P(qHUUh4j)MKӴ5-I"*JT*miDDD ""ZE@Q垍! Z:aY:6pك$⃱3uDQmwx}bѾ(BWs[m~ j.j`rj*1S.9΍GW MӐf:rA@*k`'waQEu=orr04麎lvQwvqv;=^^OKs+y>&?' Kc'MdZRȰhb2Lzz/ 4Rn 6.&.s)I\###;pe'c T`_:}kqy-oAאhXV. 1>9r]S1[1JIDATf(5u 2W7oۅBhbqС^֩@kEPW0uKnL{}KBƱt=ᄋ; μ}{}xatttُ'""m/6zpUpAkPV{6hy6k U žlݻ!"u+6DDtWC=a)`8u~;mz=$""aksZ 5UU4(rݨXQ!""Z9f3mOK#G>^o!""ZuQ'=;9il`BQyKxQ [DDq#<2oN^`C'No QOњ>:{キC!""ÇR`ݽ Ҏ;ct:8|08ƞ={k׮^>b?Z+$k;׿5A@__n .KDDDDDDkK/N|3 Q{4%?!"4Mþ}P(z0qQ{_Az5T"""""u=TU7^]2 ;n㭷o~t$Iعs'v؁Lv"""""8w/@^袸 2>>7|v;&oߎݻwCҸ </"O} [lp FFFmkxw""8d0::Qdhma&?o%g NESȀ73g^yPU}躎ݻwc۶mEG#'"""""Zy8t2 >rEfjq ]6u{all R M0::-[]qFDDj矇8]xNMutM馛Ν;C! 
ôj*OV Fр,˸{=,+]Q\4ɓ'[o!Mq5````LDDDDDSK/ZEQp]w0^hE0U5<< _qIi(AUU c4mMDDDDD ^y 0p]wAQ^h15\.[om̙38x :NHEaL&Yi&lڴ=8^~e `nmhahZwZfff011Z D( ijY{8b""Z.~s.T*>~zzضm۲yO~< /-ɶNn/}47ހ<qFTЈzmX=<ϛw_It;xq}ĉWǏG?x |'|<&''cAEkO]v[o4,˂eY4 (;SO={ O=NwCӁa$ Νã>'d2d20;;7xCCCvOS*piq͛7>@( 8wroߎcǎa۶mxJyqb4A^GE(˘L?>hDDkK8~8!<σ$Ii"B:111zN9[8L$Epe}½ޛnKV\~ilݺ[t$(JIn?ķE\x I EѼ26q?~N!ٳ{Ј: z :z;,}$a``###ؼy3lLD&,<K8cٳ$zDW=S!rr|Dža)LMMO(BLjL&] -0 TUTUњ4>>w}a8EΝ;q-hh<zO+TUyid[(J(زe њ "CSp An\s VDfggөh/%dYFZM+YKDb&}j5^ECCCسg4M0F),{ɓ8z(+< Ʀ>T*eDDDDWZ'Obbb"ݖ47 wƭr0"˒Դѣ_'jDBXd k8{,N#%bz'ODV i?puסP(zD#FD늢(eTu>}:m})qCU4LĩS`& R3tAXZ(JR_Aٴwa^KGUUx$ b###ѪaDDc\CCC螾v̙ N]Kz, 0t [RaH""ڰfff0;;4 s\3+au:r9 w4p"ZLL+LLTsGɇ8ӕCIURXLKDUx8nhs]w%ACrjjJ&"Z!"""( |tW"Ϋ8>YdH"˱6{ʲ)tM0 aYּ mS/HXt!ϣRR`Νu}5OD0""+4NZ-X˲PaY^{5Qa$S$DQNe\M4 b#ϧIݥtu>rhm0|u/O)˗Ju]NjDDѺ}~z{ Hrl[x۶md2ORR@]Yxv Vp6D)\@otTDeqmض 8IaVtu. bq^CDD2%+ NyTU7-7V]8YxN ~BWg  +:r2R1b0vt[~=0թ~8Nx{S䰧ՍQAq`&,JGQ0 L n#c(0 tf162 DQiT7MӠiښ8i3 M{ADi{NPr tRf"Ǔ0I0.ZA݆eY( , R iX,¶myNyfhې$)I XUUEyBBLhi ։U$HU)KWt?.HsyꍃBG}l/)jV#A$t2ILF|>NVB4i8i ( DQ(J,+ i4MiHdt*4M(ul6}E!"cDDDDn8,BDD@ل,hZN*cJi\*,{\Ыm.:/; Rewh4X϶,l6!lNC۶EJҢAATBӁeYiH80as]vau]EQ`& HCM ŕfUEe]Mъxk]18O`ͰDEx_ (P3uDQAEl$:ꦍSg'PP10-!dqO]tq|ˀ0b M[*0"JS%W6 fs6IP,|u/" KVj%!K7EQ0%dP(|u\"""p0L^019]S19]Q#c TcVqc O0P-cbr1bLq7㟎8k*fażn!\F!Apuml he,[o(ĞqAܹFiW,DQ xcsߍ*q~}mtFDDD+žۘDӮ<XC;2\PZCN.ODDD """ZqqmLGӪ!p=qz9 ZZ(->R&j?c414}D-oհ j^^@ D.MDDD """Z\sncK (pJ0""""""""ZxYhcDDDDDDDD1""""""""Z2{ADDDDDDDD+y'fLIENDB`pg_auto_failover-1.6.3/docs/fsm/000077500000000000000000000000001414244367200165725ustar00rootroot00000000000000pg_auto_failover-1.6.3/docs/fsm/README000066400000000000000000000003541414244367200174540ustar00rootroot00000000000000These files are descriptions of UML diagrams in the plantuml format. 
They can be used to generate pretty pictures by running plantuml locally, or by plugging them into https://www.planttext.com/ or http://www.plantuml.com/plantuml/uml pg_auto_failover-1.6.3/docs/fsm/group-state-machine000066400000000000000000000024551414244367200223770ustar00rootroot00000000000000@startuml title Group States [*] --> Single Single : just a primary Single : (single, ) note right of Single This can also be entered at any time by calling remove_node() on one of the nodes end note State "New Standby" as WaitStandby WaitStandby: just added a new node WaitStandby: (single, wait_standby) Single -down-> WaitStandby State "Ready to replicate" as WaitPrimary WaitPrimary: creating the replication slot WaitPrimary: (wait_primary, wait_standby) WaitStandby -down-> WaitPrimary State "Catching Up" as CatchingUp CatchingUp: a primary and a lagging secondary CatchingUp: (wait_primary, catching_up) WaitPrimary -down-> CatchingUp State "Caught Up" as CaughtUp CaughtUp: a primary and a ready secondary CaughtUp: (primary, secondary) CatchingUp -down-> CaughtUp CaughtUp --> CatchingUp state "Failing Over" as FailingOver FailingOver : (draining, prepare_promotion) CaughtUp -down-> FailingOver CaughtUp --> FailingOver : perform_failover() CatchingUp --> FailingOver : perform_failover() Demoted : (demoted, prepare_promotion) FailingOver -down-> DemoteTimeout state "Demote Timeout" as DemoteTimeout DemoteTimeout : (demote_timeout, stop_replication) DemoteTimeout -down-> Demoted Demoted : (demoted, wait_primary) Demoted --> CatchingUp : taken if the failed\nnode reappears @enduml pg_auto_failover-1.6.3/docs/fsm/group-state-machine.png000066400000000000000000001404601414244367200231610ustar00rootroot00000000000000PNG  IHDRQw )tEXtcopyleftGenerated by http://plantuml.com09zTXtplantumlxn0 DOICP$kQ'afmdHr('v$E#%XmgfZĖ[4  ragi,p(ȹu _0XZ7Ԫ0 3`Ң8;"GXThR@IhnQ8/$.wG s0>opO=fpr:Ts!I>&l,Fjʕ蚥IW` +Ŋ4>: /}4|g4ӪUz IG{>V2Eg,:W"}ܿaJq}t½@i]&=d\t&t48)B "a> o&`TTŲ;XJׄΦ%\˭JXӴ:_ eOB]Ӵ 
TF0>!p9>c&Kq[lqyV5|߬Y3'1c_8.9.]ĨT+y=Mv:~?'o鳥3gΨ}6%{ssGΞ=V㨶2%++K]{.ףG~ᒊ :<BTHvء2-ս8OEa{X-5*6ey믿ӧ{XԉCuQ̚5ne W_}^R RSjH'S('|w}5l)%$$h7JN)/R2dY7GUڳ^otdK>F+õ*XBB۶mVF4ҴmY 22Ҫj'6m'J8x.gΜyW% jܸ5'N5o yiuo٘cƌ͹MzTTߤO:{2T8$$D|fMFgmAe6YDvn5lQ޽[U`׮]"k$1uKy< <y8pWNM;<pyp|Ο?q\ ӑxk׆:=e˖o&ѿ;w>|l6WUUt8ӟmD{999b>RQQpwqF5dԩhF?^]}BBB֭[iTx:)))j?ik{8ӉxؤBNĦo3 999*գ=pW$yZWTTdXhOΓ Oo6iO\͹Mk2hO}<MI&<M?;M8xYYY5uEx=cZ][ÒUq:OjUԀyP\>8=&YĶ 'w$>[6-7EQ{o2vٝ"+M9I|l9NX,>fk<ΫV{8ꜳe!lu]XeYςe ~[8=dwGQmi+*,9qـ.ùMY_Z8/66699ގsu1}d:Ճw/\ZRذ=:AAÇ(+f[vNXX:[f~)m_ޭ۝2O7Κ|];oH/\a0rrr-zWs=O{䓝7o+Ix?{ډ;w&\nVeצ決Gۢ\t7Vw^\\\|||jjjnnÇ%yଡ଼*>*uu6mn.)1to߾D)WV7aƅO?MiRu֭o)8=29tN0T;/f^BBBzzz~~dT:Oge'?y )_'&QUτjiW yl@7w4yIII3v8rLK]zHK[rԨ֯97}Χdl,2_%e[煇s&mӧ BCo%ZT.U)MӪIO'sjܞy9v]:ujС\ϓ$^_Ӯ <(88o&tqGS5 vfsee6.[>wm{=,W)VFlR ʹyɳIwQXXX{RR";u) .]n{nS?"]M[6nv횿#5reV̙i' l,1a="`pC1cv[ ~Zow7h./[ѿfJeoh4OoL&8u[zm'xq}raΒaG鈲8pՄHK;ozꚗV.e6_.8ӝׯ6mZz^uɸqO1}祥-a;.NhM2Uཨp#p@};O 4|ƍkiӞoڵ ۷uYMl瑭\%=)lyhsWIyPM'y8p^u~ իyp;.PdV߆?ncROw[-ErR;^UpΫ\SEl{Ke'LGzvNӝfp w0ȝtP$' iGqytn=5Z:fÆR0Kc0'Ӳ.YKm A.VA6D=HS8X b4Zuw0s?8}fH:uI=Z;=XQΠ?v il7pY85G_yh$Q0Fj[^~nk!)5w;8\֭v"<}'5y6ceyF!?y1zFѰ>/SSj[5u;80ej7(Γu>DGU'<;?Oo+>7f{;w?m5rnya}l_=N! 
my d uݠH˙3Ӣdw$w|yJ`9[̥g틞 ;44+lPVٝ>8tȽ 7OS8~vO8ŋOMo؟I*#gLsf2s<5λxRuޘ+|q⡹o~2sɾ)bHdwmwpvgEVZ`?TqlJՙ @lS~G#<W%o&,xCuٺ}^7S}d+c~^үw {4僲Tzxx{A>{" I%^4k|߾稛xGZuPP88>>"Q?Ef* 1< XB8pyG]DsW];{ȎQ ~o</8R$s"<_,DZY(NrhQ]EE=$̦M=zpenۺMsӦRiӞUntQ e)ΐ!wۗ8n΅.G?!{{7iiO[FG1jŚR{o{O_(>{^+$''322 - #p/H|}[=mn{ʁISÍI*ۉ|8y8O>؞Q9\^.$<=ҵI'.~ "2d6ԆFWZ]Z@NDZF'']~΃7>\TT$8/}PN!^KO~r0!~'GBgjhdeAujq}*=r]yu8Fsڒ#>y>E88p<y8p<y8py8p<y8p<y8py8p<y8p<y8py8p<y8p<y8py8p}T|nv/.ܯy̶e |OE9;G< 7ӂxyԩey|)yNae8jCJ4 -vpy%8\#žQsMx}'Yxj,=*wt/.!gܾ/#ݡ._^{bsc*;|s}tyԧYb垢 1 }Ҋ*ry՜*>޲_vB>GW-749|/, .64eG񄏅DN)..6hp5N6`UEEE& K3B3j2nk{m1xJ$jQFV/V!rEhp/ŏS={vqkUUf۾M5 6A{u%wޥsEƠI%''ggg把 y_MȑkΩS=8:z[Җ{u!/n=S\\lXHqШE yee:ᇝv˻u30i7Κxvy3fLV bfI._Ȕҭ"nڜ6ޣGg.>@{KbH%eG֬w\&[2C|k2رmoZ5?}ޚE䳎[*!!!===??d2?TzX2f„ۺ^<<wΜq꭮]8rd_^'eSi6mn.)1:Tuo>(N$PMo_++wQ%YXOS,۷U<[R޴ieY?R~=WgXuSRr_(; C\ܦ~ Ʉ$SSYzopp$aR^iӞ݊h)$QIB>rrQfk,XlPPR7U+_w[}-={ng˧nc%&&fddZ,ՇDxݻw(^" Lo:VvH!"رQ#F r߾v7TV+^LӶ-a vjU6ĩγzKleǷzraqvzl6-8>'!R6Qs~3EΝ+:[OR{*n}}[ʬg.6Q8w\6mZ:p}Rͩ<ܥqqq񩩩&o|DKK9*ӧ $sGF-/ϓmj%~mɒ?vN}&6)DNڔ)#\jj>yPΓe<'~fܦǏoQeCm m{6 [jdK摴vn}kԨǤ KI)eqVzVvpԡJJÇ>cǰU.' Vx=AAg9_遁MUwdA{+88HUSO\Xy8Wzݶ._/ɊKIB e2sؽNv%{CڵsTJNg85p^OIW;rd6=:z:)'kժ(Ή2voԨǬۉM%.ϟHtM NܑY3[V+M$e+Zzծ۩j[]sq΃k༒c$k.TzsmѸstzvnsٲꜛmo+oZϪU3g%sXݷiwȪuSr~upGN+٘sp寔̝՞;w\ĉO^nq΃;=+wXvy8|yAAk8_$p?rܹGGzc3o#pԡBBZ:)XUΫfꮪŲM>ӧ p΃r޲emy:OB>8p΃r^DDoWwvGQhpڍ|Gc&m9"l-dɟe%3;W]]l[=Spԕ󂃃GEѠD+U:F8ј Vd+NZvx)l2qڴgQ͐ 3 ><6γMQ 2$[h1Џj:RV춀f"կ!e|?cE[!ypUH?xb}(>s>B-F(1p9nK3R5p piyM8eYoOn4<(84@s)Q8ј mH.bFy)}ܽo_qOku=ВYVygd}(j_];_TH e;Q L N|p[;lz]S!yI)'>v#7r懍)8p/O tztAHHKQ\3&RvG p<'83A+8"\nꥣ / 24+Ab\>ҹ_1{h4J@<3H. 
%&&J7}cK>2Q>J@<ګX,(I!h#K]I>,ȔCR>P8TwᢢBFsq^"|LaGr;l86tAQ#3.DŽp@5<<y8p<y8p<y<y8p<y8p<<y8p<y8p<<y8p<y8p<<yHf'γzp$KV]8<&y>2T+#6K?pެMe{y'}4)X$Sړ»^)qp1޲.in1fy>qLR _͏|M<ҙsƽқ?y_.]Ypo7`r^mC̏AprW_kWZpS4ԩey9n\]Cs#29}X[Q#mW^PwpQsMx}'Yxj?,(wt/."M?N(Lo-%rJqql.//G{umضE0M[TTd2<\~t9#=&wC!Y <\?>7jȋ*u_~Cً233<\v ~1+,[_h֭o2eder^Qt;)좢"\QQ px|bSS>`)ȂIk<'uUvhݯF}:8wi\\\|||jjjnnd[8p?b֪Usj'tm=}mܦo{TvѯJʎmZTtT皞ܺ-wܪNpx$FzLC27ЛҖH#QP^TCΝoWY[(eâbʘ;͟ʴis= ^ŵU).wI=,j~ڱcnqVwR S'u^]Yu]J Z#{׮wHr#bXj~=-z,[UU a4.sVA^_) ,s[;>jUp㶕wT aW!ӸqO4ǝ'靤M6pjռ-8y8pp<e9'Γ:w|[׮w؞tfQ2 I_5E. .O[vs;thőct~PCUhMm)jO{y yC9[Z&ԑE 7f^R^vv@Wyn]p`J>wۗ8nkX|ڍP]RȤ;|8MW.8(шv|:eؼyՎ*Yow#6$~~Mmb\Fϻt(} 78px.S,/Lwa2~Ҁ?n-]~p3\V{Fybr\1mN }>͢7'\ިLC^ƻ6')%%%OlL Y֪̐_jz]Iz3/5]A999Y,9fE+?I/o#O]wrCQ=Q?NESl)3_]ӄ,**2$ySrɔͣսA+77nTɩkx<)gff!$I<I%֬O۲y g/2YRS^|+ i$BBBLo^QQQNNNzztpҩW8?CBȁ!$r灷jl6ҩ6xra r`!*r ȥ3猍{)7(A\L<$od弬ۆG<ѯ p]*iߟ*>Zw]dyN)jr--890iqVX:5G xKN:p\3 Jb6^''Yx; D hM GWѓS6^G̏ɳl5@66/_^qr/$ω ?>4wa i7]TTd6+**j;d ?CXUM<ر,yymގ[Ϋl~p} bB89{]s޼IcF9zta%0"wYY҉5/޽֮]/kٰ=: rj z'9Gb9aamvn͚M~Y,.+Uǿ'rr/\o2H5Jgq&Y5|y<(ƌTdu޼I8监@rZ {Jq8y8<oޤ y+3s/mo;ߨ~!!-O.P/_X^SsY\ /^RbTӫIedG-znfk8цMj]U]!󄅵цt5V##*+wkcb[Y`ipmd '%!\Z]΃:q]w棏߹xqҋ壢NJB 26}ZOﻧN}FֹiRΦM{VuF+Djzn;CYsY҇[=h~/p>ڐlHzm~G'5]P_ q#ĹsGFJ ǎҦϙ3Nl$[ڧ9cs2>vyM>\8ܦS^-l6(knLmW([EޕĉO ݬDQd6mnsqNZ>hI6ݚ;ZPIelOlmP;)@h շ5pqlhذjC{N.JcVR15q W4>.yRms{nj6$t4e-qaη[]ޡTړJvv(-88ɓG'>SvԞyNH[fcWkǎeM 1j⼹sǏA'1b|8BBZWMkտ{Cݩyp-m]<G[l)#װ!|8""zk?1d@ 'NlZ%ǎeu_ 'v[:Wv[[bַdGp;E_OLl+R^XJUM%NfN3.wɲ%&c)tVBl|y(˿F=&ȾyWY2w`A.wɲ%&c)tv҆8JgqJ8h+klX hݻwRՓPki西 CF|琡Q/WaՏʄy O3ԮG,7 ǃ2>zr~t<<}fx~ ./*RG}%c?и^rif<0#O5D~g춪GQwlꞜb4 Yrj/U#|_py`I7Ų%<8ݮI?enʕlo)[1+,kΟ8<7r{~ܽ{篬N ؔ#9c\|hy! 
dk8~:`:P4leh8U%$^BBBzzh}ؾ|]~Ó$+'|;UUU%癿Hg| 0xc*p+3؎f<3-˩#/ 4]A999ş/5C$K%TTT]lI2R=O_,[rioCPujEJֿW0Q/B1)Grʱj+qTgo*1';|'Zzu$11Qx#)-&^l lÃϤܟ4ߑ[$N}%~^ rq(Grd)GW#< u;zo2vb>y~ ">+ә3-RkR嗥?R[RV*[]V_Y+z@P뉣!4$"@%bbT, nِsdyfɓ3,şݪ-CE<<;#Q)⿭:/@jQVJmc!]Ȯy6g>& J%5l{i_t-vq^+dsNHH%evPۣHZ#ۙv:HQZBi%xlP~OM}2bo_[X_͢")IJ41}]'ph=j=:r|>8 i^wd.>yn"k;co\/)\ߵBQNI;+_<гNSI{/#$OV}I%7X$wD&Q!f/cVts>b}B|߻%C;%%v(/ǡNsv.dM 2KSi2yQ.{c1z媳/+Em8tƛ5ԡ\XwHHxaʨy:r#EI;R8 <-je.>o:y!/%*&rqhQmdu6<mK4?ʝ`+z= ň8Qd]PjNU9>p,HsJOr{/ QRsqԱ9[cS((iX}c1^H `LQggSOan/e#6(1MvF`\O'Ugטr f9F\9luB{omX/7QUp~U3KC%-.:Mvy%<tK Ƨ*<Ms)Fk<Jyf `ۅeWo6 sOYIЮ4 Ml !~r((T1b.Ō$e#Ix<{=C3W64-l3ޓۣ'G7x*<\,;',Y y&0= uhk'?lD݆)pz*$c {̊h//\>rZ1baf9üfcDsxf?/K_.Ny., z< +}Xgq/uFg=f%eԼ N "rO}vq'm0իQN6>H|o17kRbN8X7?ExVF~7:7qs:a@{pn+Ar(dzMS^KТx=YaE7p7I5\+ VfeR'H>_49ڃ5񰥵 p:6dM `QU2jA?[ 8XOºԭa:MYl ~p{*cjX}k ^j<<6hcq2͑zXsm=.6=ԃ Rr0kJ~ pӎY:AuJ%', D<`An]N9WA)#jD2Uglב 5y 8~y%RSĭ6 >,r!c" b2v#zqJ6> fy ls.9O2xFT{Gƃ{K"PpĮb Oodʡd/kې:6i.`~2 B;xx)`hY#u'?Xkv!rsDEKCApWjwV(F_ l>:t)f^EU4,]ӖMۭʪ&+]R\|p?`+œ ,y#?Ey)s?F.)B}GO xO2N-BBbZjE{9>2ls$ޭhYλ0u.7ӏEXT(B- T*h|pōGӸRU4%9zXۓ *rP*֢=\tȚt#Ez]O]iR֌ѯ_?TY}_Q[M&YVE'7Yr|> m:ІBB>x%}GGi^?qbw/;q}4'z.S(jΒf[Z = ռ<9Ԕc/h0=}PYxdL_,zzR䑣"(--XXx2j:_ȃ YWUM(۶M} *u֬ɔQ =:ӐrcZ/ spx\XW_f9sf ty4@hQQmymV7::Z"dggT* O~vO[+ya;Wl=…C9g#[[o͚:tX Ĕi'^~~ ןy$ny4.pR[*P+ݰ#""9IV_ڥؒUk@=󪳊^}Jd ]'\&NI8p@sw}O\?rRr}ee.c-|q>Xy111RTTj4KiYSٛ8YHQӧ{:::0GyT ّkJY8O~;9=V-vrupqň8 Yʱ FmܸscE-U;O#222***!!!33SRYo"{`#l(z,o frpx!ի)c܃4lڋk弼hc E2͑V༾W$ /19_}eVQsM?øǷs/*y%)6Yqg#G>Oe9 H5,%%bvq6r :((T'`} B=yzO/%{B#J\] fU]7&7">~ν 4=}rCړ^8 4=Y,vURda~Uu7Iqʽ<38FnNSoۏ9̙3GussƍxqwuJiFooб+FTWWNc1n8}qu,j+{ Y> 555---˗/g pMzj->!2įk.2 oabFT*4!6 ې؊+BCu* 'o8Ϗyp?Bl-ϒj祌wʯ]ƌ<vqqa߿РAlO>+!HffaA%hyzWܴMJMp8XUpgvNy-_Y{]y9~zG#EFGI54t'bT*hѢNm@ 9Iql σ9wgySTP( 2$##HQFghň1c>|xڴi7<.. 9g#cjn.|t=[ԋQMƎ_j?SII;䓿0cƼh|i.%3>3=uH!$d_ Fn^Xܭ @(,˗/JjtT9]E_>وƓ/k41\ص2wW/t塡eܻvvv$61˗;tг6#@o/j 3VKx;y۶B)esF:JB$Ƴ 6YVtxHmpwl))2^pc),JI4hZ>wnзƎ޺_]fҐ v|̤Up0hGG[~s9l3<ÆS Lw LjʚVNPu6裃Lpgؒ9o^zIP5h֝/_>fp:rlWss! 
_CݥPnV*wp<.櫯Vg, .-K5õLgyBcϭ Cs6D$;;[V[O.?`Ort=γ 6> K{n5ZDɚm›\8r^LLT*U**~r3V_KD< rQ[\~|[)|Ao |pD1G5)#ۑ| XZO"t=γ,c3j/"I?exc[/66{`SRYOw/Wp9Hq|Ȥ7JC6< _D/ߖcmۋűq"Aiiii4jV;͵zgq#h߹&G۩Y50jfبQãG Ga~yQwI-sA=8-]~Y bT 1ypoQI( ڼ'Ϭ+E څzp[sCɸOPW@=8-YE]>D  2;y]4๕aF\[Ǡ*-iЍ7kNSp-1%8?$-RWg/T9h,΃wJ+!ndvYq5><8$t HeI[psww1Li.,ΥLpPmv82fՔk>~ʨy,VF>^Ny-΃l ǟ|7f)F̭-.G}ZcH|'ne7 /2 EAݿaK\ Dgu[yhy!/Rh,uO+i*T5yv&m-u83d]_b\V] E=8γ9ZMsi#砝Ow^#<Y1K"ճpKO΃l7WiVgV-Xpy )ʌju qqqׯGk_̙3/ 4ެt!pU4\'xdgﻗzbUO>k5cX3yyw=e˖ӧ3._lY4/sj y=!Cddd)߈ 1f̘ÇO6cQ1߫@K58d/2oB:FkP2@] ;;;QTccl2r^JJkxxZ1G#).Ŕ ̒+W0;uOIx<0c3O4=Iܫ@׿<zfaK\ dE&LksKP΃~n2ce#{a&|x <0C)wS<<]yp%[BXy_L!wj3>7y-u}f͚dgoJPs^X<8Ĺaf,0?` @;+QWpLI[K= sxL4n궧~g$8Nד+KP=jOB"^ӧ{l;P$3RWcǝLy40tM: _f&&A%][~8O~;9=VѳtLYbwV"ypބ 6n\Fۤ1̱M+oD7|3 =!;"##233U*zv`3Tשy<8O}GgebӰ~ |tF/뿗_=s֊JTF8qY0#_>k,GGG{{{ooYg/..6l_|\v!C(?ݻwى>J7T~7Ru0TT]%?x F:444γZX|՗Ƿװߗ^]nCwƍXl.{ dZf~_;;?C"7(sm t#GT*kKK˚GqMFKIIaF~gB& \|9;1IS9;v0aBΟ2 o wP4o#hγEW(suu޻7cU@7[rO> 9&6o4hГ۶$M y3~h0p@nE \\\] ߿РAčl!Qyz4~@C%S%+++<9OSz@/;r OOOGG^fǚॗ^ڰaÆ:w.8zмF6ĒvCƢEBNhRzy\Ϥ]vq'fχ+::X2<8<r;ܞ<;0`[ؑC ȠuL@#Gʌܲeӯ_N×/_nn;Y1CeZC+`|M)9wB΃pܹSRR\]]ۛT1f̘/@ i$:3:OgŌw MvXInCyz8y<8ϊs=z"[y΃z@nys,;;qS, A y༾8<!p<8@_;+{.]:4>ȋuY֞={GYvA˥iU24 sW/%%e4@ 8z(!<}eիB!Ȍ7b_`=f1_w|999-ӻ9p~煄|07_fMv>qWPSYY eQ̰.2єYt]4](zRw ,'ՙfq^C}.׿FND罀p3u^cdc|S̙C|饗طy8/,lՊ(q^w 8X([&OyzŏYy^g׻999lp^Pe?65СOo۶>ݻ8;?cg}J5k}*M3/ ƍQR"6qC+`O߾cfӻkuȐhRlד/]:gG NiS)e]7 g&;#wR,HBNU q{ގy/+~b?۟k? sWgh8$QGa?Y֝;4+,a#?{6GgXk4rVEP_?a('`VD+@1ypHB-V80wr?@Sz^ d'd}gZNN?/1/v8޴zfwdp566._˖-{3ȋu:#y-ց]%CP>wh_~t8i|aÞz5V*A]ulWȝMhJLrʱdqWf׻..c'wO9hГg7̛ٙ(/d'cwMz34^^88_n)kT[\NjazsngO9+<ۏ7qA(\ُ3wufp۝p2Ý- <ypxSٶ:ӻ{>HǝlWss! .(Hy&tVXﵩfpkzM;w|}_9op988tj<׳DcܳP4 P_?dSɌΣ5k2smhzd:ɸm鞌MUo/ɞOe_i9Æf0 ӧ7eRk1y+ŎqpYYV-, M|bbs}c^iF Ouܹ(5ts{1WW{輆-+`wFr֜{KgJ@0y̘gpͧ]`)9Fdz 9j'|ϓ?hX 8jJI]NK =oBN:akzDkqj'O˹oޟ<y}I@˴ugn>=C.z([R86pghsKU%w{mCa(3192udOs^AD$;;[Vp{[F ǰx.'aSLGŒy111RTTj45΃z?}!z"r&EOLo$yd;rOPkZ5΃z;I}{)s!(Ui'F+60‹TTuuuk*MMM&sS/J]%exoaNKK$Z=5΃zV9GΦvƙYK~ ~5S"YBm 7*hLh{䊻x!أސ8TOժT*R?~JIm7T! 
w>oglGchX&QF I<SoվC]%n6.ϫDmܺG8.k;&1Ԯ <y4DO"A}w1lC(jW<ӞZ...Ζd ;Gn7rVGLj-fPVDmZ+ypiN0 _ffBJԄ VRk6C-Qv8γtR]L`RD$ z+]mBmZl<8̧jR*`&r|s~,)Rk6C-po 7<kSFͻr(UsBs USHvnY΃)%[ZQyp9d)Ad>Tpj@X@UyKOFUyT(Fmm|΃) =;;׳ymH_rUsZs#Ep*'<*TOȔQp= l 8JN I yZ4/^pM{ 8Kp= ÖwVGU8y4« 8Kdgo\<׳8y(Y ׳8y&,p!p= lzrTϕC)Q·p92- 8zҐ 8ƛ5Ry6AuVd仗Pp?%Fu8Kz8d;J.Ƣ*<WU\|% 8"@ؤEU8y\ ŋ<`+l@=8yչQI֔ ՞M8zň5yԏ' O.6ެۅeԏTgZ嵄S?R9u<Rz>8bs<5}QwI2opOSZ 8yoܯ>~ϼ-{u޷g!G8yom/|8/)Kf<~Ҥ-.*常L G[—`F2J<`fVڕ2j3[D_ŧ pky ~tOL̝l[Մ[D&*^E$"wV<'u)pέ|T]iȦyaPPq0\.ν((7yp4ץH5*>`隶onUV555 ycj%')FAg(߻Sd~3PuGFypr{^wCVm׶WjVkڃ<`|3zOr'm q@Rٸ<8aKiJr>'6U6]!?T*m\{p r5>~!ZfG~P ]}E-9n6Lfڃ<`_RI088<>tӳfM’0;iWU~O$gx{Od?Nf-$ SYVgʺY髭\*P{ݰ#""9IVڨ*ǭS">9@ 3JYY;wraz5>j4GGGfu-!3 F]euy%|rd2r^LLT*U*pgi !Mogy }@nrtf1.8ϔu3DWNcoj8y$472ogdY=<#KeCvDFFFEE%$$dffT*8y}keѿqބ  i=HYV96%Z^MJ΃YS&%?ml3ǏoD"7(S^9vK7JJ4La4װaEsϙZV99o׷+ثEp<8I4x֬l`Ҥ?0'h 55S^a/1bV~C!_ծFSV7n܈-l΃!p<8<8yyC <8pyp8΃<8scv,kjm sw>z&/йy@_w <!mJnv_ y5w}۴)hGggd]7 g&hhn94#oU"$F ѣӲaG|w^#޾c hv:z+d5vv7[&`Р'@%3cRg͚`oߟ sTwadZL7D.455J:KQPhxtnp0y+[wRۋ| ziYͅ˗GNN?/1/2^fd zb|ӏu&av0 [2Y4[/Hm,Xúפ~*Z;ȑϟ=IIH*2]db*"O0E:K֔ <Lu01lسW0jl/;v{J^U&wvJA@J5v󌯉Eue3'i4FVȢMf6H!Vo5v銕*2]lG_7NjtnpЃ6Ǹ_lVn3齶C.?@#ykGt{6E۬.O :M_8j5s>x Ћ6optϪTV{FG0{)=3`͛o:3tf4yah KJƍ]][˖chtSM=ͤu5h*05-PVHf6iܞcbv]y'[ki(HOѤbuޥؒUЭy WSD&hnK =aPnG,XCS\z7s-;/?ʡdtkp0ÖVÛ?gϐkϓotq\-;O.]R[1.na]kt?&%yd;rOPkZǓ6<`*c3j/ѓ"^-#E2‹TTuuu6N:]LHq|Ȥ7JC6< zUkmm[ (KwqH /4FIFm z<Nx|Y>e۶^M8|a6FNiF[ت j;Is\{T@VN<ЕTOժTXCѰG%EۓVmc;sG2LTR3l#IqneōGЉyڣ^vE~fStkĭ&ZmaQi8CK>oݢ}qmǤw4 H5vaz08t]{iii$666***D$ld{I(5TjhmPxq2ҞZ...Ζd >5MGK%&|b KM$5KjDRsFkk#*K lAniN0 _ffBJ7,"K{C)RfII塀R53V_KDsvJ%Mg䉔?K=LIfRSIҖm?ri>VK=ZVҏF|Dj Yڬggu%f&;{_FUS^Ky6j!+/ 8FJrgF?Ƨl@=y񰥵((T1bU >|Х2F{|I[{bL܊S[\N^iAT{ne>}Bߧ8g\|ptÖ%[pzϪ*˃<T(NS5\T5rvvt0dg͑ ~U\ lGUyNpO}#}QhEtJOS7ެAmX%uZwI]|g\ *BUX8%ztmn9’91|&vM<_U:??wY&3/D=y5?`)%?Ԇ(y3S92ٻ&Ua9ȗ x&0?YER8g9Xړz==g.قI?-wr|>h֢66+z=NA&J2V*r] @%y#S@mJ lA=y93#!s<RU[BW%<ByIy!{;\p_+<C=59/e<tX= Rxl+p{yk~Uu6>@m.]VL݆3!w\@x0Bsm}7)J*RKW<_U1mOM^\ l֢*<h?嵄u"s<UxHQP(uԨ "?`U"xF{bL ;z_9{+#FN<Mt9>gM 
޻]Xv)46??G5<mA;9[M,[}]z:G5<q5>]42yǟSߠે%v)gFDګ:~{k e<\[5%X&}v*/$s<º'p϶&4ٛg+<0%=k$\*i~(?-w#沧--nenb x5v8i.z(y^I[[[\^Ud!q3-vfM >;Msg4dJ]/#O u=9Bj<,;%voyѥPnw\yX"W"zsg,+v5!)UčcURY[C67ݻ~+?IKf`qȗ:M%ەlz˙!fr((ߌ`Y\N|#e rшk)UYUWWzrIƟs5#z4ٹ T]Q؈<,r{^wm׶WjV{yX 7f1ň^ O}`AAJ<,-R)y #zܞdTtR<,iUJ [`׏[㾯ŭ&x=8 CmLJ}z֬11_XXυqqq @VylrQ_&)i+SԔmdL_,zzR䑣"(--X2Ճ{WUM(۶Kُ;njyI׮+w޽{]]|nQ_x>7_2:9OXb2_ő# _*;mȰ$;;[R2Ճ{%pŦ^s;OcSSGy3C(Z}vi^JmmI~4d[R.$0%ç:K7Ƿ_*ΫL5tj6wyp估U+V}y֞(SyX'*{ُMM98<>t۶vkuȐ_{ןaM ptv~F&۵ypz~f.C#)h>r_eaFP}41B32c޽V1㛣wY&;::cbz=3V;;`dAwuu 7nDI݄p 8G}yfM[wR'| ziYͅ˗}GNN?/1/2^fd zCZ#.NO,a%dds>rgFҦ4,թbFj,~42=ɛ'IO0{4p8&(L ի)̰Z-'rР'>U&wf*-2 ppqfl_ob]^O#gjg?huyXyDppx fMɝ^Nl/~Ao(.Σ!rΣ!Ǿ .ik@yym(fsm <:;}lZDhk@yyN꫰uk}&3E:??յ~oZdppksKKpW,ib:#sR_I 0 faQ`|1O#oXi׀WAL.<:;I}P=  A mTx~o3 Av MXKKm <:Q8?|>Ծ  ūa4썰M mi0u!G߯Ȱ*;v젘Ķ?P la6wv /Nvw?t\xV`ґ8Gifpp_1oGǝbB FXK!wv w?͞XM[usNyVpϛWuOaʬaofƝg]̌z\[[;b1: &OK=RRdUp*0_}lkq GiZJl|j&͝Zy۶毳z-Xt#ˇ\iFħγTy$P($r6- t!*ߘwo hA\Ք~5L4p"PX,y k\ Gd«tŬ&<΃RT.+Jt!ZT|4eEJ#BxHD?iZ\!ֲxYJMuox{,h^{gJ ; 4bT!ٯgzwedx!>a"~rrwKLnGhE/ꬦBEGHzRIӴlD)mZ>+g0VVT;HN|a;L*C9ȣiwa ©4v"" M;HG*g FIzա eBD9^xt!6ОtZe U4!MO C9^xt!6^Prlg0pV&A yBH$ C9^xt!6^\.8)y2L$4 ɃB" !`H3 #N5_>WUUQ XBiA !DmyX~81ǀ$H$TGBqt!:Bq=9y:/IENDB`pg_auto_failover-1.6.3/docs/fsm/node-state-machine000066400000000000000000000043521414244367200221660ustar00rootroot00000000000000@startuml title auto-HA node states [*] --> single single --> wait_primary : another node has joined wait_primary --> primary : secondary is within\n1 WAL segment\nof the primary primary -->draining : perform_failover() called\nor primary unhealthy draining --> demote_timeout : secondary confirms\nit's receiving\nno more writes demote_timeout --> demoted : demote timeout\nexpired demoted --> catching_up : the old secondary\nis now wait_primary [*] --> wait_standby wait_standby --> catching_up : the primary has\nentered wait_primary catching_up --> secondary : standby is within\n1 WAL segment\nof the primary secondary --> catching_up : secondary is\nunhealthy 
secondary --> prepare_promotion : primary failed or\nperform_failover() called prepare_promotion --> stop_replication : caught up with primary\nor primary timed out stop_replication --> wait_primary : demote timeout\nexpired primary --> wait_primary : secondary is\nunhealthy primary --> demote_timeout : † begin shutdown process\nif primary loses contact\nwith the other nodes\n(will transition back to\nprimary if it reestablishes\ncomms with monitor) primary --> demoted : † draining --> demoted : † single : only node in the system, acting as primary note right of single it would complicate the diagram to add all the transitions but this is always reachable (if the other node is removed) end note wait_primary : there is a secondary, but it's catching up wait_standby : waiting for the primary to create a replication slot primary : There is a secondary and it's caught up draining : marked unhealthy, stop writes demote_timeout : waiting for drain timeout to expire demoted : drain timeout expired, node should be down catching_up : following a primary but not caught up secondary : following a primary, caught up prepare_promotion : secondary doesn't converge to this prepare_promotion : state until it has caught up with the prepare_promotion : primary or timed-out trying stop_replication : stop replication to cut off primary note right of primary The transitions marked † are never taken by the monitor but might be experienced by the primary if it is down for a while and never checks in to notice that it's supposed to be in "draining", for example end note @enduml pg_auto_failover-1.6.3/docs/fsm/node-state-machine.png000066400000000000000000003450751414244367200227630ustar00rootroot00000000000000PNG  IHDRu )tEXtcopyleftGenerated by http://plantuml.com09FzTXtplantumlxV]oF|'KB2i vNI#V;{$ŏH@x{;3Ks^X_$D5iJ¡n?XFN].'Q?< ץUIh3iL8(-I4.?+G{3WzI߯op++mm$UJPPvkl 'i3JDJgR>Cc.#Jea\{UHS0*[V#+PaUaT"]OQ*iC8<>vD wm&Q7nԛogSYQ~Rq.1?u]πRXQäRX sSF3M=:0vʁ3\$WUMD<2Ӊ +',rV}wreh#wJ*fH:I"gɈUB;:ۈ䑼2F' 
EA}uO\77VRQsp1i(k/t04^7,0kpOŸ >8"o8e0G:]vj=f'Ǚ<~bh^,E~S6 ;YX@qMGb;l;r< @ W`% VKtX=%7OZ%u&t= J (aMd2ytT({,R>߄9q^MϠK.v{C^XX.Q|KWM|ݿhLe^"[>>'A_Bߦ3>IY}}zw!կ1;iy uчaLˋ///eCcwa/tt#CnE{ܖ1xؤ`E[B݊CIDATx \MD$4AԴ2$%5BQDS#&?+^{^+Ң"k@\DrUDQuovwf|ޯ׳3wgvLD) :@#_U|wkkk+++*:7fx< Pg<<u:u]<5b3uΉ'-,,z뭔a.inՃ 2{֮]K Q/_L TxH|ٳWK,ᮠׯ_4i=E;wneef<8j(sssSSS[[EUTTu:@ʴرce2@b^,M:P5EwFgpvv3-ܻwodd$=禔KJJZ޽Ϟ=+X:$:_Cux" :355RWWw-YΎV#3m#^,1]@/XXX-d2f+U/fزyHn*r†T*.TVrfVFn뜰Sˍ{-***_V狥I:_`̘ٯ_dYRL ]t?#i({;PvTՙ|Z$}6w7oBzqꬳ) Wg l;;;w0 oܸA&?iVcG&V狥. tԮMfc!!!L+pr[f%\F1x5Wsuue.8um͋iK`D\K\(|binK3A2333sssXlY.]h+3f`h Tjӈo;՘Sg1ݾ~: -QN:E)f*8p O?K nj-]/:_,p:Pg@:@:u 3 x9::vԩ]vʁyyyHPg:FRΤqm63PgWP|18_FسD":uUF~pҀrYE%ݼlPgQ*'E>`>n(orW2dHySע> >p? w v' 0LRԖ׺fAAA׮]EzsN6kUDٸ\3`߈;VfCL-fÇwM7w_WLLWmڴ(3`2/'qV.FP*rQF}7|?G<˜>_ݽQg^BrL|30^4`zFʷJuS:88.ߗ_~sݺu111͑7GDD&d? >$VK29{/{UMgb㰰'q` 7&!m'!^)@P[~ZTbڗ;$c[۔f>***X{NHH`sXo JYU> h!*/\uُ9m?nyzUWWP(;F~,Jc9CZHO 7ӆzd2;I @E}U!rҮA& JR*dYYY!-hZMM2lx=U@9(o'>cb2I3sYZNOodƥE!-45|p瑗6G@ 0 ܞ4`:%ysNÒ2#;g ̢}TeE 4ʋDNI܊-}+vM;k{ hufre{磌f@y3m?38$:x%Tv+Hu4;g^?gW <[;чDC^!sg"U5ע#L0ܲ]w qY ndR`3Pgmf_WCmmsod:: @'u 6nINs)Kˑٹ73cn>+.*6r30 ZTr7{I \) ?_X@ w;)чdnv!ԖWL2y,uWLRd;LMq#!uiy(qV.G*:#HwJIwI\w-(o8 5a.*4`hfg(r182j Eu$,cbXGM7Sydz:x9Spa!b>PDZ Gi_K\@3Su΍~kl=p;%3hJrS:h ;'::m*.*aZ(ч^_3PW,-'w7,U B_kG6:Òicursv]BZ7NK\/܇Tu?od%'{$ظ1̸\ۤZUN_mk:r+2Ŵ/]Gu6p;Z.G_x@*:@nvaA>r9b7K\"@mQW!ǝH.aSqQIi@\3?K.GfL fz*%2@Ct)})wJ 7YHw*]bpa>iע ` ]b.*g11J ͈*#?lv3$T3x}7/f  vNN2A)o@ @YpҀQPgke]gT V=:oĝ@*:@yhkiÝG&ua{H hu*.*evAКU>*K_4`3^A;-67蘤C4{34N-$]&#oͣ#6}| O?)8lrg`X{EquÒkQə+lpQ9`펳rQe#@y`.*wُ$#m/{q[l`4\?*: Q,=?vJ&QRC|%TTupbpI;*r-PqQhQSTu!,sf*umCf/E::]KYwK sOMg9]KJ3,U5i_R`f?Pg@ZU^Szma5}ei9W,UÞ: jw-!{ l PgJL1Kܵ&r6p{> Fu /ܗ]Kh{qg?@0-޵$#om,o{Pg_neO|ҮO-r߽bLU57UF~D{{9>|Wa3:@@y=i_K\;m%H '" @YZNAGӳ|W2j#--U5gz@*Z+2ٹ3 f G]lv@@f\Y1˫r 0`<,)hk,RМz3P'!p8+T342مJ8y$yQU 3"˿iCt:0nuAr|#bڗҮ:,-j\WhxlϼY՞:Ke 6nKp3`.\vُk9}V_B4=;}[V78 9.e0Ȑ;Le.*bCv?sW)JJU]]cӾ9#aIٹeDvyޥppǝ.m%!apLQc61EEEMGCfiPXY\dvC!f0CΏyc{NUs<ރp3 ͜}|^ /'ҮHaPgO۳fgz߈_ 
0%c2LÝ;˹Ï"o퍸鏑sv;fh}g=m.{!ٹ}@FtsgKYZ3PTS\"#}ݸ-vKs kski:փg֝6! 0l,w^yl?]%w*s?t-*hunay5QFlIuس4ݞOAuUF~YZZq-Q,ðmCvtr  un[Bh=ӧ^L:p3Iq30};L%T K:3Վ }</S>p`S%ݨPYO_0=B3X[?n;|hii֥Kǐw%-nbki_JOpzJ~ܹՂGQ4u@ܞh~g};Pys_u։Z>/4Ӯ vveOY@2قn oH,X<#Ȝ7HŞɛlp$uwB0{XŴ ׯ|hVw_ X2|4Cu'h_gz/Hwc&^6z' !sIHL7wTX z߽b>P:Z[_l8۟S>P>wM?7oC.}8lոjPk$^TgUU-$Ժ˗;Q'i[Zg+dƴ UNr%*dn3ӆ$ ?h{-& R`LzPg?H>)JWRHgk.lfB">ޓ 7ٳ+dH EqfM*H^zňi;:WUQW`phV\CnEL_1 u(ܮ z67̵`3hm n4* Z7tFOC4,,lϟWTM9+/H6 3|6p94hD Թwo!N}oRS?(ʦ_bր B<޾.T={vݸq,[ z3_{MygҾ_L[1 uZlb:ޭ[G?-|[z=c?< M:,*dnӆ} ==oѤR4==Næ90@_;LIq4&Fťc1ÏfA3ٰV9wcS: }635G>Dk>G >>Z2+lb]aCkՙc.XAܜf뉉S_[dGKZW5wMm묊/l۔LoW fm49&&;gU5Im*37S*nGE+nh8hP;Úgzm-ǭEEU:y% ! s3 f,uF4״â\?mdAzY^ů~1>u&wJ=*;^06nވfxs0"Y.u֟033Au71Ju&_gPÒ2z)\bqiW_uYT8p`h)S-+Ke7߳g]w3{ڵN|A:/]1r‘zdm}رRCx:#<:U>JqSeԹMCvs+tŢ_T͍1USɛ33>'f"&c>w.:=q L[Rݝ/dGj_G<:MhД 1d IV?V6ԹMSCo7GCwZ,:Ti3.T~.ΤM29qz>)*ka׹w/C"yio;<ҵxwsWfIԹd[ yPg3ԹbҏegZ[RrT:Yi?Ieͭȳ٠ܾ=|pzR爈8B3:5:7jhb-VCjunu|hq;%PgOE~~_̘jm WI U Pg3Թuՙ${B3qV ;C:;:LK. Dn[]c6x+s;=2>U S *hիyx57`jڞ{c3X[?˽1r1u f穰{sT/+cn2}_*$]im11qʀ/Rs=zXw%_(Zrhv̈ՙ^qn.(:C-ǏHOW_9n·o`&H*5Zi"#IAڡ-0hћ&{wĉ}^嗭22jjYUE6=5dƷoN21˗/wTϠs_SG `%[{Z[2g\6&||oP+j5s3yFL(7uV'ظa:C-Gnwvbbl2\VB;v\tyԨ7Tvs DbQ\<)SYa!ҥ#S޽+n̷\L Ugv`*ao+mcӅ@EBKK31Umm yb }ȗV37T?ԙB u:# &Ο?ܷuuV{Z722~;0g42~Fun˅\|U \OOr>}OL"2!|oP+b^M3I :C>>̥țZ叜@gԙ;NuƦK|gUUe1|u6:SdS{{|LKPsW`YkԊQgR2/ڲ:&) uF@DE̬c^7C[W-,Logɨ&Lusn)~_mN3mls:N|zH77W??fg*JgR ZVYߠV4:3u~5cȞiwOcmAI!N3@:Wݺ@y⢒Jx硂-g3(N}z+DwvҪ7633^ai Ŝ9=|Xs)={v51y[]Wdjڞ4N ZSX:/0:Kܣ%+QQ JgR/. ߠV֤307gپ݅cmD&)# uF Ό9S|RG~(wpٹ4/qٍ;:ctϜ=L=jZ}oi٩9~|1nLR*ٹ?,)Q3@:ז' \ȧT+}NՃtyՕ,ԹeO|Pgu2xk&>iopࢭA^J6`p2Ү#i+i2щcRG|j;ypɬ_I0<u&iN0F H:? 
zyG tB=7KRGPs 913$L A QqjU9@K?D"Q5aAgz/^63:rF$ڿ2|z8kJ)1'ӓ>^o0u&r+GOͥeiYNǟk8vJΑ$gp+->nNj1 u:CuB|5BPg3յ]{s~)7GM 𢙙I{'z:׼z3qzfj}%L` ޾߶#Growr;!!!J$&&0K1w>eA̍--ͺtHgR`GY[?KΜ9r:kԹٹ]/q-Pg3:u&-}ɽfeeޒz*Թ/[edxQCUUh.0&s'u5_& 7l>zu~d`3@@u3;Izgj^m5[BB%9@ܠtȭ!>ޓԓ;tϞ]~\`7 Q}J`[LIIه66]\Ô+*ZZ w{s4ԚIf$ |Թ|$sc3Y{лs H˴Ғ !ƧZ\OO>}OL"P532~;03FL UU$TxG?>܆GZ.\>*̜9;!ϚX wXdbEVWP/{ u6unʭzzM ԙOHKYߚ _Ͷ\\z&$x7*_WoaYS*caQgZ稳^A$G{]Jf3Gz:7N-,Lo3gd44[NsMZNAm} oϞ]7n--El$zo}fbnnW~FNGkL :kmB`'NS^ֹ|:?=TrU50 u6~u~9sXXSP:gr[[iW_uϏa߿ʪ{̒JfcZ7k:{YGkׯ_K?9zj WjZɢs_ZV hU)W !k|Δw \IɓG~/+KeZҲs.[,ˉpr^,̆kaB֡V$uĤZ'Sn:g33̄otYsyT  xQx5i!ٰ3mֹs֤%R>`C|ܣ%+U;]Mv u0VGkOLVis@,wwgIޡXÑ@I :U}D,IMfus!Oi}6wҼyӄYk£Z7. gdc ZӢf7վCۑ#)Ga2UH%e/_>"#|a^,{zJ3_fCu̙n/"Sn:gQGImZPÒjU9d@\%2gł#05Ukגr*ϝ;5&fiݜ_:ؼ.@aiYX6*Z7Ӯd˖%i\M|j业eԟ.],<9EE,)9ʎl=-v9w$I)[l|qPs18Wd@UF>J> MjTŒam8HkhsffxSs : j7??fʔwIs˷Ejjm_Yk>6bkfICl4W Y })n+ـYpPsrt׽b: Kp:Qg wYTrsgfF̞!u[3vHO 樳TaBBHmm6/~CG^Dvԙ^5lun:DBY ׭*z^3uw⸳.TCE13?;Sܹ:MeĈƎF$Pc2׬|}NTm:[XߺbڥRu{/-ZWS_s:ϡ!vJ{j\gZȝ̗q̋5eʻZuf*W8ufg8:hK+VtмMW,]G 4ap=iwdTg"m9|@Ӟl^ Mc ̝;;mT$&nޠk5G_IHt.@ZI4I RM3"tn]Zد_OܹjiѺZ7k:SCzjl]+WȉξXZgu/iVNb#tH4ZfgTLǷOfT[Y%f%74Pu6&fSkg٫)u](wZW$#劶v-[WmŴ_z{FpK?߰-sh:oX> Al&06DĀPu6)?Dxǁ.6% `=uc܈~ѱ#bb`LެRn|unV[PKPu6nĝ7_6qb׮ep嶙m777޵pvJRt1!GxttthhhxxT*MOOW*Ƨζ]>WMQS(Tؽە*15m?xK>ʁØ|LryMWV.W-,L)@ Z?$]MZ֞0YfH[[?n;l;=P xraںՅ/[Hٕf1B5XY߽m1K%7nSڦ:Ynrp΅ի!D#דN3MZfqqq ԙK:Inx㿘˗;MЛWU ;uͪ-z!8=l&h>ZI@#=dwʤ읷vX`>+u~1$܇ÆM`_B̶H/*ߎ Oi ,ʅB9I;{n MG<ohfRgh\RΚ^ˎђñtr??=XhiiYDb< O.yU{l혘cW}.$]о yϳC={vz3vL/r[ظA<{ 5~.{Ac cla922;[>+**O{Ԝ͹hΝܖ&6!0#YA{@VڧSJ~5t_{{Pz{hi6jUyĵ<@ҌMn`3V1n%}<2q(mqyY ߧ3%9s~H7=ǎˣ3/S>lcyž$Xp6FD#ԙ':NiiRC:_ j}p*_vb4_[Lh?d11FBEERJX%vW @hDN&GVNՠ)CrS爈'7nիG3~y̙5b'Ԗ1aBovqh:YN ;vezAAڭ['_ W^޽ťζv_&ޣ,-F2RFP$$$gئm\7mD<Ř4/y޴#`T f:_Me:<)CrS窪kg>7633[GKgظ+l̝:F1gk6D#ԙ'TdjڞWB:]upx*0E16DWClo-QԸ*K7Fhr <<Zvn ֫ 6BK1&o7I޿tpіzB(:=gee;vL*FFFқQ+6D??*l߷HatЙB 5t7 3'a@L\?zُFeJ2///===!!މ)pmN/2tЙB 5t7CPf'yȬq'lT\QQQZZ ?ې\.J9m1"f>d3 :;a鬡sǘꌀ:?>l1 t^^BHOOO͍|#3C:/0Vi:#O?q u6rVTdJ47׽27#t^a uF@עp3LfwOuF ڔ:ǝF4sv/Ahk\hu@,>+.* @y:PdQNɤM d$%d'(!7 Q1 
4ӞGP(<N:D@U*rZa@E<ʥB^^^jjj\\ n :,CRC90 `=WaޜHRЙKm3;L-ϻ<{$j'23 7PgtP-ە8yPgy)sPuNKm}@A h7$wU0@A&&@%j-Je#Đ11J}{3h[t[x-*y YժvCum%H B!n] y Ò_ijͺ!OOyߩPgumx뽂b07dҮr*ч@A[F܉beO<2Elj⻿uPgVȘP5y ҄1ފiA2 uPg&V<7 J1}f3:6A0\@#阎á rw cINoC rLGF<G!?~kDT] "i_^<{C+=w-Hq]ȇ:Bm}iyXRv+(ðCgL[~ӵǣ֮EH ebڂÝGC'UuON)~ ndY4"GYŻHs:c`Fq'urJQ~NA .jɚn_-:c< Ї"iQr_OBAÍK6G8&-siiN:#w&@кW&H:+<C|}~^^RTT uD]xk"V[qEtRqwVVVQQ=C!q5B%hEskl0y.ѽe BH3I<ZkQǢ U) }vym6~5OHHg30'{4 2t\f{OP PӰ[>_.|xYYYYJ lk@mg"e'Ϗ mttcJKK0ri)лPtA2Z nJ~}Ζ%JKPRrtf\]rŴ_Oq6Sɫ:ĬӺ@[~~2])}nfֱwo]qyϞ崐_I e66/?nݞw/C ۴-<<\* 0rMR%K'4bp_dTSZh933=ۃ y $R7nǸOQȑTmfsL2q>:#Wl emT*30<=#֥,->YodL|Ň;v,0?p`5ׯ_hG+vbu63H jm)߳OUWgRLq`Z}܎=4 g΄_ (Fz8Z&f~)HP(JKKٹs[ |۔)RuiRy-gV g?~3Ĥ}#yҏ%fZ[RrOui *s;լ&׮%VH9J v#snmeG~43H@AeV:rL>ck1Õذamu\|ƱcQcfq ZRc6߿?66655 ?l(o"s{Ѣ/Ǐ0)3RdgY "NPVg[46oG:0E%wCP6,$ݶ-[,0ߴi1~v-cfSz?M3g… #'sb]:Tgg1Ν׎;L{k| @H7J y@f?,;*rTlv@{kMYG,|qG;;J&O]\,ӺζN 3mEwX݊D}{U 淆EPgtLut]#uFHL2{ua ^k/Ez?6܄uFCԇ  XjlTPV E3:#Z&:jv[ݎ @øw`7T3F:#Pg3002&\u:uF Pgupժra7&Olnnnjj믿*w^[[[Zꫯ={?uVU͙3 T,›K.ٌ֭3*++n*HLLL4P|Oooo:88Z_H}FFgL`7:#3 3^A_PuuuK,!cMDΝ;ƍ,pwwx¤I!ߨͿ+ڜSscן9s&6']vi1me*%% $|[C3Ʒ@ƐhqLIںtš;2JMMMD"z*S5:kLÇdZ{ОtޝmkZL[|Yݺu͛id[יfo7Je @ ,~΁kPe8\MP1ܜnΗ5<͹5'i1meٳSL!۷oRR@:{ČAAC0^uFRd:*_&'HZRsDaО:kmZdfɓQg54cuY/m.KF Ftjcl5uirJwwwvή۷u:ڵkǏ *_|;:fyhΦŴŗ%ALT[ 6n`3;~kŬԫW/[[۷4y=aܹ555͛73.hggG-4ΨF3_OkjjJ.v Mi/KTI~ Om3&@Ӡ@}i9횯yie[kmהOjȑ}Ĕ ì OBڊY#w/EX|6 ",}u <瓯*+FuEP8ztq ::kƨ30r2W[ y:==*79,hڧ2A.ՙ MM;0e䅒ȟNu2Hucꬵ'66/]ĔVQھ)I~zҡ>p 5ӫ3e>=㘘3k֞ ܎< kQI톪Voqq]-[拻&7[B^G]芋J`(ZqVcld̺ݻ'f=q@{k2 Q{L\^cFA$wCą{ m777B?̬c4Jk2蠾4Ys!pTT*@H7J yU52Ү9CݏjDI1ֳfRg*'$$( :Vr:VaIhA`4Lrz#[\ afK􎽌1sl%t x3As1sVVֱcǤRiddC0`D= {J,tY,Pgj$ ^w y=+ʼ􄄄Xf@+<<|?0dFo܆<tnJ)tҙK/ uIsY칢~NMMqqqR42/ݾy0>ܤ3S:[f:,f3h|W]<[ Ezzz*0d59<y0>ܤ3S u*v]uR0nVTyT*!r"C:7 T4CAPSz+y`w+KAG{ND@!f:# 7N  Z ׄ! L-:V&^Z*G*@!~%[[S<3hi~pTQyPgТ+(NquPg[8y`X ;yPgr0?|T0,2W  .ܹע;!'  
<鶐9uPg^֝8+ Tqm!n(s9ܧ)xXRr0@AK5OdL.؄ _9E[G2K2&RYP]P4/1QW<SMwyGH3@ (j~;Up;[4# 6n Ls/%d P%B3~'}<[ZQQQ]]8h:2;wO:gL\^tAg`g?r M'ޣr Fsl7#"zݜ?*JJ{pٯ,-yΪr %B Dm3kIVVVQQΠN 6c1hP<湑'( 3@AP;g kSvP[wtD-KH= u>i8Cܼ uևHށ=<(˳JeEE񿝒<Pg݄Ӟ~p ~Fbdz/4qg T()9:yhss3SR={53hg}ܧlYB͙W޶-VBo#"jP\,cV[K?-fwܽ{8 ڿ.$<<\*a8չ<}_ atSJrnnԠA}H[O#$\{B#&l:dRRȑS&Ê,ڱcQk)>1pԨ7tvܚ+T3g ^zو\644R "w/{\C_|K@-,XM,Zd]X4N99 fbժv-m2Css3c=s&&@A}a:qqq o| -ua>@'mSKWιMKeg!YA~̔ fZE#ԙ;ejUPgptȸ;u7:s6J%A@x6ҲG?u?x2kCXrnuu&ڶ:Gl;vhevBXc?{gEq)}EE&4"Q_Id*i*S^5RB44|!yQq.oh. !) T_;w/3;9̙廇\s]v%$$dfflÖ,y~k  l3R:r6`9ꬵzltߌ9W Pgu:~Mh{e666,eۻw[3C# ]<[ 㙯 `sYkؠS8m͚qšlI<5o-@M*~a?fq}bmsEGZܯE{gg鵦ǯ#{:9 nP>Smr樳QWgmm^:@ks5:5/p5@ƷlJZ9sp-uPg3Y,*+3No:@mVVjmkeo͸@Pg'%{+@@h##::3u:@_*  =;9l-Cm2:_xg@ ˠ~U:?"+DS::c3-LqA;3u; E;3uz;E#3uuڵ$5)ZLZA 11#4uIJ?[JBi :@@{un:LΣs;/\,0p[aa<ްdewgoܸL&{¢H2^g7J)]\رJ3ei遉GY[[Qe_L#P5- hkkCfܻ`k ڠMQW8rvt|O#T[n98t{zYU%"~JDn4SRJI#Z6)DXz+ OBM u}Lי\|Ď$ݙ+WI?5{ gaT(o$m R.ޔ}ĉTʍ܋\G3 o&BXb' (1-ń MAR26={:B6ђ%Y6?3^./-6ux3rDNYB˭HS{R7Uy PguQٮ8RK̶׮3ۿcgבM7J[leee UFs?o&B);d'ٝxSP&l攞Wݸ6҅~K!!׉3 g멳kt"XV7CiۻۤPg:*D%loWsXY=m34&ͳ0e^\M)O9|sVysDbmywj:o V-ތ18MXhJ<ڭnߤ՚ Pgu/joޢ??Y:;9=;X-P2qGD޿˸UCיu歶PPmϏg? EJ:;[D^ ަxBκ3יڇDH^˟@#::?Sǝ4OMllo*4kּke).VhTP2<鷌QwLI;Jo&B`RnAer,_ߑܱμf2Bլύ3*u6 AzY(!4}7{ڥ䭼PׂNYcuVgaQLzBM$ԤR"/pQ{cufΧ{wrBaRfTNhQ.dyo{fi١_?WA{g(+gg٦Ma$P5ȌIfX`*3#$d2xSPٳ'P$jYT7)3S=pK[y4Zo f)t|t!DBM*%Rj\?x34 [Eh`Li\/&cnغuٗqhʕT֛Loى 'ϑ*N@;ïa)Nc>jw_JW!}۷rش:k֢N#cB Hq?=]W3,dhĦMaOX[[5uY:̭a7i!ys\\\ttݻJ% LVEȞ/n@hڟ%wrTR{:ٳ')))'' o|uD+oÞQa{1~F˙ԙZ.TUUJB;Q_n|`f=#q`8 o(..4WShԹ%{˟6cUL`Jx3w-;B,((%:Jض}}'@'Xp|⼋>HB3ʾs#egFw&K >o?ƒ=7}fvɓ'A3@s]]]UUUqqqNNNrѳx̵+~O73Ľo&u픔zd]uj,v q{k.$nŧ+כWjՋq=@_ǵ@O3Ya2q|Hm/~>7+Lgݻi=&f:3Ug=?9Gl{qϘ鋍?f|ȮMFzq z@17@)ZiIʘL\`Oy'QGm.6.?,2yƿlĭWLIt#@=Lg3=  ]5 ''';;;8~ؗq-_H5y&Ȑx,17Cg{C}It#@EHJeq:8v"z@;xe(7>peRܧ^&u5:odcO}ozwЃi3:~)"w|z~xcm=7w?J$xv¥Lh5K w@:ABl-J R`nM]cRLe##Q\JrrͰF̹ FFY"(9Mެ_7.zh,@M2$/EO"mECq:P@܆>wjݮ74fx]  ܆K#h-]@pm<beG` Pgu6DVnKumίٙh.Q4_uPgn aZS֚j0<63:-|`@@]*5z~EF. 
}I<|$uPgCfz ěd7r)"6}Y-KKm2@ ›.4ѴTCq]xh4'*1:=g6goPB\KF0 ڀs0\G<&&9S Xv3:xCy[Za(y3:xv&l3/'gPgu߇$/v o+yFJ Q4p~sm\@՛ݧx%Cx69[\F[$CLl_3Is!7`!i1pTuPg΍ wK+qD[t`@ Pg3 Π)OIy .Em; 5X)4 E~YYY|W{ޞO>ݿl2;;;[[ۨ(fOmmmppCh^}vgggKKˁ9s___H9;::2zƍ'Z[[Szoo~͇ʒd#GLHOzyyy.]ڭЩ1… v\\A{zfҎIiMJvPV+(!2@= 1CFFm.Xٹj*__[n544̞=ݿb ZOK 0ad%oׯYdlzV:훓CPJCz3sLƏ;F6V>$$[m… &&_3m^&%W2=)nzlj .8Kkw Pr;;;GEE>rw:99]~c޵ctRRRL&cw҆#+l4y% 9Ammm|D_yxx0?#UQ^^u=5Tr\ڈ޽;SN{R &ůrTqc$t-3g+==?.ɻS&5wNNí =x`޽i#004›!况444'߿FF'KIal4 ΦO[?~ltolht!QpEYښ3Spekkk)yޏ0 ΦM^ꜩŸ޼+sX{r-l)uXf˘1cFÑׯ;991 f>4 dJ4\ufj/ٹS : ojgd¥:,-Ν{)G$X|ڴDl$Z L֜l Φb|zvYWd=Bz$;΄mnv u޹sgPPzРA~rkhNNhv^/eAn3Tu :p)"y&3&'gԪk|:_=tbRX˶2y'Ǹ8v'fzMdpٲeT3?SY([666x@DR-4s<|+˅ju6Ғ`yy9m9 GUKᡙfMd6((P/yJaeeg@M*Uyi_&:K痢+Mld7Ə?}uPg 1&KMC2Z͹{E!07&NhmmM iv?rHr-]tah&RV/7^BCC~guPg ƙͦjuUΣZD/4p߾}srr(1_lxY h1!!!-EO%M͛7;::GjyvF9:3"#/ꎙoNVD/40JokkUygggvp[=:uU4W J9&Oԩ:pNNí2$3sc賄qF@ـhj &A.ˮ]SS: LSSSx8f>g{8pի]\\ 4mΰs:s$bs;IGO[+1A"##IFI|O: looA@@f>jsbb"d?6S+++)" 3: Ye޵7oۉ-Lw&13ghaae9ӻwyf|&?))a|:@ ;)N>UB<4W{&vR'Nd3`GM$;ӟZnׯwtt43hTTL&csˢ.LY}=~W_}Ejϝ;$LͶU 3:+Vn3ϙ-ǝ@OCCC}}}2avdMOX*-zٲe3g"USVTTPYܗ/Ld2A6V+oJnkͶL铃and?ﺒѦ3ѱl*jgTg֧,l\&f۪s@S͹T2W`i~5KuPg'g'gv!VnAΣv @}|̻- "\B4: Φa7`V,(DP@hQR_ } Xv3:2Uwn:@ K\hh 5΄m>@ٔ9|IaCUrA ,Pgu6q~U'9xOQ\Js5M0@ٔ91) |l;.h:R($iTRxR5z>lp-n<8*p!{ @4iOqR7 /ǠxDyR@dLH|H13q2Y`@M{)N>rp՛O^3aͼ<7@9>\$/3pjZ \3: 50,e9fg؅<\!}Zei uPgS;N׻a3%{ ΦFcm=[Tmy~_相 Φƅt9+.j'~&l3( Pgu65~ohLsS\n n:on'>"#7@٤[k 0Qlj0]Θ˹9YMv5: Φ}GFenm~U'9xU_PYɸuPgT. <4xaq}ZE3:OOwz#{¢L4. 
&EnJOd3am3:UwR]|jk\LS:qyq5 /@S{Fk\H3:i~UB\`PsO6Z!<+[1Zhа @{~dHъ/l@Tf>&n[q$/"k|Pg9|u"#g30YuʖD\ݶ5O/uU4W?Pg9L?o &{>R:s[#'nrbRXHS: 1g63Lua\V=Ci3 I6&_N53 ,:#Wg3g=Ye, u7KSeަwK+ݧ3RLQОi u&u5~24&>f9ڵH^4yqig7i, u&$78c \ Gg,Z u6u~`ϛÞ9$D3jk3< v]!uƥs;>#[yaխ{6qPg`j|+ R%˼w\G^g5:CtI1X{:SSE0w*D\u+S{."hqu޶ٹeb61++iTtc{"#_l,,%%o@9۽|xiiOQq]m e155KhccIARR+*ʋ9O[L>gϾ$9;n#tq˖1"m{ҙ3NG׭n,bZo ۷ h X=C3 0dˤkdt#jn$:TT]_nxK0W7 2HJVκyB@R$%{8064]Y%Ch:)oܘOFD&:gmrdfv|sc!7j!b{c챴;ϋVCS瘘g30)u: ѩi~>7WGkZCٞZ2$K̶իvub;;+)Y!J9>*^sȗ_-BHe2+W'mPZάсj/|DΎL-ys^,vPR2v?KlTL~o\8ږ=[%%o$r:GGGݻW.] SlaeE9$'O mUʚ!Hur{[H<;jF爈ܖmuX\_cI_|щ6';(\(sLLLtt43lɓeeePgunyjoqRA9Ux8^O%.Ч:W/arƍBY?Ǝ*1V "d`tobT׃;Yz):OΎj[\oѹ+&poq9;:T 1}uM\,gufkW; Gȇbc}5Z~w֙wvU7'--ۻ?9`C%!!WݽTRth,5Eږ*ԟF +&po7Ӆ%Z 2pu(..:s˓RD,.sϷNxd Ud7,O%Pg M%:)'Ǡ{w{oz-?Տ_|@⨄܀ƋsYpH~+CKp:-:7e2QS(L<@Ug+ϲ@@?j e&yv:T4@A:ybz>PVNkPg3:uU$/\HS6Lo=ٌAKD1u 8tSsCk¡Pg`|}_\{V24~86u~|wr4?j.BPg3@ Ù+R^E L8w5zzTv=%%o`i~jlG.lü joŴi}Tw۶y;;wL x:??W_rt|9_M9b3w)C,6[ [7:DEy1㐝=cWudŮMu;w%m0U)fUE|Y&xDJ+ }~,Qe{w)HRܥXxZkѼg!Aԙ$hIGnKq3󟬻y#q͸ K?v-.AK? 
\Cw~Wm&/RzIwkkCCC=VH'˲X9n\)qPPAlb2!O3o[.V6mPY:Oicשu-KnjNƍtlDhr;ˡCa-Y2b]֛hDyuw!l%w_)'9ՊOS"ZYHFPg30#uNy-4J+F\2YahS1` r#[G}yyNNW3Ջl"ebiٞ\+We2f{Ȑ._=6:;wbQg޲v,+tK6+Ր*IxB"+飐tn,dEp?kg!A\(#H|g1=oK߿' 08l-Iژ9_dM5<޺oU_tVtOVb:=  uy=4J2gV@{GO8}_YY uvrMNT[J+F Ȟ#"FsM>H&Gծ]Pn:dEuV[$V"dGz5ԙLuh: :kFPg30u.J85o!FQ„@Qo{O}<Ǖ ֪PZ W ;^5 %T7l{"q=c%RJ='0dtEM#Tbsԙ=iVu/,\#3:Nl5gxL8 B-/~a8y$i4%{UGYYY߭[g G;EEyi ڳ:^=](=d'ǐΤ AAcclZE⭛6YS$$dJ[UVUHif.^B%6GZZҩZNJ-t^#3:|"#`WCY|$0.39 ]3:.GF#=::vE{69]s97ww~lӦ0u˳bgC@իvub;;+vn:Z5~r{.Wd6Pz2MIv?wґXsJ̭:Ȑ!]^ΝKF 3o&BFOSYj/٢AmmV:S/ j,쥧 @ɓHտd4>H~uPgE65iiõ:f~8DDGVJ2Cz52YDnnO̙#C$uzR ^*ÆںxH?QI#DϞOKoXV '쳎LyE' oPgunER]|UC{!Muf;qΝH''̛45P3o&u0UujW_,T7Qg*:9y#SDp%i3whnkKugz)LbOW^k~ Pgs˄գm[}6B@uS?ZRm_8[S)"$b UޡDDbL2, 3jq=D:kEBPݤԡlo;xƌ"+~YMxP 0~HBc5;όu~f/ڦݺu^~@}y6CsR{}J<=jt-Vn<JqgS\_Sh ],,yYGfⅠ>lRXȺug&=KLSӓ3Ͱ Պfiٞ)2ÆЉnjd 3:DMJiiTs(/?S&i,(wUOq!!(V:{ JO i~s@(v zPgՙ[EB=6(s O׾džRe#S]^9╣c|rg͙p染h3)'.?4[ȃKx?̯&|Vg|8m&ܘڽ3rE& .J(ZEO |7ktБiI>ї<8Dփ$=5d6;2B!lwpYjxqç$7yp$<3GxQONt 03@M'C5{ϯYr[O5##g=u:Ns}9׎3cҿf/7^第NNm5x@kGiÀu3:C[98RDxeV z@ [̱y%{:ǐp "ΙlBK½vc㎎@7gl{Ȑ.  ug&8z@1ca#Qm6ŠHElm/${M3::#&?l"{Þ3HS]2mhu:C uF ֤,s z&|Y&xD-zY[wl,g'~a6;!A{\]활mvvDt~~Ȁ"ӦQєrҙ3Q,{f}(o^&=3" ,ggg]Fze3a6]tܲe Stx׮v۷">F7}%y,{4OYuPgPgE5W7=E)믻 G> E@},+N/X0QCHjHBB19TTM9:XIuTPPPp*]9_!u=>t챴;ϋT^aMH&~$ݼm˽"u=)f)Ky;@@2͕+6m8:>Φ!$=LNDJ,ܭ[gFLM>H:Ҳ:;9^gݪYIP)ltB{,o}.^܉=η-ʋ4,ՍUF$BLՎ|>HY",t_I95{DSnuPg:3Abx j:)P k5ֆƳ:80"ĻwPe"r"Wmg4lTF(=Xjwg~~ФIn++MxjRI)YYY374Pq*# %&E[ ZV+)&:3`]z SέQ3ԙU_qz+冽c3 ~f/ڦݺu^~TSG2.^9Z2B0|ÏηI,Rguܶ͛>0{4,3fNDsʈdB;q`Gf!q:S36Sml,oܘBǪ6Y픥ԀyϰFH;9ʈdB~IWNtۓO5}4ΰ{_IigcՊP)Ky;eԙ\|Ď,ݙ+WI?5{ gaT(o$m .ޔ}ĉTʍԍdH3 o&BXbI' (1-ń MAR26={:B6ђ%Y6?3^.Z)6ې&RUҙR;PUɹ=5^gu[U3:3Թԙ#l;9=}Z:9vv~Sd%:;6n\VVvHPelmm4f"T.]R*SA&{IO7efNy [[nс..]跴29>~SnNe n"U-ڹcͭ֠Pg P&S(uF Pgu6;8Y3-y͚_{m8> lJƛ'226{N)i';7 ^D̘]ʍ;f.L;;Y|./T[jܘ1COMlcc}f6nBU4gӽU8qSUPmE`:3E07d^{c ~4ioMuqbaѾ ]MWMƛ'wns'@;wrvmBU̘Lm 2;BB&j)b T"#wjV;-m#Ս$f27lXʌ`؆l7AAlUHA933PRJ|<^g)[I|@ *͐5ɶ uPgse=c˗6r%UC[ͱ}jي&? 
u6+2 un+5}kCt̜^gu: rISɅaܻ~\%+bb:Νo1uS|0xռέaK/Y[1 g8WJlXYY`:C[IGK߇U0Bn]niSV:W"yu8}'=b|g2...::z݉J u::j;cX,AQӱC,sL3#q8~߼p5qqq$ͤ$rz**ޮof> E ϧuf0+B ́7:EA߿dɐqzH+VxRbj:~ʍ^gr᭳P5R믻f*ZkTUIꬹ7o; @q7+}`S{4@ L>~6/pQGld9&&;ZLVWW:s55l8;w|xի+2~ڠ~GE ure3^lgglwґu*W: UCA^J: =f*Zk:Ӓt4gP; @qkL!:U;iLԷ桯+!Z0|݆#4矝8UFMZ+;A&ibR6/`#WIz|"-_bu5 "tRY\)iѤ~Fz3hGbAQg θLuuRρf|3YrTg7kUg{3fe{P31R'I%&x9YDhgɴi}:LShuώ`#||\xq;d«ej/TXg: UYtԎT?^L&MrLhP;C,I vi '~ٚf8ˡ_|l oi{L{Y'3rN! R74:_?BӤwgNQQ^~yJe{wxl&0i&7{qknߙ'߽T|M͒A"89vPH[vr}֑Ұ3BPaCUk:: UYtԎ8ϲ;sذP;C|Ƀ&&&D teׇlGU3;`rgktӝO?=,A3Ĺ^9s5;ko2 u6Fq\Q\= θYTdggл{]@W z̛gkcFb;š; tOO= 7s<=?<+gUB3тdꝟ~Z(4 θ~\t% ELV_MN9t3г 2FcmCgQ\=d8L[anř-QQ^[[wP[  uPg肂L>ߵO6&PF'FN?mYv)ҬFEFb|h̻h嶆{Pg$ θ% tUUR,:n-᷽so e;sۛ$\j.&_ni%ԹEZ Pg3:3>/3}N`W)\I4e;:C<89l$i~$h^Ct$c .A3@q(Ƈ^ْG%>Oq!: u:#u i^Hn:F6LNs#n9,l(ƴi}TwhOpsv&mC%%o`i~QQ^Νh>3o[rϞ}MTw:kV:C<>7fxR$C#QXd%ڰWW7GE۵AACBwGp曉riB@ġBSqc>*"b41Сpח}{Z"A u1ŭ1ZLr ٸ&4Ug''۫WvvVvyn:Z5~ -}u_dSlYWd6j,T Pg`ʴh 3aF})JHq \չca?A{""F{abmA-`ejJ2;[kT4)WE+%vh}՝uT>{ gNrdyJ089yRmm(mO!'QgμA uL+ͭF^j R{%9x]\ݼ#||\KJޢ0×kj?P6O߽;p` 3cƌ:=Uw(x:V:C<x<ЕXstuU*Q*5z~߭"3#]YX<쳎'(t`f/CiiݺuΝtPuLA2:^2)#?2r~ \]rȓM>1)Θux3`TuX&9xm,hUk.ߚ,.JhPg Pg`pAd"b&ïrCgmVV00θ(Ob%Krz'e8.c&θZ}z"FCͽ?JucPg Pg`jИU{˽{(&ɭ"gHPg Pg`@* he'uFdzX[&l3 tei3ux30N[Px!onkzJQHܜք@ t3:-JÚmi.S盓էd@w0h1RAmXL>)Oϡi>ءs999))){ h"CRx3`;6 ܆hOqif$0 TY'%ݖ,((LJJ"  =2CQYY@qIyM8*!7`%p~NwO=? 
sd}3:8*qVn;-J<`@<'&]ْhDy+~IGS|T@w0h-~ohLSuPg}mU$'zBxa7k@<w0hy}ԼX?:|+ P#ц܀W@1qcRbQxj49>)x3`В4֓}UcJ.@0]u Z֪ۧK.<ЄYr):-5;}/ERMRi~@qA1>H::/K#OL @q YmY Qǎsrrj׮ڵm[-unj奤?泸h6<@<w0hH:zd<}qQ*Ngl%%%ёv:3kOMG:s~MD~ohM[VVV<&ת{SrL81&&Ɛՙ:,*@<w0kL\񬷡:תu#oNLLIHHPiT?2Ux3`#I^wyʵ~zGGG++ZW=>}zMM >**J&YXX@cmBlVC7ovvv۷ǿݻˁ;wNnܸ1qDkkk:~O͊888TWWjfTrn***h'o͗-[fgggkkK9 GrS__O׋vRÛ|.ă@q&S}AIiJgt5:H:O0!A_j[fϞ`6̙3BfJYKCNJoEFFq_ eIsrr(!Uy+pB2N67^d'լx+VS$lJP}(k:7ܣ{L<@<w0hhL}ߔΈYQb3)י풒Ll;99޽kggǦ'uY:m  (K^%- hkk+d|2{g}}o ra8y5/V}7 bQ)MNvQ;3Zי+t\nV^g{3PRzڧGcXEEEXX#;Z-"7M:<_}uf(_x3"EQ"[K]m{Ũsddv X+V0cZZZ?eGRVJ;]VVF;dW1"88xK.eSD^)On tXg$wՇ(慣L+V4b"g3@<807Йeij/LʕL]ԃMP_=钓HLLd:j\.O͔XQQ-E̅z__[n1OD!Yΰ!|H$ڙ;ZBx x3"CMBꍌ'WAFˎР|ŕwOx3"u9U zun,P:ݰJ M@G}dN9Й+҄VEn D諬+hM@4=eg95\qrJ m{χ'87I Ӱ-N勓r[e1-|L13$ͥϽvsO!^q &ʆ΢ N=e'g2E_k[:@Y|xB{9SD_~|m/}İ ~M]b;.>|<.#3/a@< CҪsv*-8Qӭ_s ڡeeeIR777cGLOOuww;r\"̙3ҥK!!!ٝQQQάgmxx8w}n???V+j߿[__/NlݺYMfl)dYs!vbg fΈ`76e d;`)))JrpHk֬1)s)p1&>t433tadz=]"Ͷ"((IbbJ%LOEϟ?_8}ǎ2NONN|0_̅؏/")@< ƙu~DPU֙{T*mkkci>=i)S*4SDb\Ϙ1dOǖƋc 3A-/]&9445n=*`2q/ĮrrʶxΈ`̀™ڇ.uշ/N2qSDej4 zxx0؁"ǟ>}:00vⲲƭ޸Hgzq/ĮW5K#@<oYgvΒJ.dn::[2LRzڧGӉ;w*Q˲Zbu㘹[O;؛+muF:_RUt oJ;F>d)7of.e*JXguScuFh566V@ )&gffR>WXSSSWXI---q#F/K#@. 
ykN5V)MfN>Fbb";VXҘ'N1Wj\.P碢"fICBB|c:3P((`vBq뙑A-#& n9:#]W 9v<}ODQQQh*\ٰu:#]'a.ۑ>,-Ͽ&WsssӺ9{Ըk7{A v]HN͋uMJkeuiT\.5RRꚿ Έ`m0ljE?"^PPע)x3"MJPz s@<r [gzN`SklnDRM^:#]aPFƄ<Ӕv +PgD3.KpyQ):̺(f7DSPgD3C'gD;и0~: C˫x3"ؙ,PM2_4)HGJd˿nFSPgDҸՍSXs##Έ`g癄6@<(ymK:#3/WYv0I%̢qڤ4uF;'3=@;ZXGx(ZJd3@<jxٗ\ݸK-۔Έ`'kukwl^MD2ѫR( @=WE;y9+SJ7:#MMi|Uxr'4IUz v*.[A;9gRUx3"yP)X 嵧Ţ/NB;PgD0>Ay΢ h Z@< ЇzsDB|6l)XluF;]ʊosrZɣ0:#9Xe,<4,.?0Έ`g6) Jj@St*cA v*ɳ̅l` ψngPG/:#^-0j@$Ii ?F;PgDcS0= ?`t<)T6@<<xP8s/x`O#҈ 7@<|Ux7Q)6) DR%b@JZs hG3OUY+@<SXvt<(-fƴiS¶5: B4lbi^Mp%]']1 v[pyՖch n,t鸽C@cPj CmKGl9:FtÅ3 :Pg0EQ}eYO9h z9===..Nӱںu7^Z2t%)iiilܝrDR__o]B L O4>>>99>:2Τ}t(&}||hݻl|'Tgtb셰MII a)%nذnn']f*#ڍ׬YZ҉~;""mse2@<s]μƗTh0.N<*x[T*ee2>44>TgoS"*˳H[ jbDǽTazʦҎJ"9 Fcbb(]%,-&d":CwXv 嵧rS*,$g \a ?#|p&5 Pf)Ȃc.PדNpppYYYXXc.͂ÄdsLu}xG8V%fVDob^ԙݻs{Dԙ_kD2Qf_ZMg}.]g-Xsٲe/A٠LlM6p"x3p zʪ'lE{9v m/'0E:3c +V0ǤQQQі3ƈ#i?|ff&3iիtOOϞxv6T*1L:;8==}{}z0KMlҙtEܱ|mnL&:hiUvx99keU_ZbiJ9--!>>kr -888??Itk׮H$t h<AwRntnPPи6CBBrM'ӥ:sgؠΰ>Fbb"{ׯ_ݻGHSYl +/ f0&d"x3p0 s; !^Y~SU//R%޺u&GپKvuFq;1Mxqg^(/ߚ4 3%h  I=eh[Y|맬7¨>j)2`x3"^XPGo|E ƥ&!hYzSX3x3"P)_}aXݝk7zk]d9[svA|ƞ]mQЗo+6:fٚ:@Pg6)`Mvrʫwo7`_es;@< L[J!8ns/]^~=s]mk=&[ǧGn|}y Q ˫LG&G}3=яTiHOOuwwΤAYYYRwWBΖdΜ9.]:x𠿿?= aW쌊tf9=l@n???VkR%bׯvzꠠ B'nݺYdl|sټywngdS5qY/x3"o=\ݸ;Kd|AՓ'Waxwa{']xٗ|·fi`5kְ'&>t433tadz=ɮ5ͶDIbbbZZ_SRRJ%[Ovi2!%MNNfwASlɃ0fӉutc&)Lix3" y<>mH) Ix}l=h>Y*J {=z4ٰubf~[[T*eL4Svn.>c ن>[P.sׯ6(*FcIYYYY߂L&c/jhhK`.~~~l),|ESh /)Lux3" [P0W7Q$4T=KtWWwmܧ|ǗSFpBf3ӧON\\;@BLl=룣Ij5{0tkV6Ȧ:C< uՍ=P?Y?9=7{Ǯ֨L&STzQ!!!;wT(qvTTTҹC,d") PgD_P&YU=bmas>mc7_r KU)f ccuFQű(,,Ç ן*ƌɦz*Jvq\\#dT(bŊNoiia< 3W4ߵ{zz|Ma,s@@oydu6,D㛱Mv9yg% Y!?b9--">>ml:j777\-P"f̆pu:]rrl=tDAP(<|s!3&5ad|מAG&28@< uN5@GsMOGon.ݻ'm%%`O#6#r--**Έ`8&Le硃:OV?Ǐ/--rJkkމ깯[snn.w~: vB:.\^|T_\^͝:nc8Hvk׮uuu9ns$1 3 tSr?Ǜ*x3"ؙ:nP(ݿ?{|m:·nJ=zq**jDLDK Tv>| @EG̬ch۷m5qQ}^#G /^Uێ :Wk7տEĉS"\͍+s##?~\3/7K~W'~ŝ>Թ^}RР}Ӌp5MɳHg4c=vwc&Tɚ:>ѣG4MWWމu2'S u:HxC4gY$gȃN\+V,pc3babs:c6Z[[N,P}"?>|Zp l"S6Pg0i1mPg;sy'ѣj ,F>|O>?w\ssމ}u勓PguF0Cٶy6G̴l0tz'j[:NZiٹMMM߾2;2{lSstr\"܊?e֭R,WjƕtׯvM㳲RqMfe&p\\뛖Ƶ Eꠠ 
JT(ħcmÛMOOȡc(ؠ2OnYLYs̹t?Sי:;;<<<(Y8pv Ϡ|Ɨ6@guuyO?_ƌx@Pgn.Ybٹ6l`>S>299ځ?|1r&OٱcRt^ϔk@JJ 38Fdd fAǯYfLfWvn&_>>>&/8\=A|roƘ*+33tadc47ζ;zbb"}y/p uPg5񣏶inUՑYt\.\(s&3PgGQ߀o*>h4}S:}c)HĸSd2Y{{;?44eT*mkkci\&E$|bd2+EP:[1ryVV}]ӨMrbd|roF֧,&oFJ1clCCC7ݺu’]^ެ6@Y/'Pg} }D^O\VV|S: <5ix쇷E ʬ]__M"Vi-[f٧nAΎC*rz*u櫉@UTTYLx :[TJb#DHHΝ; +b rrRxSsl׭4==}{}z01KEbŊNoiiaGLraGkZRɎ0 ԙ&Lv Jha~^x{{Æ͌uf`5LqŎ9PBJ7C. +JX_.HݻGH8$vwwm) ޟE%''Ǵ ԙ&OH`0Æɢi'00 € G7L1AQLqE_\\-EEE̘ .`Biu~K6y95/bu<6'GEEl@A-xSl^#r秃:#6WQ:)p>E?]Jzsd3ذMF:oIu&uoxSF7ذMٕ7Mtt0?3rO[\S C@h&-+3:y^iذMF>>>xfd};u:Qdhtί͈`0G5}`[ :cX V#GX)f떛u[;xuy:3:cuVɛ͝ u:CPg`M'g:c:;:ohjj87١ٳg?7`c6%++K*Ψ(D.ML߿*C!!!LZ rwwW(k,fGӭ_s ڡ¥\Bzz/ǞWc+zyy͘1Ԡ&׮] J|/[ne%YzVQt]w0%R#j+> .{egg3-3gΜK.}4@lù+;l@׭* ΅K1Ȥ=9ɱ,NJD50`d?44%9ܻ0 n_we:K4j-fP7nLo[g":C)u>h4 .`qhrv\u6H1gYs6*sNAS(ϯie4NyYߡㆨHu63Ts@ډc?ANK3/u:O|Χ%C⏏/\tg}V-[ai:ǼJb>Qӱr;uq{Ũ^7:߽{933Sd ub7[|,|{sYl:񉐐I_GGGPg'ܾ.Lֻ&yzzH?,)ܵ3{::NEE-pHx{n#ܾǕWs~w6֭_:KLGqqD_Fgor\]]p:_yd˟y~%s#KK4P8d)yhz hzʖoRstl:(D;wܼ=[bV5wYu/O#?~xdd$4==}{}z0K==={zz>Jٱ˱~1bJL,v̨VU*ܱbԙՎWsf3%u9siPs +Vtvv~KKpay:aL0Ga`2>qo.REDaa!%ćL:k[:O~#s9?|0%>~\3_'Y+FN&o&bo_Je8$äS>JI혫du'Ulr|u. OE_6ΝÊ Fbb";bR:2dz؏HJLΰAF* &!U(B C/r ^GF;E;6qo.ڊ+YO1c6Pg,tm_eѣj}c"k1Ó=tʒ7Mm5/g}:Mom\uf\>l= p]|0ΟrV()'&G?uF2R-Cm1: ss!%ZDn2Y:_[ `>]gUgg~2wy3"Q)_SVmMnЫ;ܿ|…|,zDw3f&0u W}Jzrơ蟑RC¥pL?I5#TI|l+5wAu4@ΓBr秃:#_RYN?g:dPRI6F(`]H~n/&;u&ocMK3_ƽ.|jNXؼ+E ŋnc^gnˈufA#oV&k.fkwߨsC-/Nq7>U|9BNȕwo,*/.J0Wsacodw%[G)on;LBLIIu~@Agsՙ^ǝ;e3m ҥ/V O[AE31H*d)(pe :3-fr3m{nl.HG[+oP:_[:ncKysC*ΆuSfe&Ii_z,u.-ܾ'K-*hp?N"kc NPG#%gfب y)buø׵{&f¸sP, fpzu>}uIkYbׂe[-4wC9rŋz$ZIa'MeZm^g:#8 uK#lՍ) s_lvtw+pQ'H?N|Ѣ"F堷aU5DB@Fm:_žiw_VN)oׇL3I33 tiik4^,===..NӱYYYRwdggrD2gΜK.| vTBZ ˣGNC/iw3a?F0[;.,wK'cӧ]FG`Ehe{{;&Jt23:(SDb\(Ϙ1d԰s-\><,O6 6钼WWٌ4+3IiŒBqnV)Z)-j:d2Ji7$$dΝ btttbԙƎERQQ>+YL)_UPgDvT-{8\ZZJsw}#[~{lkmAtrIbr?ڟw<+Lg#G(B ؊Bek222R;7;666V )23f==={zzĨsjj+:;;i̗N;du,Nz1Y _UUPgD[^{6lmdʕ+Ocǎ}'aؒoN  ! 
$ ' * -'fvdciii\ hc+YVl.**blXt |3lz* ^pp0wC5H$AAA(Wy:#Vʞ[[[]vŒ`9KrWHc†B‰B ΁/W+'(GPgD00dD7̋O#.ܹsEEEJ^?ti8(`(l(x(f * -'f6)9+Jsnn.w~: AP6)s@_vM\xq'Nz}J9    'flVk ]~Pr?#x3"Քsv-达>Rfף:ed- ACaCdprJ[I@`"۔yhҰ9e'a.x0::-^A:#]{ vqˀc?=9k%yrʢoݺk"x3"ij85/`WT %we2RUV,E?'s,:vΈ`WpzMN?0n`wC]cbbuF'gv`' :yjRo|͛7{yy͘1#++/i:=ǠzJavv\.H$s̹tiHH:LV+ :VMMM~42;2{lVOq`޿?S4U__oRmTOLW%|RC"!v!.!Nw.Ȗܷ~;""xx]J:%%ETIOYۋ!>]`Y \'''TlÆ Owww{xx0Y#V@\z5]>|,z`* fV=x:3"L98Ŀ{; 7EP~ڍqq|<\??6DT&%k{8[ߟhr\JVg[[1b'&૒@>u:C<Zt](_vek΀ ?0L 5H4C" Pu2vhR(݂× J|<_ǀ :#]"%_:šoSzۙ}R3KEEh\K~g z$^le?Fuӱ! @.GlveL{><tqn+ŧ3c޽L;ZVTrPk'u6[zzٳCagOO=F૒|br::#]lrܮIR{'עq\M[ٞ c$&&tV%IPP {>=q'`vH"6z6_D%Q vQҎY΁/5H uvq*_MU' &&&-- PgDWY5ɉ[ Q)ږu^`z=)lY+oDڮ̣0: o`m!7/$7kG,U tx3p`nmn>qTpk_quNuɹ 5t*/dSG.5) әPg|],Kru2{k/ڂqY·'t5h  еտ ᛀ쫚Ƴa>.KE@<86u2o+F;؊3/_`>kr 4px3pxn196X3OE>KD.{Uvp" w nϾl@;X|S;)s) g.=@ D-FQ"[>a]Ih.~M uLZsybH}h vN<8#a4S(FG b@uN uF;-勓&5szU h1W^:uF;-,!rֹ˫-E9=;38@< 6b0>p톘#ض~K6Zi/8PgDR(ǝ̋"{k1+аc|GΈ`&!qax \$` obduF;?jxzʪφ|4BW#4ыΈ`'gD7Ln%v*v51oWvsbib(@<*\^C*Hsm{߰cLcC_@<òYȽ嵧ŢݜM @.^gh 1ԙmނa>$fڴVQQ!&ng^xPgDk Y+͝ԼXl, .Ln ;'CΈ`Bɉg_kuJmRZݦLqww:t* uFs6.Cح8(SSB0:#]a Ρ ~<`z5juPPB8x uVoooJ_zVe7o5cƌ,&Eӭ_s ڡ}߿_.K$z&}xx8..rMKKc%3**Ã{.%J͡Ϗ[=TrFska%BBBvܩP(FGGTI|@<0c66*nΥ%IJs$fSSSWXI---cf1Y5w1;vY*J0b*gөtvLpll:RÇM^M*iCu6b~M@<좜 [Yt`~IJێZܩ(H Ȟ [pp0wR NOOO+HLLd$O%GrgؠD\-EEE̘ > ՙ/QE>K,@Ӝ1|X63ˣ™\Ǵ P7q_PgD뢻ӏy6j( ]kM勓}Eno霒ڤc/x3"إ95/c6LҥFFՍrܗe@<89LҔvnS590cڇhLG:#z'0m{{{; RUO@;PgD0xZM0P4:,Y2PkxJ"`:(ٸooN6f ;q sΠx3"[^R(Lq @dz)0K4Έ`0%::L: @V[٪`m fP)5IH*_v S*de|].]>c#9eqK [\)x3"~r& [M>ܼysddVU*|c+Vtvv~KKK\\qg޳gӣFFp鞞===B;̨Rgooo+_3/A_l&@ *?,_SV=aŝkMq|Ʃ钓HLLd~g:eeя=ݥ<#a$ҭ֜ [vΈ``!nΏ:z'qڤ41oɮ۔vΈ``9. 
3pbJg=c1I|qPgD0q;]JBNbtwOZ^pQ7O<HPgl<яK#\a} [ 7iDM0E0:# MJs1SAdp⧗:# жt~Ĺ/U[BMn)ĄLYVuFp6l3}_RMQ[/<1<(}5uFv o856g&Mhb*_S[n܍vΈ``HT sQܰcڦN}FtbTSVvΈ``3Ҏ9,uSM{M[{Ȗ uFF4‰'ߒ=X ;=.2s9uF nST뗵!W߽x Vbg;AmWYvΈ``c:zzC:r/fݘe+_MŒw v*v{RC|yՖId{C[q##~K6:#]Vk’{ĴՍO͋Z o`uF;Rk_4:uW}FFJtʗ`pΈ`0|yHy Ȧc'gpOMs \٠8vnz4 uFGqPu& 7{ ڇtS;VW/$~OY5c]xL ɖGq\u&tw/^6,Bמx6`z)vuzڜߔv̠uF[Ba uD3٩+W*}kPGe>*>bf~m @t*iZ\)'%/ ]yŶE?Zr'ɮq/>iv:d͟?gp/PgD0N͋uu& i"}Vk Ωυ2mK+Ս}_at9{L؄,@` o&{vg'ֲеʔ-K+~q+&止-e9|ܿ@`B!iV)=ex:Uxk_qENvٸs><uKܼ@`h');5\[^kQX^/'l,A+ r- r7XJ\ 6 9L* ^p梯jn.hIi*R@vaF%nmrI vaǚ.ʺaĂ/]^~=s'`SvĔ5uSuv[ǧGn|}yՆaWmqx ݝ™} w=b앖2?z*6l ߻԰.Y*R =z5PgD0ږ™WGnRmjv?[wR鮮.4x3"ؑ/ ՍئBg{uF_"[ά7.k7 zҽPʕ+ͰgPgD0Ii#9_"[b˼F{uF{m(-k:b g.SyU\-эjڴi+l"6'a[I @< Iu9XsBiymk MZY铮%?[y+W @< l'g|:srU Nno˶긍_:|`nnӧ]ՅgPgD0=Wm ̠k8Csw{q-8#8O?غ߸o˥3sѣiHs7o#{ǔN9{:Lzl&+&P:ctqQ~^#G /^܌gPgD0=U 깯;\dzy nw8o>q2/ȧ}z$YdԩРWshΝ22UƥK_8523"%OߙrrR)ʜD֤ eր (N| uy6mM=t3lڵk}}}xuFSjÍx>9k`Sͳ-;D |.\OR*!ko}Rzu}n2s0\)P,bʼnӨm:=zHtuu @< l3ƈn؁|><4n3SF7}wFnn+=q欼"P,0i9H1xsuFy˱F=r&x##VYy02_2?!BG)̟~TUu;,檳@quY||߬YSd34:k:JsuN{0q\uС_,}?h+W7)Ό S2Tv7JGg#WtsY8:YJS[}+1#B k 6s5q^{;fgT_~dX֔l_@(\*j;?_uh蟤 (&w^= ) 򣜜=dH6 TY!zᅡrd_9ݸ-jrdT3de yTyMolB}~bͮk&lHA+oܿC߾Oʕe͛Z3aBլ&[$W~)WZTdIqmnQg@<uCnh.ǦL u>xp/ f#$:+eQFYn`0"OO=zuVG"华)e҃JY ͪ_RRȽxҗ.%oo5`:3.@m[{?QլZ5kƍlYbo,7L?jFԙ ]•۳O{* >m/3Ṙ֨պܽ{W/g姲De ='Nʿ[έP zZ6]kL&ny[j6Xd0lun54޺ɚQg@<uCAZMْAl[g*9μx F+;V(ee011DZ)K֌:nr`ufu ƆMeF#u6YANLsmۖϙ3Qgkj&my:uPgz0_gY478W*gl5%bR"U[,U~%˕+{|}i,Fz9_f:+sBD$N}YU'.uV6!o1פfwufM׮|?QEδ6??Xسv /s6s5V oj3b:YIle[<2zo$ `#760~=o8Ȼ7꫉2(PLJ:8<@c1ڸq/x+ΝΚ.hv?otsNӟ@lδd&5]F}seIE0|߱s6s5۳A^~Y.^^#*w̐l&_kBעMFԙ ŗOG%m-un{puR<-+F4K?3 :ӃiNt{hL؀2]ͫt _,Zx͛G_?ln:י uPgz0#y󵇁缀HW[] W^uu}A`~svvѵzGԔQQQqqq%%%|X3=/HfSQN.(6G \E?3&zw޽{E[VV' :Ӄ./;w?E%Ǟ |F#׾ 94:@'%%WVVI`N.Xr{3mo\+s8 fqʁs[3cAt`sc3*ꉑQN氇H9%%%??_t`>ԙ vDxnk~gwک푱Cɳqq%1'~D]EoL~+<87ެN8~tZԙ v2l;+oTSŹ A[#G}%N ͸^Yfqee~JwNː3 :Ӄ) Gsg (Om9))I|%-爥.7jzJ, r{.I)jˇ{#Yl%9f@<u>xSLnBǏEDD޽;>Av<CJΉ73=$u⣹śO^[Q칤$???333111&&F 3x&:rI'(RtN@5u2T)vL))++SӓbzlIǓ'Pln)oPgz0t & )qO t~~~nnnfffzw'%42eO:ICԙ ]:Fu 
vE+++E\JJJ5+(LbIǓ43=*uW]Zp<^x*p3yPg[Rr{^@PoW6z96/&L:P@<uCnDW#A ۜ+ T_YM*]dgr|RsDߓԙ M,C MiҠi"SI :#b]w:r:OS]ɷ 3@G7^x- "ͩ#fri :tuW];2ӫws)2IJI :t/֑7'+NWRqO.]?q+H}e.H x шuؔ;OJV#l. PgNCԹnp&WuL @@<u4*2:z3ky+85.y|?A@<u⎹^Ž+IxK~|x WVW{7t|Ҡi$E.80`R.T3]pfmX;`CuWHol?lPPg{Ak> 6ͥx `_\ܓ:bf{<./ 1T+2T4hZG>Pg@֒3%V={!ngH :#CVlMG-# [f @<u_N.xMnbߓ/1 1J~g}V><5@+N_N#Tא:bfVHXxc{i+WF@싌E6n͸ǴC֧\ig " :L 7 O0I12Jz˓s644999/((H_.]ڧOՌ3jkkUܼy!C9k׮KwwӧO+͙3C /XފE`MKmd Yyix ɜ\u6ߪA=9wo&}%KL8!RP񩪪jll7oӧOvڽ{BBBD_j/Kbjn+jʖۄ-4Vv@<u#Kx|RdjMz䥁.../_V/^TU]~޽{܊tttTFjS5)b:[s33]SNt흘oVc7}b:Qg->׷,or+DwB QgPg&gJ`3iд'd^M?0`|y0[ _}"kRWkaVl6aSʵ33s<8>#:/ ڰɊXu}*֭0a•+W|9?? :ZVIܫWk׮ۊ>oi:ۄ>!!!b"3fP1h :۰˱} !ҩj"a7oCΝ+[Q˒f[Z6aNrwwܶnݪcr@<u76% fh;KBhPgz0D*2tZ^/:bCW($w{e$ :Ӄqw(j|S{BE|η=.x `wMj+Ot[>iOIt6wK Pgz0t[oKewbe;FʥO\xLIꈙv$ؤmx騷#辨33=z(E /=zkri xLHQߕm"#/y2Egֆ{6 `A490`MZ{򥺫U,?ʍg.]ڧO''3fʒsFDc&e6lׯWWWgRشiFqpp0W7"oR]ԙ =[9{ᗡۥ4񩪪jll7o,w,ѣGf<[’%K7Td=X[u@<uCO&y.soUɉ2o(m˗۷{ZJ~JY5cu/j4cuQnrzPgPgz0t+4^mlNm˝u1Ϩ-?ϲ$$$De:::5xL&qچҹ祁꠲>OvssKII/^TQgk*gԙ `8mߪnpki]uM0ʕ+R>w `Bس{CC7\ssuVA@PmT-lu e4Y]: 7 ϋ7_?q>'j#Kn*K3 R!K&+6펎Rw@<u/Cu5Pg3@`QQߕxc.QouN 3=sPc[1![%@@uty{:wΔ[|i}4@Vgon=::N ݀C `X{G2E絛{z>VnM. 
Pgz0XDStE olP(# `soUK](8>\$̥xLh+]tֿVIvŽi$@<u7>-:b]=:(AU{ `c~_w] qԙ WV'hrLgy/<`Bys9[ip#`#gJ6dͳACh@mEۆ6ti:7 >0 x QQw`;qO,s˗v^ܓ!Pgz0@qQߕ MEjs6uS2y>z' @x \X1!\M2]i~}13=Nx5Q^65ɢT|IqͭhZv3=|˔r|uoV诖3%(8򛇓R])+O :ZPgz0p)c˱r([%'lTG/Ef xK U^Rꛉv$ptԙ `_Ԝ-Qm1/3rN_&zX{'ۥN,[<1{@vAEF^BuZacr&Mu/Vn'2  ߓIټ=ҒXQRQo55pԙ `ol* T&,/?c埮2io/E*sm<y!QDWϗn m37PgzpFE!tX frDv@<uwnV]"'Ff>̻/ń^;Ek\4tOmuy/övaE￟;}'\ :bfEF/nŵc}lj4\Oіhs:Folsԙ8QC'Ux†qs<~.xLT=r h( 8h+++C@]WqO{8DE/))Ğ@]ӫ?Cv5#̆ ;rnl~bԙܕwQy8 At̼8qQ'$bĥ9^q8M#Pg@:4ȓuP*~!šC}Sѣr~W7ndk4Ϩ{RR]eeKgg'Tg ۲BBN:JAQ={斕uԹĴۉ+wV3PGDv՗+e'Րdã&KU6xmYh!Ѯb:g|3Ψ3:S׻wOXs^¶PRv^\\_>z:744999/((H_.]ڧOՌ3jkkUܼy!C9k׮KwwӧO+͙3C /XފE`MKmd YyisQ0acmQ^_ r۲Bu@udɒ'>>UUUSUr׮]w^HH_|QYmvZyiRLmp:[sʎ:w ,?/KOFDv"u?KjZ,Q~GAۖ3 s`˗/+/.}v޽UTnER ^:::*eF#ՊךPs[RiMXBԹDhG[,SQؗ_we "uVoN'K䧔=ߜܶ,@Pg@;[0E},oY4U5^VZ6:3͌:W֋KF_Qg[:ۄ[߻wO)֢:34׹!>>> [n„ W\sZ$jZu&q^]on+lnZχ=pϘ1CǠ3A:CS熆GG>}aCS:88 >\lHX͛sV٭c𖖪MkSee77[u&PgY vL3 3=Pg@Pg@: `mzCNDt9p x{pgK?"S"o xzpݷۨsف#r*2谐Qg[Q_YwyԙQv\gέrև<::@3T%}<lKiƌdܹ R%nذ_~_]]JaӦM\<|]JUvA @:{r@MoTUU566ϛ7O޻w?r||ѣU3V-Haɒ%oloԃ:: 7^<lWW˗/+۷o[)W_jj/^h4,l...(uEt:%;mV-4W6LK'mdy/Nu6Fh!%!!!+Xx5&*Ӯ[lc3 ֐]s<lvYTnnn)))ŋ֏:[S9]t`:Ψ3@[iN4<luM0ʕ+R>w +egwwo>׹!:7[7ӽչ߯{ᄏ@ )Geiɓ8;;9:>5"@Gc 77> X\T߻fM@^βĐkSSo_7}|W9c͛GwuΜ䥅m&^{;fgTY~K֔o uo.d%B&ۣII 6@Vj]v|u 0O`Meۚ0W-.X WɚJ?^ntoc#uPfY:88 >\pu[Ō;fŧ-ʿyx yTﰱdĉq#[B ~ު 򣜜=!o#1!1 YG\Z?aC ïo???VVXwRlyӕϔj奕f&7dǥ1RaR*Ϙdx+3xs&3l. 
7|oTWhi= d\ sl?IAYT[)޽zVvoPu<l$vHuvqKJ}Q0bO=؇AYs}1ZF޺ۻJYyFڬ6^lnflܸh˖+VVD?c\2oAMf\o=y%eemi@G ˶5{a~ >xp/ V-=y).r+7 .80`&R3Tf:<''l8;;G-oͼU5YJܺjBmb Æ PoFnMQgkvʚ$l~DOxR6o@~E,Bպܽ{W/g姲DCpou^> ά ;<l3I:;V: &&RHY~DWgٺ:Z?4gwH)#q+GMnsĎ+m[>gTG&_KQg1;w4{8^Xu6`[`q 6i^j=  <:b:Vɘl}UgsI0W,T4:ˡo]7xq sk|A6KY~Roٲ&gDDZСdy^~4a Q77N7n\{ꬨV" >嘆J7o_s )Q1h=l=!!X5Ri$JN}7x+3߬:K͵G?d|g6 -= w0B̶5{a~s t)YQ Oﶨs1oQgV1LMuRd8g#)}A\ܓts6y:M} sU{7^Q'DDZGS]NsR3tWh„MsTTThhhxxx\\\fffII GP彐< }BӞ.c6~q~Q{:ٳ'>>>77#f;k Iu~Ѿ޿H:, ɛ\T(fQg褤J>%B42i՟_ƛśwG(?["˔]k @?1N/\ w~¿vǛś"唔29 @ժWBR3555EGxD9x@gxyK1^;EQW`Hs uެN8~xII C͒3%䂍Pg̈ry8TFD=gE/WlWV7:K911177W89r;>>...zEo%v5r$2,g_x3e76emPgzvCspΑF9fiko>Jl ]O+++~III1 tfޞɃ#g;r!{s£+f@fN_v)2<@`KӶuT)<@n4Yٯ/fPg=+K*)O4`Qu,7 >g"H^^Tf]+]/Ǧ @2Twoy˙a:Ou~Pgol:Qo]":@s%pB]e棾+ Ψ3@Qws6yt@n3 :F\`Qȁl:t(UG SG$%Qx>yG}WWV @QgVINC)J*uFPgTzKB dPgӸ[}sߓ/O76)E\3 `3|:sdPgqO+76^S>Ψ3}0}쇪L~}1̀:vuWr|Ja3 `y̪. Љ(34;:!< :{ 9~~u~u@LP] A#f)3 5^X&БԞ+=J2T @Qg.?]:UB4^/v& u@ Q>6,︼ h3 8`٠k?V:c[Z@S]CQpH|WF:]•%Pg=tfmXwIΨ3@lP4Eu7EəsuF _BQg[X{xSL|0 3 Э3%М&K>C777GGGwwSN+/]O>NNN3f̨5nΜ9"yi ~~~RC~LJjrradVk׮fdɒ޽{?S6mj&w\\2ydgggY뫯Rmi4OOϘ1...&=UR~*ps+w2o3 ;9ѨiUUս{6l ߞo,oll7oqU/uj2qDyi \Y`R훖&QUu+涾b ZY.+7 fUC ͕eZϛoqVVs|[r䂍|IԹ:^ôbJ}wUi4/*e)`...j%Iuvss۴iSy?]onREkar׬iSO… "t9fTfN_-,MBuF3OI1zu6٪T7կ N:u-od 6V=FSRR,???u@ZڱOG%|~k\S_ uFzJuZ6:Fu*YYYR孛u +[%tRT[pww_jVw ʓMMo'h: +3^z]vYu^n݄ \"sΩQ' \Y:K͊ :֕bsm}}}-s\\, AWhq5eJ)@Qg3.88ɚ9ZaP7oCΝտÆX;lH̓u Nذu1c;lXn_yf Tfdy/<0`٠7IΨ3 ɓtE+#:b旡:frfߓ/5!i:Bev1U qf@Qg:ϭr`s>3R]a ;C3Ψ3`Hyrn]fu|Η=|z&})23:@3ȟ̊o?8z殘t96=clHsJ V彰L>]/ JtΝLN|n:@8P ЍZUy93ou#? 
jweS?C777GGGwwSN}nΜ9"y) xoO( K}Y:-vʕɓ';;;=, '4OYi&Fࠟ)l޼Y!C9rDѿeO>fmjk)//wqqB1#s&, 9[BNPgh'l?ܷo_A33f̨w /|{.^ǧ!'NTt=|ſ/gyly51\\cc%Kn[xf+V>[曪9}k׮vCBBĹ_jڦVs=hk8{aҠi7Wf01uPr2mڴIւCJY\xQ)KA\S :NuԩS{=pdjrE.]]]0|𴴴QFIYoqѣG;;;+2PgY;^66KOV%[ɗyξ#nMr:@P)&/ Ȝ}feeCH?: '|p㯽qVtFOcms[ԹٶZJ޻wm~cӕb̉ޙYƳPghw __S]:zu^d2G,lذgݺug4C~kzvZ,ͫ}\綨smSaaatylL,&}gt90Ʀ'Μ |`;Ψ3=q\!eɹʭKb3 :WV_ؑ16 1Y EԞ+%-3 `]JxF`.\srҝN[Ψ3'|K}eq'\(Z1dol;zgꈙORQs{%n vV3%0o]K;: VI9yhMu eE!L{zL닿 q:gֆ򜝠t[7W-$uF9#fT P.Scuq]GUR~)25w2'ycPg3%rl:yPZeN,9iд)s<1Pg9JD<@Olr.(Wey/<6LtΨ3\ؑ@L JF+gJ٠'0 :`ϗo+\<@Oj 6>})<Pgf1J/I:1{]7]sG2 33k f|aGBeI%h\3(];sŘW_ܓt 3tV,e騷=RꈙyAcӹ2^'fOik:Zx6(GtY`QatPgh/jΖ劌CsQ!_b4hxsu E鲕revA,}. $EY(L Q{z|+\: 9׊7|^:xY.KHt.E[%YC.8uF:BN2]Ȓ~Fr]Yr^@P؀fy/<Q2Ψ3Qqn_Htv,џKq6G<9D=t+/Ψ3@ }cy)lO9ΝLD94Yʔ庫UPgQ~LԿ4/_ܓDZu9.yyAcΨ3@~cS\_(,q Zgo;`cꈙяL\uśbjΖЗuF++S=5HXjՕg"L󘕠ҿFcR3 Н[}3:;2 }nV', ^x`$~16@^D@Qg]/#lϗ. iAjkQxrl;`_':~L^@N%.L:`cK/~Z_Y]]?YY9~~ ;Ψ3騷W>12gJw7\yPa8nIh(^)c%9gJw2y|6PkiNn?Nx);ꝲ:Ѕ9`cc?KtnKݛG?I]:>R}Og-Oε aْVAE!yAP43t/z닥3,@7S>6E~=޸J??kkT|[Δo>[㪎2 :C$EμEt|SviEꈙϼ;-Ȯ3$ ܬ"i4M12 = 6D;?6RL5江]8hY!|zN̾@Q] "}\;%x K?^|o9YhE?NH4-Qʈ,APgQ2!ᄂ1N#;EkQ PgLPuP:-aT6Gy";>\_o˽^JtUev 3 }b$]"o>YIQz}h<=Wʇ:!:rƥ:X΍c~ sWTTTJJJ~~~&fl\v„N GMJ[vf1VmQgdPE9%%%|&Ψ3̈́&^=e5v9z冰ݻwĤ:΀:7?aaf'>RMTQ=<ϲeuW-_ҥd&l3 s;^&XZzpΜm۶̍ omT}_7pֲ:Oě7(`2h*.DPgPV?vpxTrÆ P~Aj=ߜۨBnٲ1 PQ>}NN"#oEH%JUܜu@Qg@s|ž ԙuԹƬY>ʭě=^nرMhey}1?_99=ޯ}wI*-=8ygg'yHef~BD h488T/ٗfפ;p&쮙f2{3:Cǩ,^ǖ7_K[Y F[ߪK`8VȱT%6lgw\6Lfu@Qg4uh))D)KAלϙ\;ȍXEdA%ꨡHcjՁf/}^&ҥdmR\$X="ea6_lQuJyiXߪVZ>|}-&\S8sl: :.TѿY1c1'vʚGwY,Q{]99aGI2²~365Y6Q Kb>청Ze_ZV'v u@QghV?;G,:\+c uBF?x),W$&KY~/]es5Q;vlo;V}摶eQgsk\+B6Z=|n{9/3:C i3eS&޸-"ka5m[ާwŞ,?޶lYj:+scZg^WH U)ses;pi=6a.{1#F3 - / (wo5awu޼NNK̝;M6L :~>)OfJ51Qw]`IICG4|hNR^Tmxպ]*(vwT._3:7n\$k6 ean5܁3N 1 sٳ&t?z-jhuFT-pn{uԞm{r~}cQsu@Qgh1Ot}#)^ϼytusÆ (+Kl3D3:b/ :b* }=gg>B?+$m[3mrl~L[#j|樨𸸸̒> Pg䂍 q=&q}2.1ѻ#D+gϞܲ2> Pg"Ǟn _I'FF}_riuNJJϯsuF*n\+sv; Z7S =5kjj@Qg_8ܖ XhsNaeo|gT#{arJJJ~~t~9@Qg푱CHqz۸ṘEY?:5$˲&{ ҢkHfHBD]EoL~+<87ެN8~t{PgZF}}}eeeqqqnnnBȱQ{p f\E߬H2YYʉեKguFM$ޖ su{b} 
X0?=9$WWC;|7R_pY"o@Qg=?~<%%%...""bݡ'̘;W3gyįS_T9yTضa~47QtcҥcKƛPglf%%%111ʸ]xxnYn&DriԿtϲN+]W:tcҥcKƛPglf555eeeszzzRRR|||\\\I"L*}fLz WzzbdP2H+X2,[7 :^nDͱ?JxuΧH %=*__K=R'V*yJו4Ψ3t@WVVvte _?ad_1ңN(=rVlN>&z3`:Cw> F -xs.TA:CFyk;Jl '5gKěo EV!uW t~k ix" ݐMc {%":C7$/ HP|!3t7.H?o!ٯ/.CPgV(<`ۥV45l wQG*lȗ)un1UW$%bu4YLq-wo&z75  K+tZ%¶\L͝<}?[@cx9 ݇+2syq{%Ӈ)YYYR_~<;88X7dlzʂ138hРݻwKUw2ljNSL?U :q-M;<=UX]~So޼L!"sHoF֫c9$F]V֭[s9[5:V>F &TVVJ{/^,ZR՛ Dg3\i'm#s|_;m5/cƌ9zSO=s+1Z~Y٪fewP-diKuNj)$:љ ӫ K野X]zҖcǎ >\22[~!C$F+̜9j3f8>3` n֞jDg3/FܶױѰCquo-蜜77788ضr[5`CCäIy[|0Kmh"g :q5Vd‰ޣҙM׺_KLtX.|1厊Dk;G{H^yBvGQ?j(Z.\*liZwl;! :=ܥGJgDg9zכwРAEEEOHHk sG9:3oNNN``RRRm=V:?##Cѱ.m1ި_"= :)5A7<5_B;aʔ)=勏z4qr>j[`l=C/]h4*͵?i} 4hݫVv];vL$j]6 Lt5l6k(_۞.zkP'NvsfQuBB~ĉjF &TVVʾ/^,[rĈf&T9Dg3Ύ tl26VtxF3geY<6ki^ymtVꥺk-Iv3R]X<~hkYwt/Yv_؆]v'Tt3rrA" Z_7zΨh4jo᮳ݗz3Zmw׹=ѹIU7 LtF{5\J?_Wgm7Z'՟ 8oq3h-N29Dg3:@꽿褻jyQ]Xډ44:u:W/:;?C6NLLTfልuks :iAљ茎?_zD:=u9Dg3:̎^(-ߤѹ0gZt&%;ѹg}wͮ;cAљ茎T/\>z:ܞLt֎7jP4LtΤdM?Լi%2 DgswW]Xȗ=@Dg8&3]Gg3Ej Dg3Za1%ٍ;LLz љ茿Z\{BW@t&:ż-􃪦ch΃tDg3AҴ/\)+ :aܦo~nAW@t&:ÎSc']EW@t&:CWFg$JnhΧUdv×?rw% :ѲӓDnLtS*s?Z7 3<l) :jo->>'le>|]Ysi,SW234{ nc1?Z=g%o1۔ LtFgin#kܨ36OQIϾvr љU4w'mtKP_˅z x?Iɮ)9_yLt5ɪvR,wKkO-7+9C^,nْ/s_?@t&:壧KZ3E)J KGљljy٩b/M[]ك˅mݢteS7'\h0D]D~Ν2w_Fg$4vߺuBiV|,XqpZ"@t&:>cצ)...++#=@t.n55g}jottǵaY[HQWYz}?fygEWL7o [5c]B^ڰaCvvvqqbDgOh sJnL^l6o߾=*9w_e<ܿ CزeС >0codcH@qʷ+V ˗Ԧ8l@yK5~UUyR';?@>Fm>~D6X$h|47N˅nt ϛ'Oek׮MOO/++3DgU]Xsk 3ɻ8eСQQmےYx17<|ԬYn]"yϞdY1iR~ƹ}IS5Jn޷/EvOWjl'vlHիZ{R޽k}H'/̽N\67%:dddUTT3ٵʌL }`7SQvoTwMt>|Nmlg6o^0f/dA6e7Ω$''Y&---//߀ή[.^UQ<Cl5c۶$tC/dY;`C/:ހ 6Eg)?=~|Qc :ݣHZmn> EA?WG*~|UO~M|(4Um2ɚ5ASگ 3ea:K$ 3U3Dg3ѹʶmIF=u!2r;)|βӼyCOٝNeS5k׬^{^j~8C}k޵CSltѹq_;3љLSn]2y87m<3љE*oЇ**v :GR#_[$:@t&:)U 3љL!:@t&:)DgDNTU((VlnGsn8P\RcO.JiPZU3Dn:@t&:)DgDg3 љLt :&:@=&]ǛQ:N :N8О돗 љT3љ ѹGڵ?O2gO) /[, +V 0z{44UYظ/&y>_{n+/9 }*nF? D MZ$h|4ixΜW;>rSm}r V&$^zJ -n.kujmUv{ љTtJZ/(%" yLt d)S4)5햪$D y|Ϟde{svԣ6I6SO8VmooR $+C>x Syy8ȔfMp,= Wm|^wDg=ItLtnKt6X>WegzR❷jU[-0n<{6GmT ֶG"0j+=L~,OBjZcZ<4\]z۶l'vT!_ܱ^/iY.۠[\-{$ɱzw##GK=RƏnGʷAR:-:v!!Ӫ2Yjӎu˰a? 
^ mgOsGoz=d GnHt]e y|yڟ& I:FgjҤpo{%rqEYIDATDΛuO@qٲzY>.Hbc#l謷S۽oݺdsf>8Cv  PK0.|K:nvדΔkisߤ :wg7QWwOgi3>>>44ovԨQiiijK.\W__mqQxƌ}R~MLLLYDgtQ{,$$$'NNm<0aBee͛7/^,TrĈfdeSS)նڂ!C-:uYZx/]$[)Sڃ6n@t&:;Fs)gΜ1zV()ꥷ.d}^Vl6e믿~N^ڻwoeO{v+mo-LtFwڗjZ7ˢe:j۷8Pbbb.]y-֩]ۻd@t&:;FgYYYMMM,?;L7o:-֩ݻ:ZɻmwDg3{tjs{s>}rsseA"ltt蜞.+.X0wܰSQXg`au22G>>.6}6Lo߾jF:AYKi ͫv[лw^zI מׯlg@t&:h}zo~ҥظ8u'jZg|||DJiL0A͛7/^,OrĈjv'H>%-fȶMvkITk7|kPɎV/FRӒK/}mةS7ϟ?;Խzj' :Y^M=2Jv{KU9_:EEE!!!>>>z-76Ȑ9iҤVd!/ذ{Dj|kODg3:+:uV:tڪw;#:K dY~:Hz +((9r,Ou>a„Ruvz3Lt]wh:ϝ;7,,Lp)/ѷRUxxvsgD>}ʂ$hSabС~1c5N N锱HXg堔'Et蜘L}6%R 644:uZU'E眜@i^@@@RR0!YVh́755~KǏD޳gO 7^=/9_ra@t&:36hAt :F.~ p :D%p=3Q Dg3DgJzDgљ@t&*J :At :\LT :D%p=3Q \LT 3zDg=aqp=X @tFu"q]EVj6ELtmW˫([fu=uu,_\љ茿) {POR@t&: K2Pҳ,Yvsљ=`|=z\S?I3nܑ="r=O)t :aVSsƃO+6l3Z-LtI#t :aߍ2`C@tX]SSrjAJK7zt]!ǯW]vҷ ~QKf[^ua)( :wU{gM?xWS[x6mufJcs U7'\h0Dx%u*se}FB]n[(/M׊Oe+NKdљg9"Qܮܨ[<%Lymmmcc#=Dgsǫ)9~) Qܷ\5$WhљgOd/E>ZIIb&=DsOw'uQ<\ҟ/ :;gD}jotyI1egm-**"=DqvviduQ:y=zx}rao9/33 :;FWL7o JvѹZqB^ڰaCvvvqqb)Ltn;FkpSKnL^l6o߾3ѹWI {ߍߊ03֬yW֒% F^^(klY4tC3fƙJ˷EF1x{xUU^k+oeJ3[.[6= (o U)):ePЀӧ>z#$`qذtO}yry'/\v2n<D6.,u5$IXj֬-V"tϞ7nOHy7U%J>V3y՗O>O`&Hn޷/EvO>wZV)(9%%%##_3ѹDgtFF\( O-ʻHL ʨG͖ڜRejp#5`ZY%:c6, RDg3"d7}Vu:_9բsk֬IKK+++W*Lt.!!jd%ڛ׋*og%k[XtPխqg>BtLtvܿ@ii}6io2/c>|@Ja᪈'Zd)KX,6P/:;^?U&iUtvPߏ]۪ :8:KujcIO>OʈdYض-J$YO-Tf+z)N>v9RSߗ`rwJtLtvOn"!:@t&:DgDg7 C2pLN-G 3љB!:@t&:S(DgDg љL :w\U(-j| oGs3ѹ\t%5& :;@“v)+ras@t&:w:S љLt3љL :)3DgGnZ9ť@t&: {Ȥކ^ ]DgљLt&:{lt3Dg7ˆ g0g2\~ƌ#YZW[]ӻ{dIڵ?O2gO) /bŬAAJKSbb}}7u5WocCCS}zyӷ/*뛛ѫ\zGj[HK ٮ=]W?lֿ_^[hrГzqo.B3Dg׊Β>wH$MPVΞ$޸?66B]?kd ^Aee|K$Z)OK5I%Ο)۫iРٓ,Nezj='URݛo9*> v+kޑ#rj.8xHa*eAv'j;>~S7Dg.K$TTЮٳ9+E{PYUd1TWʂj0RoIJfWD\7e^z&+S*kޑ#r&:/9! A쨼|zw_m^Ggnf].:d6eg/S3pRdYOhh69ܳ'9$Qz; UsJvfF=RnqDwD[霺Rϔnq.Ъ;m3Dgj_i4>ތT;sT{W{n`һ1,;\|~YzС#G>o_P﮳FniOt4)|ٲK$o{X,{÷[6':m3Dg”H'O)+y5,,D [eeR2`W–v:nwx(]Iwl^ `<:f/vgrFni1:sbUC:!oU(2r-?qfi1:m3DgΩ3o{%i\Bu5bJԩ/H4td٣$l˖Mwn]"m*kޑ…o)#춭]'EH]Av4iR-Qz'ZGZz!kEgw,˗Ϝ93>u6`dsusk>o{zLʌGR[TTh"X1+ ( PZjwu06;;Fc9,{P;a^m溎y^v:j'? 
_b?l*+,\++m>}:}ן83NK,[^ѹogM =n/1QuggxꝲYllZNҒ|V" o7`n>&P*>&}˅[-N@)6U-^{Y\+ [Y^^r<`n 7G~xNt>:)K$XM Iիrl|JQ?lUS.uFt#c?c!m14lukVώ4Xhmے) _w˲d\Y8:؟N~ę D粥i_^ѹ4u%}~\uYu63UIZJ,+:W[ (LTb څS6c?cSڱz2@Bnm8pj)v:KY|\>.?qp^3ωε',[s蜚?wȐ~ꀄ R\e"0^^-LI?&l9tCRm)-M T%Ōh 'mZmzhJ|K3lզw׮yҤpً$y^oqtR2Ell}*7dY~joTxF?CtssSM?^љj, @"3 /)-/]dLt#_Gt,[6畴  : ;MtP[pxz9Dg+? ѹI=Ggz Dg>o{gVط/;fL2^j" zvh 2co Bv++IJy^m66qwv+fÃ:~3Dl15ӋOH2}Ȝ8q#A7 p7/ ,A?߳'YV޸_jS P>%VYgLmVYʙ-6x}IFMPA;pݥS_K >|RM!:@tvE?ˍ8sAuN6. szPh2ɻA64xΜWBYV'up:VXi]sذ|/Y:!**TU#i^wmGɩNݰ :Ꚍjjv2޼q@{]_"66B;,m9;{Y`O 0.[6At޺ug7]縸 S_k,\2⢵ VҳDcW7\m ׳Lϧ?ʹ' 9⯫ O?DgT<DW]X?ٓ}1}QS'ЍΤd3<=n55O*[ֹbW]^MײL7j8msݕ{~=St` I]ס~b *s]0Ĺ k;D-\{Qp[Tc^T3 :59_=a;VSGǘлw^z-]TYsHoooq~zLL`MLLT§~ >|XYr>‚ ~ً쫡?'e٬,Țn mdwײ^zh4zyyY55))IsРAw^jU``cǎjFJKx .s%@tn5Gf.;8_k5kK.I WTTtͦ&Y)yqm"""$JԆȨ(\9|p8ewȂTk& UɗGpdYbq\\U=Vm6vw-'NY.&LPYY)u.^X2#Zdڂ!C-:uee ѹ-neeg'Ibcuvߙ3g|J]^ZxyeY!RG+x{{':;worYYYeaѢEJ*Gg-ednMUi.:Y[mppr믿P$D6q[ti3C~:el2 qmz'vmmjjC9RewIK';yuD :E΃85Vdx|.۷uęN9w7oTu>w\{.Z,ƌ+#"gNNs=׶\kRs{^'۶!((w1Ljs ~f* ƾqt'vR:7ڱ}͕YaÆ-^XҎuc-Ȟ={VVVm..u ,߿G}$300pѢEwdtvFgNmCzzLNNJ^]yxz'A|1e#%PjgɑÇ)%&M-q011! h ݜ Y~`ll!ǎ]w,?u: ];YmېDj O|Ʈy믿6LYT]/222==s$1g)9eQfv~:@tnҭ1Kg_zIIJJ3 w8 ѹ#O;Duxz9@tvopg7o\S5KA'@tv ׻l\?Ue vqe7 mtpZb7?MײYKP!{!@t /e}+'bsA_+Dg@tnAфߝH\mG9_l熆)SC3$-]h4zyy  $V>#))) {РAw^jU`` RerHY<жQF-p႟_}}m*xd3Č3DEE_~]VVʁ'&&M:616;KM/ti%3^\hUwDDD4K8q6jl~DjGu„ 7o\xDO#F(I-**MMMRTj 3O:URmtotȥKqqqzi]$~@t"Τdwn\U!3g(˲ RMu_v=z;wN`#ڀiɩV/Gyk^z٭688l6YvSiztW^ݻ=jAg :e%=wGf.kӛ :'?^(cԁ"Vo߾}tfGzͰ?@t\;{js'|jZsh4jo(J~zwۓYdY~:ȗAAAdyfGgi:Zɻv;Sz3D&{iL5X:W"]xx\)js{2k>}rsseAktt|.+?xgϞmnܹsΟ?/˧NR\+taÆ-^XҳZTTYsԼ8|&HǺb 2w_Vgcԩa911Q"66V#ͬ999^^^IIIsFF2fîEIΰL3l2%CU'Ôw0Æ<|pPPrh}Ys,.ȗ˖ew^dddzz:W> :ZSGמx1ǾW4w.2ݍfY;?ѹuIݙmD'`0]8pkۨGן*C=a{i2ѹcD;5u[;) @tȔp'Tф :w0I4O$ ԁO? 
:oW˫2sWrnFMch&1¶=x}bK7lw%:ۖ}1\\]ԭUݝ[~l}#$4woByBJ׌ͽLo kjJNJveq_y vv{9+xטWϬ֭:ǯWYt|t[shw-y%lq{ʴ.W퍞z/6_M=>oٴՕ_)s7t.g&[Rw.ߜ|rɒ ;{jAΕ2>#֭CJחkŧ?I2}pZ⭦f׊3vmȜjd(wܨ[<%Lymmmcc#ݏ5%'2{@x76딓Ė rtnnez>Qg(Vʒ`k%%% Zt>M.XOJf)zMD?_IqqqYYܝ|9S{gW90,;kkQQܝ|vvsE.(=zZ =|rn9/33 Nt.j)M=5:[}Dkũ> yiÆ 뢳pm6ܰ>|#G>8:>,X нztVc>=wr*蜒QTTTQQ?l@tvTfDɓm8|^tWr6-rfO^]u mU "a7,,D>VnU3L+**,YV.bwK磳$oy;q9atN 99y͚5iiiyyyeeeDggŋ@d=6/CBfakllۖd[@_nouzGJt9,,D"is)a {U7[?--MURL~Fe>#G~/>6J WEDy=iDMtg$.]5uh&-w!|"/pBQ^2QYuiu @tDgLtDg":TVV!2n9 c?&:<:gNz2.EjZ+Ճ3|D g4ɥB20:_Y':O{YeniiCTF};[l˕y֟Η=HuVkcQт3v=]Pprtϼ΍uw̛'-gxu9]BvI;+V?0'sE<}}|,=7a/H?V{n7/77oWϜɘG4p]|N=ƻ0>Kt5:KQÐD1Sݎ?^#Hay=Mj%ƌO=^{<#MI~T__ǘMEݻ7KP/jO[+p&I6LH4.MI\ꁸFM+ffll&#?q8qw2I{I|i1WIVRUU3ѹjm Θâc9(,=mhv^3-G{ɵk;̛wkY3&W-J;99sFg""Gy ٤ۉDgݐ}sL"\kPޱX~*[>p#:$\9> 86lxD"}z<|N OF2W-9vq} dyuQF]QJpp܎ItN&9K6jS;Npkk+|%jN6Dg᫒E.^zEjdW-Gh"Ҭ%9b~芒FM]uEѾWv{ S {[/ݼ{I}Ld*@p-\waS>=:w'r\yS7u$5{夔^gUUoq G٘Mu7,=vlo79\K;$=J>G|0n:pḥ;PQG!,[c:G13HOt~{xWm˖}9'gNYmF[ ڍ6,n}B2kSsiiqvmKͯFM d%_V!=_ fIIQ{Nō N'"<0X~2s6v$:pḥ;pU~IcݷV>p8 =yi%kU̴;󭁟ehjjO%{Ū~F {k"lΩwʖDż&F/T,rt#:Eg$qb;]k2gս%MImYQÇl6z `O>=YZk͵(L_ܪ_(\t?|O6Yʨ<խHh,p86,{-y$Qi ~f͝v57vn@ 0226<>>9^ި_>֛%7I{zz~(y6,FFF/T6%Ϯܲo?R(*5|Ak[MАмJ\>7k[5 K my||<ٮ\ \#GS,YY,YnY*s%gOϒN<qƽzE:HQ^VeA51ClIr3ȔO===Nnwvv &sOfC2' zn[]l@L&N@2eN@fEg5=B!u\. 4o2d2eN@E}>ROf7uf0;>@aI0`04o2`6Eg L&)IENDB`pg_auto_failover-1.6.3/docs/how-to.rst000066400000000000000000000227661414244367200177710ustar00rootroot00000000000000.. _how-to: Main pg_autoctl commands ======================== pg_auto_failover includes the command line tool ``pg_autoctl`` that implements many commands to manage your Postgres nodes. To implement the Postgres architectures described in this documentation, and more, it is generally possible to use only some of the many ``pg_autoctl`` commands. This section of the documentation is a short introduction to the main commands that are useful when getting started with pg_auto_failover. 
More commands are available and help deal with a variety of situations, see
the :ref:`manual` for the whole list.

To understand which replication settings to use in your case, see the
:ref:`architecture_basics` section and then the
:ref:`multi_node_architecture` section.

To follow a step by step guide that you can reproduce on your own Azure
subscription and create a production Postgres setup from VMs, see the
:ref:`tutorial` section.

To understand how to set up pg_auto_failover in a way that is compliant
with your internal security guidelines, read the :ref:`security` section.

Command line environment, configuration files, etc
--------------------------------------------------

As a command line tool ``pg_autoctl`` depends on some environment
variables. Mostly, the tool re-uses the Postgres environment variables that
you might already know.

To manage a Postgres node pg_auto_failover needs to know its data directory
location on-disk. For that, some users will find it easier to export the
``PGDATA`` variable in their environment. The alternative consists of
always using the ``--pgdata`` option that is available to all the
``pg_autoctl`` commands.

Creating Postgres Nodes
-----------------------

To get started with the simplest Postgres failover setup, 3 nodes are
needed: the pg_auto_failover monitor, and 2 Postgres nodes that will get
assigned roles by the monitor. One Postgres node will be assigned the
primary role, the other one will get assigned the secondary role.

To create the monitor use the command::

  $ pg_autoctl create monitor

To create the Postgres nodes use the following command on each node you
want to create::

  $ pg_autoctl create postgres

While those *create* commands initialize your nodes, now you have to
actually run the Postgres services that are expected to be running.
For that you can manually run the following command on every node::

  $ pg_autoctl run

It is also possible (and recommended) to integrate the pg_auto_failover
service in your usual service management facility. When using **systemd**
the following commands can be used to produce the unit file configuration
required::

  $ pg_autoctl show systemd
  INFO  HINT: to complete a systemd integration, run the following commands:
  INFO  pg_autoctl -q show systemd --pgdata "/tmp/pgaf/m" | sudo tee /etc/systemd/system/pgautofailover.service
  INFO  sudo systemctl daemon-reload
  INFO  sudo systemctl enable pgautofailover
  INFO  sudo systemctl start pgautofailover
  [Unit]
  ...

While it is expected that for a production deployment each node actually is
a separate machine (virtual or physical, or even a container), it is also
possible to run several Postgres nodes all on the same machine for testing
or development purposes.

.. tip::

   When running several ``pg_autoctl`` nodes on the same machine for
   testing or contributing to pg_auto_failover, each Postgres instance
   needs to run on its own port, and with its own data directory. It can
   make things easier to then set the environment variables ``PGDATA`` and
   ``PGPORT`` in each terminal, shell, or tab where each instance is
   started.

Inspecting nodes
----------------

Once your Postgres nodes have been created, and once each ``pg_autoctl``
service is running, it is possible to inspect the current state of the
formation with the following command::

  $ pg_autoctl show state

The ``pg_autoctl show state`` command outputs the current state of the
system only once. Sometimes it would be nice to have an auto-updated
display, such as the one provided by common tools like `watch(1)` or
`top(1)` and the like.
For that, the following commands are available (see also
:ref:`pg_autoctl_watch`)::

  $ pg_autoctl watch
  $ pg_autoctl show state --watch

To analyze what's been happening to get to the current state, it is
possible to review the past events generated by the pg_auto_failover
monitor with the following command::

  $ pg_autoctl show events

.. hint::

   The ``pg_autoctl show`` commands can be run from any node in your
   system. Those commands need to connect to the monitor and print the
   current state or the current known list of events as per the monitor
   view of the system.

   Use ``pg_autoctl show state --local`` to have a view of the local state
   of a given node without connecting to the monitor Postgres instance.

The option ``--json`` is available in most ``pg_autoctl`` commands and
switches the output format from a human readable table form to a program
friendly JSON pretty-printed output.

Inspecting and Editing Replication Settings
-------------------------------------------

When creating a node it is possible to use the ``--candidate-priority`` and
the ``--replication-quorum`` options to set the replication properties as
required by your choice of Postgres architecture.

To review the current replication settings of a formation, use one of the
two following commands, which are convenient aliases (the same command with
two ways to invoke it)::

  $ pg_autoctl show settings
  $ pg_autoctl get formation settings

It is also possible to edit those replication settings at any time while
your nodes are in production: you can change your mind or adjust to new
elements without having to re-deploy everything. Just use the following
commands to adjust the replication settings on the fly::

  $ pg_autoctl set formation number-sync-standbys
  $ pg_autoctl set node replication-quorum
  $ pg_autoctl set node candidate-priority

.. important::

   The ``pg_autoctl get`` and ``pg_autoctl set`` commands always connect to
   the monitor Postgres instance.
   The ``pg_autoctl set`` command then changes the replication settings on
   the node registration on the monitor. Then the monitor assigns the
   APPLY_SETTINGS state to the current primary node in the system for it to
   apply the new replication settings to its Postgres streaming replication
   setup.

   As a result, the ``pg_autoctl set`` commands require a stable state in
   the system to be allowed to proceed. Namely, the current primary node in
   the system must have both its Current State and its Assigned State set
   to primary, as per the ``pg_autoctl show state`` output.

Implementing Maintenance Operations
-----------------------------------

When a Postgres node must be taken offline for a maintenance operation,
such as e.g. a kernel security upgrade or a minor Postgres update, it is
best to make it so that the pg_auto_failover monitor knows about it.

- For one thing, a node that is known to be in maintenance does not
  participate in failovers. If you are running with two Postgres nodes,
  then failover operations are entirely prevented while the standby node is
  in maintenance.

- Moreover, depending on your replication settings, enabling maintenance on
  your standby ensures that the primary node switches to async replication
  before Postgres is shut down on the secondary, which avoids blocking
  write queries.

To implement maintenance operations, use the following commands::

  $ pg_autoctl enable maintenance
  $ pg_autoctl disable maintenance

The main ``pg_autoctl run`` service that is expected to be running in the
background should continue to run during the whole maintenance operation.
When a node is in the maintenance state, the ``pg_autoctl`` service is not
controlling the Postgres service anymore.

Note that it is possible to enable maintenance on a primary Postgres node,
and that operation then requires a failover to happen first. It is possible
to have pg_auto_failover orchestrate that for you when using the command::

  $ pg_autoctl enable maintenance --allow-failover

.. important::

   The ``pg_autoctl enable`` and ``pg_autoctl disable`` commands require a
   stable state in the system to be allowed to proceed. Namely, the current
   primary node in the system must have both its Current State and its
   Assigned State set to primary, as per the ``pg_autoctl show state``
   output.

Manual failover, switchover, and promotions
-------------------------------------------

In the cases when a failover is needed without having an actual node
failure, the pg_auto_failover monitor can be used to orchestrate the
operation. Use one of the following commands, which are synonyms in the
pg_auto_failover design::

  $ pg_autoctl perform failover
  $ pg_autoctl perform switchover

Finally, it is also possible to “elect” a new primary node in your
formation with the command::

  $ pg_autoctl perform promotion

.. important::

   The ``pg_autoctl perform`` commands require a stable state in the system
   to be allowed to proceed. Namely, the current primary
toctree:: :maxdepth: 4 :caption: Contents: intro how-to tutorial architecture architecture-multi-standby failover-state-machine fault-tolerance install security ref/manual ref/configuration operations faq .. Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` pg_auto_failover-1.6.3/docs/install.rst000066400000000000000000000105651414244367200202140ustar00rootroot00000000000000.. _install: Installing pg_auto_failover =========================== We provide native system packages for pg_auto_failover on most popular Linux distributions. Use the steps below to install pg_auto_failover on PostgreSQL 11. At the current time pg_auto_failover is compatible with both PostgreSQL 10 and PostgreSQL 11. Ubuntu or Debian ---------------- Quick install ~~~~~~~~~~~~~ The following installation method downloads a bash script that automates several steps. The full script is available for review at our `package cloud installation instructions`__ page. __ https://packagecloud.io/citusdata/community/install#bash .. code-block:: bash # add the required packages to your system curl https://install.citusdata.com/community/deb.sh | sudo bash # install pg_auto_failover sudo apt-get install postgresql-11-auto-failover # confirm installation /usr/bin/pg_autoctl --version Manual Installation ~~~~~~~~~~~~~~~~~~~ If you'd prefer to install your repo on your system manually, follow the instructions from `package cloud manual installation`__ page. This page will guide you with the specific details to achieve the 3 steps: __ https://packagecloud.io/citusdata/community/install#manual 1. install CitusData GnuPG key for its package repository 2. install a new apt source for CitusData packages 3. update your available package list Then when that's done, you can proceed with installing pg_auto_failover itself as in the previous case: .. 
code-block:: bash # install pg_auto_failover sudo apt-get install postgresql-11-auto-failover # confirm installation /usr/bin/pg_autoctl --version Fedora, CentOS, or Red Hat -------------------------- Quick install ~~~~~~~~~~~~~ The following installation method downloads a bash script that automates several steps. The full script is available for review at our `package cloud installation instructions page`__ url. __ https://packagecloud.io/citusdata/community/install#bash .. code-block:: bash # add the required packages to your system curl https://install.citusdata.com/community/rpm.sh | sudo bash # install pg_auto_failover sudo yum install -y pg-auto-failover14_12 # confirm installation /usr/pgsql-12/bin/pg_autoctl --version Manual installation ~~~~~~~~~~~~~~~~~~~ If you'd prefer to install your repo on your system manually, follow the instructions from `package cloud manual installation`__ page. This page will guide you with the specific details to achieve the 3 steps: 1. install the pygpgme yum-utils packages for your distribution 2. install a new RPM reposiroty for CitusData packages 3. update your local yum cache Then when that's done, you can proceed with installing pg_auto_failover itself as in the previous case: .. code-block:: bash # install pg_auto_failover sudo yum install -y pg-auto-failover14_12 # confirm installation /usr/pgsql-12/bin/pg_autoctl --version __ https://packagecloud.io/citusdata/community/install#manual-rpm Installing a pgautofailover Systemd unit ---------------------------------------- The command ``pg_autoctl show systemd`` outputs a systemd unit file that you can use to setup a boot-time registered service for pg_auto_failover on your machine. Here's a sample output from the command: .. 
code-block:: bash $ export PGDATA=/var/lib/postgresql/monitor $ pg_autoctl show systemd 13:44:34 INFO HINT: to complete a systemd integration, run the following commands: 13:44:34 INFO pg_autoctl -q show systemd --pgdata "/var/lib/postgresql/monitor" | sudo tee /etc/systemd/system/pgautofailover.service 13:44:34 INFO sudo systemctl daemon-reload 13:44:34 INFO sudo systemctl start pgautofailover [Unit] Description = pg_auto_failover [Service] WorkingDirectory = /var/lib/postgresql Environment = 'PGDATA=/var/lib/postgresql/monitor' User = postgres ExecStart = /usr/lib/postgresql/10/bin/pg_autoctl run Restart = always StartLimitBurst = 0 [Install] WantedBy = multi-user.target Copy/pasting the commands given in the hint output from the command will enable the pgautofailer service on your system, when using systemd. It is important that PostgreSQL is started by ``pg_autoctl`` rather than by systemd itself, as it might be that a failover has been done during a reboot, for instance, and that once the reboot complete we want the local Postgres to re-join as a secondary node where it used to be a primary node. pg_auto_failover-1.6.3/docs/intro.rst000066400000000000000000000076421414244367200177030ustar00rootroot00000000000000Introduction to pg_auto_failover ================================ pg_auto_failover is an extension for PostgreSQL that monitors and manages failover for a postgres clusters. It is optimised for simplicity and correctness. Single Standby Architecture --------------------------- .. figure:: ./tikz/arch-single-standby.svg :alt: pg_auto_failover Architecture with a primary and a standby node pg_auto_failover architecture with a primary and a standby node pg_auto_failover implements Business Continuity for your PostgreSQL services. 
pg_auto_failover implements a single PostgreSQL service using multiple nodes with automated failover, and automates PostgreSQL maintenance operations in a way that guarantees availability of the service to its users and applications. To that end, pg_auto_failover uses three nodes (machines, servers) per PostgreSQL service: - a PostgreSQL primary node, - a PostgreSQL secondary node, using Synchronous Hot Standby, - a pg_auto_failover Monitor node that acts both as a witness and an orchestrator. The pg_auto_failover Monitor implements a state machine and relies on in-core PostgreSQL facilities to deliver HA. For example. when the *secondary* node is detected to be unavailable, or when its lag is reported above a defined threshold (the default is 1 WAL files, or 16MB, see the `pgautofailover.promote_wal_log_threshold` GUC on the pg_auto_failover monitor), then the Monitor removes it from the `synchronous_standby_names` setting on the *primary* node. Until the *secondary* is back to being monitored healthy, failover and switchover operations are not allowed, preventing data loss. Multiple Standby Architecture ----------------------------- .. figure:: ./tikz/arch-multi-standby.svg :alt: pg_auto_failover Architecture for a standalone PostgreSQL service pg_auto_failover architecture with a primary and two standby nodes In the pictured architecture, pg_auto_failover implements Business Continuity and data availability by implementing a single PostgreSQL service using multiple with automated failover and data redundancy. Even after losing any Postgres node in a production system, this architecture maintains two copies of the data on two different nodes. When using more than one standby, different architectures can be achieved with pg_auto_failover, depending on the objectives and trade-offs needed for your production setup. Multiple Standbys Architecture with 3 standby nodes, one async -------------------------------------------------------------- .. 
figure:: ./tikz/arch-three-standby-one-async.svg :alt: pg_auto_failover architecture with a primary and three standby nodes pg_auto_failover architecture with a primary and three standby nodes When setting the three parameters above, it's possible to design very different Postgres architectures for your production needs. In this case, the system is setup with two standby nodes participating in the replication quorum, allowing for ``number_sync_standbys = 1``. The system always maintains a minimum of two copies of the data set: one on the primary, another one on one on either node B or node D. Whenever we lose one of those nodes, we can hold to this guarantee of two copies of the data set. Adding to that, we have the standby server C which has been set up to not participate in the replication quorum. Node C will not be found in the ``synchronous_standby_names`` list of nodes. Also, node C is set up in a way to never be a candidate for failover, with ``candidate-priority = 0``. This architecture would fit a situation where nodes A, B, and D are deployed in the same data center or availability zone, and node C in another. Those three nodes are set up to support the main production traffic and implement high availability of both the Postgres service and the data set. Node C might be set up for Business Continuity in case the first data center is lost, or maybe for reporting the need for deployment on another application domain. pg_auto_failover-1.6.3/docs/operations.rst000066400000000000000000000466651414244367200207430ustar00rootroot00000000000000Operating pg_auto_failover ========================== This section is not yet complete. Please contact us with any questions. Deployment ---------- pg_auto_failover is a general purpose tool for setting up PostgreSQL replication in order to implement High Availability of the PostgreSQL service. Provisioning ------------ It is also possible to register pre-existing PostgreSQL instances with a pg_auto_failover monitor. 
The ``pg_autoctl create`` command honors the ``PGDATA`` environment variable, and checks whether PostgreSQL is already running. If Postgres is detected, the new node is registered in SINGLE mode, bypassing the monitor's role assignment policy. Upgrading pg_auto_failover, from versions 1.4 onward ----------------------------------------------------- When upgrading a pg_auto_failover setup, the procedure is different on the monitor and on the Postgres nodes: - on the monitor, the internal pg_auto_failover database schema might have changed and needs to be upgraded to its new definition, porting the existing data over. The pg_auto_failover database contains the registration of every node in the system and their current state. It is not possible to trigger a failover during the monitor update. Postgres operations on the Postgres nodes continue normally. During the restart of the monitor, the other nodes might have trouble connecting to the monitor. The ``pg_autoctl`` command is designed to retry connecting to the monitor and handle errors gracefully. - on the Postgres nodes, the ``pg_autoctl`` command connects to the monitor every once in a while (every second by default), and then calls the ``node_active`` protocol, a stored procedure in the monitor databases. The ``pg_autoctl`` also verifies at each connection to the monitor that it's running the expected version of the extension. When that's not the case, the "node-active" sub-process quits, to be restarted with the possibly new version of the ``pg_autoctl`` binary found on-disk. As a result, here is the standard upgrade plan for pg_auto_failover: 1. Upgrade the pg_auto_failover package on the all the nodes, monitor included. When using a debian based OS, this looks like the following command when from 1.4 to 1.5:: sudo apt-get remove pg-auto-failover-cli-1.4 postgresql-11-auto-failover-1.4 sudo apt-get install -q -y pg-auto-failover-cli-1.5 postgresql-11-auto-failover-1.5 2. 
Restart the ``pgautofailover`` service on the monitor. When using the systemd integration, all we need to do is:: sudo systemctl restart pgautofailover Then we may use the following commands to make sure that the service is running as expected:: sudo systemctl status pgautofailover sudo journalctl -u pgautofailover At this point it is expected that the ``pg_autoctl`` logs show that an upgrade has been performed by using the ``ALTER EXTENSION pgautofailover UPDATE TO ...`` command. The monitor is ready with the new version of pg_auto_failover. When the Postgres nodes ``pg_autoctl`` process connects to the new monitor version, the check for version compatibility fails, and the "node-active" sub-process exits. The main ``pg_autoctl`` process supervisor then restart the "node-active" sub-process from its on-disk binary executable file, which has been upgraded to the new version. That's why we first install the new packages for pg_auto_failover on every node, and only then restart the monitor. .. important:: Before upgrading the monitor, which is a simple restart of the ``pg_autoctl`` process, it is important that the OS packages for pgautofailover be updated on all the Postgres nodes. When that's not the case, ``pg_autoctl`` on the Postgres nodes will still detect a version mismatch with the monitor extension, and the "node-active" sub-process will exit. And when restarted automatically, the same version of the local ``pg_autoctl`` binary executable is found on-disk, leading to the same version mismatch with the monitor extension. After restarting the "node-active" process 5 times, ``pg_autoctl`` quits retrying and stops. This includes stopping the Postgres service too, and a service downtime might then occur. And when the upgrade is done we can use ``pg_autoctl show state`` on the monitor to see that eveything is as expected. 
Upgrading from previous pg_auto_failover versions ------------------------------------------------- The new upgrade procedure described in the previous section is part of pg_auto_failover since version 1.4. When upgrading from a previous version of pg_auto_failover, up to and including version 1.3, then all the ``pg_autoctl`` processes have to be restarted fully. To prevent triggering a failover during the upgrade, it's best to put your secondary nodes in maintenance. The procedure then looks like the following: 1. Enable maintenance on your secondary node(s):: pg_autoctl enable maintenance 2. Upgrade the OS packages for pg_auto_failover on every node, as per previous section. 3. Restart the monitor to upgrade it to the new pg_auto_failover version: When using the systemd integration, all we need to do is:: sudo systemctl restart pgautofailover Then we may use the following commands to make sure that the service is running as expected:: sudo systemctl status pgautofailover sudo journalctl -u pgautofailover At this point it is expected that the ``pg_autoctl`` logs show that an upgrade has been performed by using the ``ALTER EXTENSION pgautofailover UPDATE TO ...`` command. The monitor is ready with the new version of pg_auto_failover. 4. Restart ``pg_autoctl`` on all Postgres nodes on the cluster. When using the systemd integration, all we need to do is:: sudo systemctl restart pgautofailover As in the previous point in this list, make sure the service is now running as expected. 5. Disable maintenance on your secondary nodes(s):: pg_autoctl disable maintenance Extension dependencies when upgrading the monitor ------------------------------------------------- Since version 1.4.0 the ``pgautofailover`` extension requires the Postgres contrib extension ``btree_gist``. The ``pg_autoctl`` command arranges for the creation of this dependency, and has been buggy in some releases. As a result, you might have trouble upgrade the pg_auto_failover monitor to a recent version. 
It is possible to fix the error by connecting to the monitor Postgres database and running the ``create extension`` command manually:: # create extension btree_gist; Cluster Management and Operations --------------------------------- It is possible to operate pg_auto_failover formations and groups directly from the monitor. All that is needed is an access to the monitor Postgres database as a client, such as ``psql``. It's also possible to add those management SQL function calls in your own ops application if you have one. For security reasons, the ``autoctl_node`` is not allowed to perform maintenance operations. This user is limited to what ``pg_autoctl`` needs. You can either create a specific user and authentication rule to expose for management, or edit the default HBA rules for the ``autoctl`` user. In the following examples we're directly connecting as the ``autoctl`` role. The main operations with pg_auto_failover are node maintenance and manual failover, also known as a controlled switchover. Maintenance of a secondary node ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ It is possible to put a secondary node in any group in a MAINTENANCE state, so that the Postgres server is not doing *synchronous replication* anymore and can be taken down for maintenance purposes, such as security kernel upgrades or the like. The command line tool ``pg_autoctl`` exposes an API to schedule maintenance operations on the current node, which must be a secondary node at the moment when maintenance is requested. Here's an example of using the maintenance commands on a secondary node, including the output. 
Of course, when you try that on your own nodes, dates and PID information might differ:: $ pg_autoctl enable maintenance 17:49:19 14377 INFO Listening monitor notifications about state changes in formation "default" and group 0 17:49:19 14377 INFO Following table displays times when notifications are received Time | ID | Host | Port | Current State | Assigned State ---------+-----+-----------+--------+---------------------+-------------------- 17:49:19 | 1 | localhost | 5001 | primary | wait_primary 17:49:19 | 2 | localhost | 5002 | secondary | wait_maintenance 17:49:19 | 2 | localhost | 5002 | wait_maintenance | wait_maintenance 17:49:20 | 1 | localhost | 5001 | wait_primary | wait_primary 17:49:20 | 2 | localhost | 5002 | wait_maintenance | maintenance 17:49:20 | 2 | localhost | 5002 | maintenance | maintenance The command listens to the state changes in the current node's formation and group on the monitor and displays those changes as it receives them. The operation is done when the node has reached the ``maintenance`` state. 
It is now possible to disable maintenance to allow ``pg_autoctl`` to manage this standby node again:: $ pg_autoctl disable maintenance 17:49:26 14437 INFO Listening monitor notifications about state changes in formation "default" and group 0 17:49:26 14437 INFO Following table displays times when notifications are received Time | ID | Host | Port | Current State | Assigned State ---------+-----+-----------+--------+---------------------+-------------------- 17:49:27 | 2 | localhost | 5002 | maintenance | catchingup 17:49:27 | 2 | localhost | 5002 | catchingup | catchingup 17:49:28 | 2 | localhost | 5002 | catchingup | secondary 17:49:28 | 1 | localhost | 5001 | wait_primary | primary 17:49:28 | 2 | localhost | 5002 | secondary | secondary 17:49:29 | 1 | localhost | 5001 | primary | primary When a standby node is in maintenance, the monitor sets the primary node replication to WAIT_PRIMARY: in this role, the PostgreSQL streaming replication is now asynchronous and the standby PostgreSQL server may be stopped, rebooted, etc. Maintenance of a primary node ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ A primary node must be available at all times in any formation and group in pg_auto_failover, that is the invariant provided by the whole solution. With that in mind, the only way to allow a primary node to go to a maintenance mode is to first failover and promote the secondary node. The same command ``pg_autoctl enable maintenance`` implements that operation when run on a primary node with the option ``--allow-failover``. Here is an example of such an operation:: $ pg_autoctl enable maintenance 11:53:03 50526 WARN Enabling maintenance on a primary causes a failover 11:53:03 50526 FATAL Please use --allow-failover to allow the command proceed As we can see the option ``allow-maintenance`` is mandatory. 
In the next example we use it:: $ pg_autoctl enable maintenance --allow-failover 13:13:42 1614 INFO Listening monitor notifications about state changes in formation "default" and group 0 13:13:42 1614 INFO Following table displays times when notifications are received Time | ID | Host | Port | Current State | Assigned State ---------+-----+-----------+--------+---------------------+-------------------- 13:13:43 | 2 | localhost | 5002 | primary | prepare_maintenance 13:13:43 | 1 | localhost | 5001 | secondary | prepare_promotion 13:13:43 | 1 | localhost | 5001 | prepare_promotion | prepare_promotion 13:13:43 | 2 | localhost | 5002 | prepare_maintenance | prepare_maintenance 13:13:44 | 1 | localhost | 5001 | prepare_promotion | stop_replication 13:13:45 | 1 | localhost | 5001 | stop_replication | stop_replication 13:13:46 | 1 | localhost | 5001 | stop_replication | wait_primary 13:13:46 | 2 | localhost | 5002 | prepare_maintenance | maintenance 13:13:46 | 1 | localhost | 5001 | wait_primary | wait_primary 13:13:47 | 2 | localhost | 5002 | maintenance | maintenance When the operation is done we can have the old primary re-join the group, this time as a secondary:: $ pg_autoctl disable maintenance 13:14:46 1985 INFO Listening monitor notifications about state changes in formation "default" and group 0 13:14:46 1985 INFO Following table displays times when notifications are received Time | ID | Host | Port | Current State | Assigned State ---------+-----+-----------+--------+---------------------+-------------------- 13:14:47 | 2 | localhost | 5002 | maintenance | catchingup 13:14:47 | 2 | localhost | 5002 | catchingup | catchingup 13:14:52 | 2 | localhost | 5002 | catchingup | secondary 13:14:52 | 1 | localhost | 5001 | wait_primary | primary 13:14:52 | 2 | localhost | 5002 | secondary | secondary 13:14:53 | 1 | localhost | 5001 | primary | primary Triggering a failover ^^^^^^^^^^^^^^^^^^^^^ It is possible to trigger a manual failover, or a switchover, using the 
command ``pg_autoctl perform failover``. Here's an example of what happens when running the command:: $ pg_autoctl perform failover 11:58:00 53224 INFO Listening monitor notifications about state changes in formation "default" and group 0 11:58:00 53224 INFO Following table displays times when notifications are received Time | ID | Host | Port | Current State | Assigned State ---------+-----+-----------+--------+--------------------+------------------- 11:58:01 | 1 | localhost | 5001 | primary | draining 11:58:01 | 2 | localhost | 5002 | secondary | prepare_promotion 11:58:01 | 1 | localhost | 5001 | draining | draining 11:58:01 | 2 | localhost | 5002 | prepare_promotion | prepare_promotion 11:58:02 | 2 | localhost | 5002 | prepare_promotion | stop_replication 11:58:02 | 1 | localhost | 5001 | draining | demote_timeout 11:58:03 | 1 | localhost | 5001 | demote_timeout | demote_timeout 11:58:04 | 2 | localhost | 5002 | stop_replication | stop_replication 11:58:05 | 2 | localhost | 5002 | stop_replication | wait_primary 11:58:05 | 1 | localhost | 5001 | demote_timeout | demoted 11:58:05 | 2 | localhost | 5002 | wait_primary | wait_primary 11:58:05 | 1 | localhost | 5001 | demoted | demoted 11:58:06 | 1 | localhost | 5001 | demoted | catchingup 11:58:06 | 1 | localhost | 5001 | catchingup | catchingup 11:58:08 | 1 | localhost | 5001 | catchingup | secondary 11:58:08 | 2 | localhost | 5002 | wait_primary | primary 11:58:08 | 1 | localhost | 5001 | secondary | secondary 11:58:08 | 2 | localhost | 5002 | primary | primary Again, timings and PID numbers are not expected to be the same when you run the command on your own setup. Also note in the output that the command shows the whole set of transitions including when the old primary is now a secondary node. The database is available for read-write traffic as soon as we reach the state ``wait_primary``. 
Implementing a controlled switchover ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ It is generally useful to distinguish a *controlled switchover* to a *failover*. In a controlled switchover situation it is possible to organise the sequence of events in a way to avoid data loss and lower downtime to a minimum. In the case of pg_auto_failover, because we use **synchronous replication**, we don't face data loss risks when triggering a manual failover. Moreover, our monitor knows the current primary health at the time when the failover is triggered, and drives the failover accordingly. So to trigger a controlled switchover with pg_auto_failover you can use the same API as for a manual failover:: $ pg_autoctl perform switchover Because the subtelties of orchestrating either a controlled switchover or an unplanned failover are all handled by the monitor, rather than the client side command line, at the client level the two command ``pg_autoctl perform failover`` and ``pg_autoctl perform switchover`` are synonyms, or aliases. Current state, last events -------------------------- The following commands display information from the pg_auto_failover monitor tables ``pgautofailover.node`` and ``pgautofailover.event``: :: $ pg_autoctl show state $ pg_autoctl show events When run on the monitor, the commands outputs all the known states and events for the whole set of formations handled by the monitor. When run on a PostgreSQL node, the command connects to the monitor and outputs the information relevant to the service group of the local node only. For interactive debugging it is helpful to run the following command from the monitor node while e.g. initializing a formation from scratch, or performing a manual failover:: $ watch pg_autoctl show state Monitoring pg_auto_failover in Production ----------------------------------------- The monitor reports every state change decision to a LISTEN/NOTIFY channel named ``state``. 
PostgreSQL logs on the monitor are also stored in a table, ``pgautofailover.event``, and broadcast by NOTIFY in the channel ``log``. .. _replacing_monitor_online: Replacing the monitor online ---------------------------- When the monitor node is not available anymore, it is possible to create a new monitor node and then switch existing nodes to a new monitor by using the following commands. 1. Apply the STONITH approach on the old monitor to make sure this node is not going to show up again during the procedure. This step is sometimes refered to as “fencing”. 2. On every node, ending with the (current) Postgres primary node for each group, disable the monitor while ``pg_autoctl`` is still running:: $ pg_autoctl disable monitor --force 3. Create a new monitor node:: $ pg_autoctl create monitor ... 4. On the current primary node first, so that it's registered first and as a primary still, for each group in your formation(s), enable the monitor online again:: $ pg_autoctl enable monitor postgresql://autoctl_node@.../pg_auto_failover 5. On every other (secondary) node, enable the monitor online again:: $ pg_autoctl enable monitor postgresql://autoctl_node@.../pg_auto_failover See :ref:`pg_autoctl_disable_monitor` and :ref:`pg_autoctl_enable_monitor` for details about those commands. This operation relies on the fact that a ``pg_autoctl`` can be operated without a monitor, and when reconnecting to a new monitor, this process reset the parts of the node state that comes from the monitor, such as the node identifier. Trouble-Shooting Guide ---------------------- pg_auto_failover commands can be run repeatedly. If initialization fails the first time -- for instance because a firewall rule hasn't yet activated -- it's possible to try ``pg_autoctl create`` again. pg_auto_failover will review its previous progress and repeat idempotent operations (``create database``, ``create extension`` etc), gracefully handling errors. 
pg_auto_failover-1.6.3/docs/pg_auto_failover-arch.png000066400000000000000000004654731414244367200227760ustar00rootroot00000000000000PNG  IHDR}l IDATxy\u?w|qrsWr'1`;1/qx;`M$6c0,If3/3ffׯ꾺KG d̙][UU՛R\.&D!J)fC6^`7~3y9ǯ0YAd=Bd6MdJɃ73'i>C2TIM VHv12D2iPx4+ڗPa˲i;. 䔹q8H׋)g@S#`kT}751 H$VJF0m-hW[~0S_4&dLi"(M07@9W3 ;M! kP`Tx4<`T-dxra}w_1 `a,Ʌ@$& A xz!wR+ђǻIq⡑*`G>G*N[XAFVUx8&T(H;U8ZG䡹ULm"W͎xPiÛiwvrwW I? & xH \Z.HBbUJ`s| Id-"0=*7J|k8GB 98\raB, Iq+5yd=k`'Sb,,(Q3G''uM& qC"BU,,rWq729#WQ3eIқ:LS*&2IY%BLDڈƩ`=N9ͲZ9H6H*&HrV "XjP*ĕ&GVAB%@qzĹDY^U8Pe^`z/4NlW&h+^!eH7: HTBY2d$78ezڗ'HyXFZ~-*&/ W e*报+r2itSWWd ӫ 9‡FV̔ kC?T!C9%XwI4~ңi*n Tɩxp}2G٧ytS =BNЦ S5/x_%F2N2?Hiy'5)x: JdܥTU!Jx0MiAC*RZ@d>l^%iG1AÆ,*(yjQERAC U([e˚I \L6蔵y%MläfQC*jDH+3M$C@b^eS EB2*4A`*$rx }aҵTeJ0, _͇TD*wN\i]HuRZWo|4覆FRxДopZյlPq oRw$-4 N2rrD5URUk*wߨOFM%&5` Uio#0#U3"^y &X>5J4zTdzY`ZTlXiۛx8F>  !80Zݿٽzǀ>P}jd^|_ %Tr!knoS^{j`=o7([TٵF⧉oV6{H;eb@ NCJ4UɍY{Q-&q M/v$/**W]i\%K+Jͮh.qFCXxFx^~h ?$rSVjMmE4$vm{XQ&zjTy9Ӕz%፟E!eל\!ӔQ &*JdkCTPs)m#459x47^8Cs9N52UTUCr kLmEo;P.? '\R ¿|1W?_/RXϯ1j"Ś_K%M(D_,RPSbDrVT)lA#RJZ[Rݴ 0%+u1X$fK! pm5_tPŵP7Ԋp$_w-%]c7X<s&82 D_ݔVDf zz˧<^7!T1M%՚GG隬 DS+#_VƼƮmAx2fwecة#_%pb%$ A4d[ Ix%,@24 9X?fcI$RvVLS@^&p,aphPDcS3Mw3B@ V0'նk.AV$"yBD bbR@y]q !8@P dCr H19RB*9 (gPFM M6lEVnML“=//X'VE ͑_9)+A35pI52R(a*0!<< cLUG!SI#?^es(g [w7㯤W pT9l$(Di7s>eX9.ap4{jj8)|vX A.Ԭ`#O=4c3 ;مROHb H4Hb[i"2iX*gDwSˀ:}J`&l @.Zœ  bBd x2,׊+Ni۩Gσ(_U$ˉ33Ϙ Qʹ$9sǷ$E;v%D V$D{ɜ/럓kBYSTdErnźCuwĸ,"H+aunjc  W7"-H̡]D:u0+Ds'U|m ^j_Ք'۝OIc+rG5 B \{BRyPZ0;n:toҷeƹKj(OƉTIp8eȶ-)aX'V@zA;kfAh!x88jy@b$̏/.TRsSI2iN] ~dD Tݮ)4XS_nƣIA.v_IRB3"p͕hLx\8#sʝzW@/r?&)QsdO9; r5N90 {Ǒ1`wn xsmT{{ڬ1NsXT?;_w\1^)iFXi "<+#,"tf8 vB;j#bgeH*麗V\W  gގ/;r[+=7='k{̝G7}pn^[{_X~oYs,êϦ%@ c1其5B9 ‚u0H*dwI9,qYqJj¼asLjRi9)G)Ga[`IU>TyA(+0OB".a^XLU/xsA@#:e>tț/xtW^Hɀ 2)7 {ąX")֣c@s ۻn D̽p:!bO*1Vs ';nA60^";јf $ޫ2.".-H4D /}綵+]jٴ95洟vҗW\s uV/u™@818S }qϪ{6DZێ 2p{ '%@\J=r%UC dnFmlHl) _1OSI{o]ٖēP̍W]i8LL@KrP8dB YcbRQObZP+N:3DQO\ń2ďwd:bאeCE|NYP,,/=' 6n}EQ@k.^kDHbwR$D3xN` )HsD,6 @B8r0"mo{! 
?Ž_wv& XT(eXkW۟0-rǼ;(Iz[IUE^V(id *") }/ eA)Cy-qA/&{OoD9kB!kM%E`N,eXq`뜫\Swʐ8x^IO1f UVEPBr<ŐW4%u7˃WF~a9UlE^jT͕c@_,T,YR4"fEk0׊H!$k[b3@/ɊEZ_`3oq"8^.M,O'6˜ ipᐙŷ?R״9C #R(}k9Ἆ\(I-O|AKmj]sNZ{3qa%@4= GhF'[`#?vJRN <`8|L%sꅃMڒee\E(y49`/D#B:(|!INÃ:c+SO Hz2 i@z.#ęiY0{ڼ%l_|~q*A N) h19dHgIqJfCpn'pu>&pxaEh hx pC+/8U)h*b $-XlHPsD@\CΌo[g7>}k>xYǏ_Z[]Շֵ:+ k\ˑY6=I7f7 0M ֩\MG$3^ogL@Ɩf0kM IDAT~gdL#46iLbv1fkSKZVXy,!BA/mU3F@@ qzb4yUJ.Δv+"ciC>BHl^`irPUδG'+ ^/s E;$i)$u;Ftڅ:`l]%FĹnGy CP"9@()bП-10Oxn"gvۖ /|}ߥq^ۏ0>[?jOx2 ܮ? NLo el7r }CYve_pE4)Kbqz+4kLcIǽU*(Y\f!2 hMuȂ_!XO%\'MrWDžc=m{ H[P($^eI f.SY^dJ^0rM$} WYa\'mX Zfsӽuoo67sɓRRܭO(!hcؑ]_wl42P2P#}yax?7#cVB.|upLЫ{_^yUn{f?\4ϔ=xφ|yzVm?e ;~ oē""0H@!}_ɍÝZe#:0A1sO:42SG,,+WƶH%\z?+}| Hd=r1}B^TpD!+4F'jB x0dV&U2x$r &i2BHȪ ߱9=<PSQJn+mlߞ|ԇ"٤]E VP"J@ZC+FC_5I0vF4N$i`h@xX)dؓFY{¿mH18inmstXs­ՒP!_[s_Jz>qp-]7Ӟ7 ]m jl9 6Bidmd{$-*5, t˼+#U25LXox-ѫm^`X,R^s.|u 9A<0U5o9Ne$ẢY cUAk*ZI\ŎG$ jIWŎы5WÆ`BpOrE; p;oVz"{;Xi#`k-(Bl.!@)$DpuSLiI}J`"_ު13_V|җSv.^o7Gf \ض_w7LWm+}uOwt~PoYC:#q]Wo8un[ˍ2:d<qp#bmdR7 /o񈔦-V:Bc.KO9iw.-k~g޽~py}toD.q( Nhd{2$*y Z:R/ `O~&1cNa5I.bIq> 8#8X.@"-rlA'!A~qHC$orlk {e8ϱ# d::98fI~ӤuD.$A!Ƭsb0p5RD!U 0,)ȕ8dzx}I" }.<{` cO 6P$C\| <y3%^hC-"-HcU#b0ҊiL bVUC¤YE<癝;~N ~v筈T>yŲ%N.ڏWeKڕ9u;xuBaF| rdS5Ӝ6ߐ[5 CvJ` FpʅK~΋TLV%I z5KBW^KFS҄YޱZA(4͑:eV+L* R\ \99B=aU*R?ɠSjACSXO8F~I*(q5H\1RPFC&N{Y (w 9,aP GiT 9z~Վq{N#XD+ x)flO\CpD9H' TL`"ul-Iw !C)$DMm+}˶}ʥ\!U^[u賵Ѵ껷{ni#,Ygu;κn%c0hk)@@).zņ#4^|`yܫMSI#Y" 12TGR`,u_`+TUglK dOp㐴aװu̒ 1͑|ZNJTnjL,̽~;ܜ '@i|t˚"+RH\%IsD l!ƟKvbpE>qҎ߹mu:}ڨ9wfOf)_I]Cr[pk{[{$6lZmo\|0v,[px sF8V,re2@* g* "T[PUaԩ.WYLぴE*f cL"T@^.^M22%7*,7peQi"f%ׄMicrG22r~imeNB`~{AΑ8`9MЙ3[#"^El>^HHLQ\0wHc( +>ްo5 rWdyե8_7͊Lۻ[gWwq7|yZ8]Yoݮ/ݰ*'Ħ:SO2_7Us1|9$lJ?hfœ88r\ H>(6B? D򃱥mNO8EeN4Z+fqsj9r9)+!r. sNb̤- ,G^f+[@±T.9>r_exɷǀTMOūkV$7 s@wxف;)? 
LJY Q)Y{NXbDJHTG.ymu]/|^۷q/~ԵnݿOwʞcQ :\{.'?޽~:?y\rr/3[mUAmEl  %'So_];}Lɒ(p(>hp+|$`gZCzŃFgI$cƵDYkAp a01ve$AAOVH|J̐| >|r8$2$t W$B4(o5 u'+!+TE@VBĬ5m=8 `Jb]8)`3@"qE8|ߍm k/?i"κ=ȿ Y1[(Z"_Ր|_^bSLJ|u ؿj[3zF^_s_lO/ |餋:XspJȐy~2=~l\ǂd$mDH6:>S)Ƀ W\-3g S%9qHڼxH^<3-|l{W^I~8b+]s,Hj# َ>LIEfC#Qz) W9͸FKVSfϑ, QdzPW҄@_&s™WI:9(9"fZ|ê'_ sX;jzepo#kV>p fimVV͂b7[e_|P'9HBczeq|,&1W2xId#NPĀzH2*N8ѕ꿠'-!d8(NcB&M,Fv >x%f.%sAU*WT :\Tˀ"Ƶ*5ڄv5qּ,ڕ'Zӎȓ"}rbcVV ^H^"?Βآ @l6M؁0e(ߞo'D2 KSޓ6#b~"fDyDH'#B"-9{kؘkqEx){RW\Q ?6;s_WnzJ-Kzu>Xe}$j:̴{ 9 SGdI(LN>q+う2bLM{8k6k]!/d>gQ( Q9ϵ RaI*!1`Ipɴ5^ #>H4p_5aM-xηlVcQA`~Ʈ$<_+,_Z5{bK1<.AįnYk/̃xdW Y*"p U][R$;߱ng~ܒY(V폗n/ E Ya)Uz _R}ݑFg]ƀ _H$&jdZ*~d ^d'iV%@8$c璣)}KS)I ) bFk\A"{rFxZ~8^J𙳅ۦ>bCJ%_dI=SȲYT(k洨8`YYZ,TC& d:TsUEit kƅ9`"dB+T"%94$gn0ŋm}k(ءg,b)؛%P,H|Eb0I+|`ErgIX*I+-%L9,b $EV-.ܻq/\"VZΥοokb+R}p?vJ^ք3grڬo\%;e߂R3yU,0Z2_i4WIUqpvK^H*W̋@bWxƎǀcܜ=Wg"׫\`S}Vƫ2*W e B3rhbTΗRΩі*9AL[XlT%?I,n0u;lcAQir:q34f E`wR3RIcT{9K.ܮ/*DchB$o݁2toARy (D <LYamrf}aEHjR֔g~E~q_c="B\\e䛀ˉ&0xb cQ}a˸FENu<_!'ύ9*iNN06(vCCzO_J$SJNemA'2r` _ S69` b @`?jn03 )P$3"pُcF$ķqF;`wًRQԓ #1 쫑Ն# ɻYYKZ>EZ(ͫ /8d_veaP`G|܊蕛SlPBoLۛɮɚ.JS2'u% ~=Ne-+a)\'Q' Q-mH[0"HЦh?b5fB/%R,[E8.3+T_rqWp7ҫ Ű[/aGr+5; XP"'[rƢCQfN֩7iLӔ94KSyyobNIk7g~R?<89J1[p+TJ[/l!3z<.O9Mrnv=<R @3et#\<5'mɱl8&18vn۵`nCd@&./%H O!.i@bD1?D? y2Pǯ]dOiRVm՟i7ƒC?cG22c*=^#˕xO/r0q.y`dmiȲdaADD@8DN R#NM ^UVD*qA0[x;´41$4If]ݫ&t~C8#R,'Cu+ ٗZxP;!RtDaB|,b'Ʌ1g7%() U\nA5FcE ' vT.t"iF IPj:lW0 2;ez)8ᘩqd|{?"<(mpHnA8d=^e_LyL% 1#CZbkT/%J3Y@/Ɖ\^*iIwdׄK)TALtfRvooDx$eѓJ&i#rP6|,~0\UbƲvJry+XdbP$JģL8bvE[H[}XA)eS3B2Y6"`QGnH $ ğH"yF$68&-!RDe,6+T%k> iwR딼:N\5P>vx l|*Aw"tYiה-׌zR9%Ϫ9p ! wJۖg<:l!17pE;R7͐؉oۛԎ`I2v?/!܉(Hw̘.10`eNFb[EhŀL'oDCS6y\ 7 M'5`qg 8lSNy=Z {m!S DVPEJBU\$؄RIA0 1r1Epu1e\0~)Ua=$)X&1?;X1zoS!\1E q9GIZAr$ +8m;TL;F:rw|;=@V*D"h&;p0&< ٻ´Ԏ/qvGpOr]oA.nÛ;Vi턟i[7 w;;[{<ɈHXh^A`f;v>XO&-F?HH\I<3"g p Jhd|H0l$CWPF@ℜd[dqxHl3dIɏ#KjʎWuB]8lrzxæOR? 
s81f9tyiz2[P # 洶_Ƙ ˵Ƶ]:.yйKrpk!qĤSڵ.5y$J .`o'6EG.Y/,ky.1''{ƞ@0حf5m-კB{KKѢ؊|Nus;"L>'LVvڅ[8!+^5?Kuj xR+ѤFp٬2bKMk;\xm3&FZ,F$u*r˝Y,J&_&W6z*ҪmB;MnLX:e:)s%epḢw FF7[!y0F"E,R"EZ(Z]voX}E{͇f[uwrmGwx.ܴtydܭ7 P pqs1ޥP} H32f}|/4ģ#b ه-[Aq%#0ϙԴ,'z4䊢*;^Y'Ԩ%8>+O7lIuo%.GlJɅ9xěe S؜^2px^83$!((U_J5faÕظ-N)uڑѱcS{=~L!^^HܷNL 8fDr`,hAR:T%?\}p9{ړ$ Aw Gw^V't _R$Ј$@v} dgSbzDyDZIK}`Rj1ays*8P1oy;U~)֯d_HHs UX{ =x)Sv4_Or^rN'R:%ӐF4(x7^9O4u&W$W@@B2ɸG O6OT|yCUk`%к< )C}a)op JTD4q+M#os_ݻdSp+>ĬgGk}sFg.hƷ1\ Lģ[15ZNs ;mʊ }j \rh9ڨ᪻^\Ǿl6}_A[5sj[ 9TTb߱͝5H*yjUOMhE5Pސ@fɧ΁Hb=׿Jmc&p! fh\&Y@nzTXclXXe_*wM^l\g {לX ZgUs\yKʍ+]eͼGh᦯_tz:%ykz/cJn=_ngμ3~*R˖~Ue|ա_wڅghwo]kNm.}snZ̥`b^ cC8 <!jO>m!E':(F,Z#Cc]kzFS8d ~I +0?`;B V(!!}%)LcN2rh0˕LU>,]%8er"9^%N0Y ,WR0A^ut7 *<̐4xlᤸF-7 J춶­޺yfmΝW\qŋ/'Q 驦_u*6?@,hJ@唋:=W;#p"Z_Rbx$fڙ8ln'E֜OH t6Z[sk06-|ێS͑Kg[µwU'*ã#@.*ÇGEOj vm?p՝^ Ccã#ݴ,?:xh`-:={٭{kstS_ކK/o9<!~DCU]w)"˜i$U0 hyX[|z/{>}O]\u~{d]$43t 8%qWPĂs&kiC:(JN;Lew'_rWΨcmj%Kzɔ;H`-T$52HT}5S=ħ *b=`286}oZo y|s~o}[}//;{iӦowkܹ hbkq:& m\]R>2^ܬऋ:Μ7#u=ɡ͌$4KN3?q]tK ?7WW ?`G.N*q-Ի~[w/t߶\e>w=ZDe+^Y?dC_طl*呁#ܳY?xq߾?V]pjPkOl[:>>2|c}WdŽ"mw}CUs晌ÀsRb+p 7Z8iτ|h}p}Xߡ-5g}_}̅ݟ^}z8տO->#Z弄cMM N'Lv M ܏Uå脽4:mbɪ ad:Jz7~]ޢ]FyGME1$lix/Dk2w85!̮Ir5)˚I*d~SN9~n喃ٳ{K<_Y@ IDATw޹m۶pVꫯӃwЇ6li'[uFJ:Єׄt0`g@Վ޲p?7>(Qǝͻ`O sSpߞo<ԷįJ qA}ׯ&5 FnCggL.k3/1`ɘG,鉬_ߵvpΞu] Ok_v}j/ Qh ´;Ζz*S/fg*Y+NBMUaHs2|G 3w(+(P 9Tx׼jI$TgNS,P-#K}F`*'R` d\PDʦRۛ9ʠ*s:Xׄӌ3jQ`uk׮38~ӟ ۧ۷o {{h/M. F.՟ h3E:pdtu;^\~g/-߹}^-юcĨ޻-T߁{tvcxhnWPz>]"bG8"-2Wk,k&d$(:2p{V( @p^;~'N3KK ÎZ{j/}rѤ6ۙ~鄋׽zacOut.tZ @etl?6~OGYdoTʯ-?r7NǭFo@Z5J>(7~bQPϛYD,R{toG> VG@>rtj~z3?=+^KSu㇯9Q޵ox`$26tdMj' {%IҒ8%b _RϯLgɪRF$>'ЋrG;ZO@>]ql:cQO<''̰ޤGE%@z. 
Uj&p7jo֭k?u\9#/>fmfiM`yA.Ex܎aq}6Ǧ18n{ 3DHĮ3 3i{BG{v7߯_˖8'(6Mz`&r|;>t,YK3 @6q_o8=zso]*J6('Jme8s?5\-|d`xՆ޾Ã|8ݪ$Hߛ/:G=TFh+4B90P_eEZO$x hy( 2ˁN ~Zϥ+~s;]A5@3[:NottF GAFxQV_œ7dhGd*=DV8y0yK@03u]pgA},f͚??WRPp*œx$Olvwhd,l{=?iϼGͺw×n\uK?rqg8@oe8ۮ̀GcxtXJɒGWs?vpK%p\D`ޢnHJ=p+4jnEW7c@zB.YL @8X_4i6' @ҝ'|T;-zh7[[=h{Wx^0ۖ q%@ȝB`SkGQQ;zЫ/m9L05n"AMTGpoQ"#M&G^|9 Om&EAƀZ~#uAuCZ{{ 3 xTenM]~֔OkW&?<4+ 8'$72+m&`gj>Kre8ێ` NDűȁQ maNrJ^sj"-bCF87lt۶mW_}ukk)S?pC9s&Vo;s @nk+3uJjr=bP0udŶk{m኏]rySJɏwof3kxAǩw9oŷo[{οIvѠXltV"3{y}z:n?H]%qK.#~L$\Sg"5>l%@n| @7.Z"%@ӿ_} 5 {\˗D @8v7%7bc?YwƷ$$Ϲ,H3sX> XT ̋@(^.Kچڇ3Fꋂ)O<_v)]S]ëGFGpuڦ\'ۻPȫdV9USQ({P- 9Gbt,I*i& d|#ǜ,Y$$l"Y2 R cҔr Hq)-}r pv4  &Hr`mUδ79jJ3 G+V89@8OqttM;P\&i SS,9zZWTg_S?XvE`=WǙ+,+^̘v7~}O8ppfwo_w-?+x]ڝэ{?튜=}W_&^IH<`,hߊ%+dKft| RR>D6XS+ 4(8OK>-Z(S'J'T'A-:ʕlI{gPm CeLіui{<^Yk~rEClB"-p4Dm<)R#E=Q7Şb|Sf[pgQpzP;vNhTR>% gmŜC9?XD&/9qa%iN7KEpFzlUXfB2H*iE($˓0,yapx40*M_@+|'N!'̬%=gJN?̍U <2x/?2:X};ؑ8b Z40,39-\NzcP1an'~Wx >O~ﶟ @x`R1\0*_]3<FiWGܮ o7UN2+8m5<-\Oq)TO96"o-f+u+?kWP8{hxhQ$]3g߰깥a Rp?"B#+bm-Ib"9j='{e둁ãoٚ~Y:*዗>~0#?7h+}鞭B¾KKs,582yӁ]Y_ta{3~Bup0e/t`oܷu#WTVo_ZSg?~ +ZtW{HF}z(0v~>94O1P Nn5̳*٬d2:BGw{=?^wmȚ zo޳XY ?S{;'닅ly8_;]_zZ;sh7f]Nk%@Rㅝ..X-}G'' aL%,RFpqj c3RJi 1' cdih|&F bdŒx J+$NJCehdl;y~ف:ys?ls]vKwW>FĽPu=v=4td},e hESJ۶m/_ݫG~z8XpaOO׾=WX߷o_3fXfMM?Py!Sh,pm>7yC}ڳ[9fO[_FÁl.˿5~G=g:7>NqB;8:É uƘx=d }7~Z „HbZQM0`EJWG"H":y+t}?wygrSDu{Xgߺ6]z߆XUɇ/]v7޽9:6]𦙷yѦ׮Mb7o\VzNo^{C/oYWvv/ܴf}vl䞵?{oU/>{sP( H(j@HH$$;sg{Uî];|[]Z֮]{V]xO  yw=0!H& 1Ȝ!CmWM |nqLG$i >h+k-uuGX'芻Kdlkz5]_ SJN~x@"|4;*ӫgؐU|exzAx%%$M)mIMcx[0?NUZVڔEkgIx 3T^qmGSGH yHd]V=̕}nKtej&΍N?lN"%s Ggyƚс/{cՇVk[cwcZr0a,fȰ%$+2w0N Nu"[n{Ku.kmm޽??;+\~eoozn XiCwu_ oe\W8~mg:{S{7^܏|cUpn}Nc+x]ἳdx5S]b?;.kɣSO$PIgsonT=i`Eu(Y:Ya< +jBNBu|h]D 4ڳ@,1L1:@м5+(nxbb&DuE?_uccch  CDӦM_ LG)`Q.t~Jt,1`,S'X}?82 kw2|:H\?s[o X1ڌ ڍ>Zm lMsFR3Ӝ'i\2Km:=r~i1HX㉖J;Nsbە+?*''뽐seRȉX *u<rjGhQ=)X(03R<Wu< P IӛTpYר.yb+Duc@[ɝn9C*3%HX+8s&L0iҤGyd߾&fMM̓>xL0Eʺ0mڴ 6b炢Wo匩]/ [Gwm~`$ݿ: 3BZuZڦʥ<羷/w'LKŗږC'Z]@;$A /IA"<BP+\⻃x#74 ꎓ$緽>h:e۶h55hi RwZb݅( ,|i۾럓 l<2m[;ƀf);H+c 
b΀}vÎUGaMos㭜4܅ g9ztF[<=̕yes]/c*|cewOoYKN,~+R%ve =>5y,@3M^RwkMm{t'ReWѩk*Ui_G8p@<888ȴ_iiimjjҿ޽ou-9443ut'%?I3{{?umF]AdIs@qlumZ%Ek>3-ϟ3L=o~͕OzӢ[zlC$=4p3ݗHd31|y;TFn<g#ŗP gаLb=N?S1|wDxHHh>!k%^-+$e?BGc 6") B-{3NMur*HȖPT2I7L|aK*Ple@03Z@׮8a`c\/8emtAWEﵝQT QZ]]-gOv(dKvi}:ﻖW#왆 02| ̎SQφ5ѶQ}4q^'j/_Rw {Z筫96ttw03̦҂u~9#x=o/{Lv[ +~`Sx~YZxFOFS &0i(D}v6lC ړ |H&4mo7{[;t<' !Ш-"pa!ȳ& YKq)< `2"հo]KF34jDǍYGOk7SG6tjS<:$) 4ʏD*s4G9,/Ou G=OHtן'I$I^=' ]W)@C']oHd_wZG > E Z7pu=gNܹ5=^;mibC>9hMkZ~rՇ\լtǫM7x{.U.cޞ=zl™ IDATNNH+o%vZ_ՍޡUiYXG^͵z;*]ȌJM{7GGN7wc4Kh{:}kdJ*R"D#ZxΫ֗=+>HpW\8z4 Y8) />-QH8b kĦ@C00@fZFe xa #WP;Z\ETJbۓ^Ƽq~gtt:SUx]C:X2Uzܨ.! .̕Isk\Rnʶwvmi:ԙ56䉾Tl fb>3Sz5ih>ᶇm}W.`23>~eZ]bI=o瘖Ler97'JWRt]qD'T+8I Zrϑol|Ⳗt=S_V>IԲ`u7=ojmu}||X63 dar0X[ %ZiՁ jKza(I")+ 5 3ၣRBwRLAܝzb/H ʨT&\gQ Q!o'n $:E(4 d0$\0338G^t1ͧFr}CGu~[?6h7>:}0١U5w諨zXWݨGaGf NsDxIؚk}!K|$_wnaD]p39E$SO]΋\>M91ËHBWxIbc˩2Oo͉IM!ڛx~]{鲆jE^~Vԣn_۷dmڟ[{FGľ7rnBG,Y{xpݯ=ִZԥ&"OV ?L6#cťXYrG:l h:q"d,gnmy?u5Bԧ&΍?t"%Qcl2cxys۴>G??>rsǦ@ˉ .@Jq$'wӉ9vLܺ%LkGy(Xop v!~;&*n Q' &'* lzD1$ Z-d |: fy}O8o fZ[F m?Z] EǐY&66-e֝stchL[*7d 2XM#yeu!ЕL&+p'?IIQ(S;*u4XPq ڒ((|okh: d_ 冃!A6]"!x$ ZK`CS@~!{N#hs!7;2\GezL|.O du_|Ħ_ڌ+LrbA!0( ŦcW7|y`ĆXi$kӬީ_^<8<)$rZ'N(W}yr ෌ܨt٫fU( S".#|I#}C?QKV%hMUͫwtwl =<$_wS5Ik|뾷|n? +2`'j{Ў=|'961@rs?6Tee%R`h,1m}ݭ}_K.- Ke4aQ /H$Ŧ (f$$`@y P!WtgbƘܢR\ASg3܉T5?/svnX@! ۩bvBp|v7:A6cyt{gmɗ<}?IS9w(S:90yq-.ګ% w?YOM}jؾ: ı+LTMer8%{N`G3m>t{rڒ9?ͪ>ۈت +VlHGs'Ljczk?w,#K,-S^.x`Ld0 ) t[&2119[e rD Nh &υ[ V CFV o;Xhs>;!?i`ʁ13ͭ onU6#_PJ3>7*N>rU$3g |gҙ\]T] 9\33$yDK5r`w*US?%<$uVRz_Xgވ%Ȑ/2a/$]p-*wP0ٺҕ:dOZWWkZ{u홼'j'ͭ  \u9s\xu+>ַC={ۆ{F9<<E("ۗ 3'Ҋ 9~6%t(I)CRZIkL_ЛHGlwLi HPQhj=3LK BR6~ u?uC T>.!2$)F:GV!?B-Qf:-tՅsƌS+yOF@a (ARǵWxG)E :':ͽ*ot: W7W~,tʪh*` 4TϱiFs%u%dN~nd]~*nInΫپQff}5}Xg8|<]c?Xc'-qx{'/;8t@RAwyn*noI˫;tm=Gz|y +{sz.~OHTR)Qmxj$5C`~Dj4DZJ? 
,4ׁe!c'Xǖa^b d|oAglɟ's̡ _jbך{思8(l:l:fn'iW}ݾњW_aGcJQ樼\Xœp~v+V:NXaL.zpUF,z O{, L л/=z !LǻFRYHA) ǒʍdTNOWor_BNj{hUn.N}5!B`:o5u׮skMs?:֖c[jmCf939<<ɖ*T妨 滼?\T cuGs:|Y v/Hގ{ fu\ +#$:'*iY9A1 W!NӀ p K&yKa<2|&ϴMX;6Q]e9Ԣea;hyelu/d_oϚz3\q N]sBN&O]Xu0tʼnsyepׇ}t/|S?&)KryY rי+d{ ߮ERI+yVuV蜾G{(3zqhMVի' }C5V9Fg@8/h)>Frt}sbCt}6bf-NAeXǬkL%{?jO~\b41('Jn(]22*]§"ÊEPr2Z s/F$7`o\NX|Ml.+ϝ_C٬7BW؉T.6DZFũqIMWu}[_nWOZe<15Q:i7NT8ŗ@ZW.YX;* j~|x_p*Sx|-AJ* C"ΐ_ %&@I~"|4c /٠ ACDPaG A4y l Z|>਴s.3$=־G K@+[]Ҧ\Va TŮ*~ڒ;Dzy2 ZzX Qu дԩQ;ceIر{?~Dav[?Ҷ) @!GiCv vCTSS0 l> fZG:¶G6h7 >BV19x7* ߘ#k ӡ| Ѩ*י:E[ +݂#NXAy,aANr |ҹw˟ikBhҟ9_f zKat[3%A3ps{7h0ƅ%b ܉޶ p |M/6N[pٳ"'I1`gM\tQ.Y{+7\E t%J9ab#CK|ՙQ:FYIء* 8l{ gL%H@2b5AF " Nj^C% !=% 0o(#l6SWr90t _r=6xgוKIV,._Rή8c_Ec G[O$tGD^ǀvEi$i}!tb09iH[_ `8p0031c  (iģ, !czGlJA䔱h^G}""S~1$!2!B0˿u IDAT?a9 w&BpB>p@B["wlŐ 1|YYL υC ysƶ@ԯo ~Cl%Ef8Av۶=|HLK*7墇]9VQB}w'UH+.Ǜ(a͕nUN qG€zUDVWE'W9ud1)oʼbh>krڠUȿJs 7.dž}Q8fY呎X`Md::6jos|-w -={VAu(Bb(m-!P2' -]`d|yymO?5O8}kDY|x`E+p`}{(. 0UBB8򐢍8`_B'N! &OKXB`h "Ŏ`!gB=}{@(0ِs*YMc熡蟽*s`]bBXuzxHriټBoKJg~񤕍")QOMq쪒ƐScZrV|_݋>o֜07[]GRW:W -]#KCG&%ڏOLWS)< vSC6v||KW,`AnW uk~¤W79sk&/=yvvZ3}nG;0ɍJqSErE5P06dYV艋 qa9V1!:2m=8ΐ K h*GPhQẃ (d[AGL!@$ P7sm`#3 (4b'Ta;.O纻{leٯ{Ү]rmlEE{?JvÆIO(7@xHr_|;KdԸSt;&_ى.AW8DXG¼ yN;uk%ӢJdOHSYȃ?Lb kk;xX9) ̹~؛Ύgq7ڇG=]M?wO[e /8 cgXkLj-q.]ךڦ/~:SS_am =fE]h 1*Vy~ .Qͱ3z#'".'Ȣ?x%HQ`3` CC"!7-W&;g} "C,Pȁ8;ECji@Ǯ0[(.L>1H>BN)uqcW- E`ֿ7 z0'o$3'G#/g}o3YĠ"ţa] ̉ү=om> ۑoM(OWI~כ`yr䦗g>x+>]t É8mV;۪!F#3ϕK^gpW⸖XHw|TiSrԯ;R$+za}){q;q rG~I\7>!@e{VMEFXY>'>#$XB ca$R!k]#%& @*pB() ~d-|> >*4oSΨE9 DNgInzӮfц@XCf@H+7ҌaJvc}kV&e3\].DSB.iM.9 ]֦D>|_ɸ҅ 0&+a_N : j|I?ũ23wd+㷑>pgoR/MEms;x3xK;B-` ;WTj }ؑ?Ӟ ;SΜǨ cOsk7NQbǴMaŦ:+V`BG?HΘ44Bg-=T9u'G6{Ը:' zx<%AڏO-Tg ,}K3mRE434~Ɗ hk JE('Ì"JWЕKCX1h FRD&рyv A-*8MB> Ehx9&Alw6oh>@D;hU={ S#*QRAY>v]lPNhЧ}&.@㰀KV}6UP _Uďh=?e-{䡆8 ֬f!sg + >}5 9n8Ο>?Knt Hawaxb3%(t-r 0U^O?xX_HC,^)LNy_L/ 9w~ p%N>su_B0zܚX_ef/7V\&KmEyzk{O=yti)kJale/ |XRj1- - WAC*hHp?f A@/@@ 4G8Z'Y>#L8T/axǒ~4 -`Cgj_%|zR +^.N?L~d0ٕ~3k+R:l2})T^~KXqhdc,?X׻{bgߜ}sώ9;g.nUO$9Ő sQЄЫ9*xM\9z\\qĎ 
=CGx7*S_ FVoDX[c] zN!B?q!(]!&8B(v8Cr CsB!alG>i k7bZ`-d>DhK!$tɃ OcRX#gj 䆻Ru![#dD;v <43I*1<>O0m$,`5PމfICP;&羕YaQ'H:l)DCnbt)$H1;Ԅ2 L$n1Lè4C}eRRЍ}cHP[71) !lhb9sU fIowtn6W5t΀*ƃN5=+ Oos\K# 0->ŵҲ 滧Qi$0SS2HpDJ0P"|PGbEx3Qj)JK$& D%B Բ:B/5j3JC@\h`0 @B;^qt!J49~Ƀ 6%O$IU 3 ǧdЪýњVE&!^ᜮgKw2~Ĭ54"Kxt;)Ryk&drF4ܪwH2{-ޑ 0in8lb G%j R l`EbفMS6+6 MWZԀlcI#[W,@q7ׇWnR3h:ԙy_FsTJ89qkMFAie.}}(pY9SO=Q<a}#Ma⒎* f \U Q'GAN }Z)>KH~n&j$> !B6`*Jl*; / x ٿhބx@1sʙȏ?Ŵ7 st:7M :-I-gϲ[ozώ1b㧘Q$X3a̻ 0ق4±N`p 2t}@(e !z<Цh aNB - ]b%AG}/ B`Dq ~頂(Q!TX Q ~cP_JpY$NwEIZ`בڠfXyEBi`uT0Mז6?g}6+'N|n]O[hW%42 |cViwMk 01c3hY= _qd 'm_Bqy» a ἷK "B #h.@^vNS?=[/$e0eIh,gBBšu'ޜׇyN@X*ujө\3o_hf sլX>YyM ?$E -X`y&AeI8J'A:体t6Z2(7z5{ʅVLADy;!GP .q0 :+0C >DxhQ 2\AG&;CņDa~} bu<s)Hަkg<{:odt~x(Օۗ'6j&0eTj:k7}{$3z8 NpU^.*l)֔CH!`2V(n TVZPKO_ms-ct dAZ2A%n*ARqP@x!? \!t_4/"4,xl%FHBy _%HTPIsk*Ҁzɀ>*S +lOwjOSb͏OZ!EY T1>vg|->!8{&-4‹EjKXH2ּ) Q4{`!@p3L*Ƕ:1 yQJ+Nm [BH@?X"OUo/ԏۦ._?c] 躾5'JW~[?n|Kl_/[5]z# g vLNJ5j1'R8"y>8ٛ9w**ubC8tUXWI}ԣjУu+1BV;َBc!ke}j?NõJ 8C`,4>$(!*/uH2}l9>ڗL `$ 735 IDATư@ZWى@e†3m!АU#X`!$"\JUiWeG_wړ d;R\>7.a̽(1OwO(thJ4ա^gUd helCOJCl%G_#fbNRX˧L+`NkW?J3'-qJⴔ3Qd Q1E 3?75u?' A'pѡN! YsVM Y2   CkZPvNv"pO"p!8c @{P!E~ ,@aq;*ੴNWcG]eN7Xg8a``Ŗk7ZTBx˖)½%~%>J e2mѢܣUFGBru)#]$F Xh! X dҧ>6$cVjyƠ J (c 0Ἄ mB $y 8>o6\qR߹"ͯ`sz?ͨ8]˶]ˋ~o-O/^3哶iL}zoN?ɀ+C:1 )`a;]]ΚͭMsl2Yqe"lى:LO60]仦#!|L oM/=,ڍ_ّWt\.aSO*Rr;0TYI WN©MX͕<}"RN_9~c`*[B d-z9}Yj 2؈!N*JIūΐ!(N%A5 c0J jOYh`,[KxUH T}GE#!袶D3i^\v=xDc*{T,r1lSƮew'|r(Ֆn蝝}s6ҮV09X/ ]M]ݸFg&TB'OOb~ 23Ia+Q>(f@epByH`Y*y$5$3htw=OKŏ` op" < T/\~NC'{FSNv9k1  >|VN=eeyut&ƣd)\-'FxMKoS+uSaB*䍫p+(i?}kM wܚַd%Ha9'nJ; 'ő:^sE\8#6E$lӵ9 {`]9]k8$VLcpgVr ^g ;ZC #< v ࢏429 B./H`;&YT|Xh!dH0oM ulT}MSυCPt%%l8MArQ+AJ.#l{lviUů{-]MoP^@uV  g8 tIQsl/y?DC RPc~X* F ,%$R JSȷʊZ.hդP7 1ME_]t킳?=:f5km>lOcXY7{~nc ѕceYYOr&?~90w'Z 99-=A产GK|?6]TgKq_ߨ]R\wtUtjǕV<8 q:>U}~ҀhGNȭa^ЛDAm?@H-Ɖ {@ *#x*a}`5d>Ӆ&$B=/ TF pW^SM93.l_E&NHO(͵}䓉֓לXki]_o7v<^cƧY!U6gd=Cٮt~r? 
,Pwz΃D$`, 3Wb}fD;PߑJyؐRo9Hc/fC| ŐM8,!V}y31$s8d¯DB`v|2{dӁ, 8("^]i7/<IoDu``\䏓r<|hU3yr0c=M'\xuU֑Ȉ)Щ,@V^NfWn98ّ97)I϶SlP:8$m20رIȩ8u܊ _Kؙ(YԙCDIrp@y R @5h] AYM} 8 0vw"J? (]@#aț'H1UwBG*yRw^뛢yYu_϶M8>P]m7fd|k0԰:e=|93U9GHR2׷>p,s6tbtGjQR,40PҕˉYJ'gbVe;*a  ٫f ]BrIg0PPQ_NZ,( lş@,.y{ϔKUWV[bz"뵓Hl1ϛn?Ygpjy8.wG'Wũ@ ⃀%ߙխ+pWu:(l$YRR_M u9/aT{ JaDEnp /.(_a{I,\ #@-BH +_W0WHdۇԁQQ<v#m0L+NMq~H;=N3<=|7߆N]p"ج<pT/sq;>ñ}J')GOUn?vpeN+P>OACQtso:\^|Dei%A+=cX!D鲰y!t;BH՘9B@*}Ÿ+K4A|b(P]0BjF&+?2 /~]m{W~td&D—yg5| |iwʺ)^fshm}pӀZg<')7sHk*?5]x-|B㾣CxRL 1oLc8h5JkdɜĪ~MJB**\S'(@ȋcc*<ө쭆)Y+o\8C=|*B+iल 3U[dUiܬCGY:loTf uIkB|wjSȃD<-&RD so}ݑ `( 'i8Ɂ;1 *v85: $r +Zl3?9 4a x|͝8WՙX!ڰS(P5]zMH4B0Qlx>13p JCHruk₏Ty[C=P Anj{R0J,)܎cG 0E Og#)|} S; 8CKx]RTY(^>|5P^t;~%]'Ru nU]66:Z MtvÖ? L㱛FPkĊ&mH@+11Aқ G>v٦ jt010~(4ܫHnBO߿{JU.o?פu>Τi03{yz;t՗B?7zܫ]Kb+q4 K%8[^Dώ\PgBw&~d9XZ"r#Px- )hu\7&!]Jb~%@V)3d擱jha!oAbidqwa- |А7q" (g +"@AG@$x,i)gs:Z74[u~y3GJv-l 8OrUK_-IF_ԑ:C6Bc]p堰 SR͓ԋ:T&z{'H?;y6z;'E[r"oV+R*]0PWi}^)my_GؚD{bDCS i^zg!`<`^( پ@:qv. !Jo>MAЉ;{^\'rBlVh^h$E"|LD.@Cu:P\N&*3GsKSpw OuJE3gے]w !_/`ikiZٮe}#hGs Wr'pM_PHƞ0Լ5\hmǘ7#A/kxl}Kb $>b5DE#Fp(07l[DNt{kמ(E[+xS/]f;q$(s 儷VM?Ϗ^;Vrٽ?Q) f[_biFc^oNxi,y0-3LH(tDNxuS ֺ~\KN p=C*tH | bԾ|?N<\YUc*L I&`2?%ByP+Ԟ ! kRK/%,$B ,+oIh^eU0%!cA$H5 |oc@j  7WB>@qKC4GOOnauNW#)WҎ}'Bޯ\&2MG6Xu} S :JDC9Dxw2Vs}maޏ?B (YD0 AkSJ)֤bG 2 V~_fg&V !`h'Hką Lb| qtb31K6_|3~$/Dnt`} x_l27 5?nK̘`\}h*wJÝ,Î;adsW;O[O'2ټ" |]9d)\.;W6\iqկ;'_uy]=!"{** !Ϯ=I ~\@EUtEͪ6qoǡ+I1!obC oVe2sRm@$9%p,VbB qV98gy ΧYTOlpv9s8XqT+$WVD哌ςPoV2o˹eTvmHN>2luozߠ%̀N,XFyll}OEst=1D`:H=[<2rH/~A`䟆Щ[uِr5!&D[ѷB3Ѧ" |&X8.1HŸ[zte_'#ޱ" s^~iW;_wњ饳؇z cX;39$#7>m359O :->0OεAAO,JR#1-o~GpYWr^(z*\JP$.4]|56\?++2~j*W0i \@?b`4Xp?gi8 \Vj7`wƆ4pg@xC1[j5`SU@kk=Mc1m<; tgL۱/?=& B9k`3!;1KEw]#L: f8i<bEUˆeڌg?⿽87&^`ݯL` Btr9 Y A+?}W58 cp8Qk9SlYE*1껢ydk+ۮov䯯7 &)7T ~+c!3ypF:*$@n~{fn[f^[fN[f)P)-=1Al~1js-Sび0:QӾF'g9k3yBwy<үK$-=zJ7SxxN2݆엜Wry^noݗl ɛ?wS%>3{Y?#À(Qތ=]pIʦ5&o1+AT RT&=پ.]EѦD]($r?]J"JPP6ʯW!ʇjA8b&*~q\t-Kj?Zup]~zqIQWݾlbL! 
Dm/:gE v<#W^ (m1wUGזjG5eh/"1£aFt+1-Q}# *>Lp:}|LG[˨G۹URv"`уn)$bSQp\Vvfw a+Pik?1nvt`TeFAIT2G @+Y^JTtC x2d#l* <|#o [.D[SnGTbt;2[ *xz|cz8{ c%=ޑytخ>gocUȻbCPj)܂CSb4:=.,eW "WN*ҨPA>}F+hV!Y-@;m/IPBa/܋qK̆EX| _WI5}z[]MB:`\zR?q6 NaP[_"9jyRx+%N9V) ]'`՘&?S#ת 8_wdh8PƦXqrb0ʦlͅڌg_8x6ߞUjt &֨ ~btj9O_s? Fp͓laV5O݄QW,@7.lwmHm&ܬwa4UM/N͍qjEYzb49,Ɠ'/kOog&>~v7;;AgxXbG$AlV=u,&CʅHi-S,SW Q~taKTuMnmjh__o}aT (ZpY&2|Bfƹ݄i+Q0+RF½Y N1B 6Zqf|(Aǜ:O?u aacNWݡ|QgoݾXyx>p80Ozz Uh Ȑ%s*i3!tݲhjavJYϲJ; ^k5{n<{SUϩy"Lr"t]Sݚr,H5ua͐JlLr&2!e'Rʶ5ʏ07屝 ʨxhplך=1j's]s<dDRPCͲBW׋D|1|î=L{ XFXd2nAě& bRee hme2vti1<4Dy~aQ\4^ n-䳊4}$惫P?WEE%xN8fD{~5;L֨_vfN.T6=tE'>A si&t<˜u\c8N k31;ě:8(_*hۖQgdcTD[C΅)T_Xm.][f0F'r9um|] f$(LR Aѭ.f| e_ T2bn1;512g΂m(9wg (tKᨂ}X.-OlҞ7ٴ6٨@i'{&O+~YΖ 3QMtFPUGISYSY/a"|AZCG( ; K3)H#>ĤIByƍ[s-z!7R{Uj5ʬa_Z:4.Â}E I\>VޕZ  V5_ExL=a)6G3_ªH -[Ffɫr٘_U??}6m]R` )xնjTRsauI*רF5ί/%myj O@۶ZTZ\ryh6ʦ wOPF%gt Nm5խ/#O}9aQ##@rCȨ y8h]٘y`UE(<#].jry7S0Ӆed#Ϣ&/yi BSam,.2kW avQз5 %Agc^4L u R-@;) 3 %rP(-Tȁ[koS G\ዟj~W}s'moOu:7~ k':[Fߨ_Y5 ,dlv(.c0{lk5 E?c{ciP00;S <;1䯂n ߫ yK !fv9u6.Į!!\ BVxW+$ڕ䆥MѴݍ{ͫ5mWc-[ش_~GVS~bƪhKtYcn언 "a(+_8<3{|ӹUSn_S 1U*llh<Uf:ХVM A>jA /ܾn.{UDv6#tb7#n6k_[Yvu6n ҭQB* [nR5jZw7{Lr(3G\D&@l[ Fn?6'6.废E1  M׶nc2etn궀`K--oa)dY|F61pd{V Ͳ-.di 7)V; 6D8V~ A PN1&u>)$Juu*jb+1#uj  /D<)Q׊_.meRܺ`Ob |f7--6E`\N>h<2A~$Sm_~ؔ~pdҽO?x$ɨ^sN2NW^MMM} (ŴFJ&@K$l|ӿ&Y OiU 'f*n:٬!^k9إDD{(W]ii\}7Iصb$kNdh4Boʍyk?BVY zaR8et8&PH@-˒tZ17 2}^i6;]Ļz[F眀\$}d<<TjqV[qШ\9YPPŮ)^d, ԀؕK;1*N?;lF/8ivT6q/*5p<?@zMΙ3]<햕_.3AHo gB `(]W|~ qتٌqs8;7Oƀk9O9/r?9U4;SK<:I?M6rF6Uœ6ǔ|L ZlBtKabK1˶(:mX=!i} q2N# & Va)/gyY.4O0le\Q,P&U壛W"zU= . 7:taQ=QϏ>RoN+@t:S.~*_jbWU<}Jt\g:5g:0 0b3ͫ£m[ݎ&#(uQem:F2S[}U)B׭n~v8ڦ}HP lzt<|򅆆vܹQoV"E~G(+55nݺ{~!\b?GA  X61SzM Eůxcw:m:gQrA/26ݚ=uͫTU]Q4rͦd0`UQdvq 5H be+ m[CzsDcd.z;_Qd[@ḃh.E8U ApfE! bA#DJ3Kh*=SfKLդ@Q cǥhǿcě̓NTv ] bTsJ/?UGOM%` ٤S Eo?XpGk֬)V׭[ЛĔk)? 
w^rƍSffŋ-jXvG"ׯ]v}xO@3CJJ <B }d!$ 2NWr x`lSv(Sv;Q\#`6mb,dôQ;AՑהVEK]&&yA8x)Q&B $ؑ] 0| t͙T4% TSI;M6Ռ W։(| !Whf"I0sJu:#( 5$l;ΣӺo *x0̦8D2=;egoO3pu|#%2j.8 p-aY7 POի:M6 6 ӧO||< _~SNM41L JKK7|A|J,پ}]>O:ս{ȈNv裏>쳝;w>yd-[f2N> ϟ?߿٨Qvڭ\'@G/BCC¾'N0&`xp $f9Tdu!A@.ө,7ۧ~9;Bd4{^H 6]jWզ{yURbcKba:v>,cQZP*ڈ+*QC RhR(ڔvS"{9S3c19!`yHDJK.F! ˒[ ɷA #"Ix_(@Z;<|6kՠ! HjکvKyEk[N=t0%./1?5?ɻFŋ~@U9yE߳gۻlٲƍ۷o@@W_}ohѢC.\8sLA2dHѢ+kw^A+(P`._|Jv:CڵK(ĉK,9gΜO6P 4ixbFJVK2k/sBNHL*?+-Ra.c픘N^fBvtt H7 lXj &Z8 9cOwi{+Ȧpo1ewSBn(*A  L҈xKH($%AГh; .pNe3وQVHV0mly#u{^%_GC ^nJ%3+խklu@fBݧ}^@K/y7Ü/&9sqjYln?ܤO] ,0g?F=I<21lje}]  n߾))) aaa[vo~q w~; ?Ǐ_|yx>wܻk2RSS޽nݺ BNaѠYYYHȗ/_Dx8hw܁ L24)Sy >>>-[<@|RRRZZ= Y O:ȵyM^QOz&:Jy8W-l/)xGغ`$[e| WOS%Iʡ0 U4&dgBr3%;F6^bYJ!4 r 鷀.9RPǮHذIONф$'BtqbBC t B7[8tVSO"$ٜ3ݓh=jB[.A9=FqT:Od5 TDݜ09xo w͚5AL߹sd_oQW+V d111oիEáEd ٿ;B@߼ys77;v\h@^z._|ȑr\hW>E`p3!۲ai;g269k7U`7*ڨP-~vYϻ+U6KSicL7'$Y@rYƁeoj3%J)bOEQ& !8Z Hl>7lr՚a-z-1QFS+ܱ#ڭ Ti@؁R{ʂ$;`P2.kSsN~<j,u~)v|& GzjӎSF $udtC.r"Iwu&8@䲭Tͮs`~HT9q-K |׌{СgϞݻ*Ujƍ=z/^PBqqqAhѢ`@dlٲ{GܹsϞ=u}9g0(ׯ_GE 4x@)/&{ィD9sޠ{] IDATTT ԀtFE`@Mĩ9Uo83ى?8ag¥ ]S3g)F9Bȵoo=pB|gj32smPlq g~~efׂ.&m@Fx{矗(QbݺuPRnA6֯_/iW_-SǧhѢ ' 7lʝ7ok l6ۮ]#X%k UbWC(bhiF#69GOt hM/t}cT6«FtٵPL*epX9 6[aYO>80ӹԌ'īZ&UUm59[^7Щѥؙ#=\yFy8 (ƍoVJVXqĉ?D_}ԩ /^xGxIyk׼K;wLKK;voaX˾}:`*G3r!-[\rȑ8Pܿm۶iӦ `2qe :=0jιE;>q"W}uj`]T egʈғIvdc,1%(NEÁ XƈB _FSG7u M0XXTr$AkVe'XTҪ2Xɪ‰7Wlcłr[y1mV\>ِfB3&f:L4;FҦGmO)a(,Jbpv>αJG7a!APOt!:$c'(z7boO*wShfVC1VN@K?=/[?E?>6`@bJ $%Jb}۴isǏޫZj:!E􌏏tW_*Hꐡ@J]6ܶm[777PyܴiS6l_p͛7Ν ֯_J,V… 33 ca@8N\^X2Ge^2J*ڮ>LЦ8I`o6EAG&D." t0@)L,PJd9nN HjҮvؠI,u.hW/= "vDXSIOsp*> XmpB51r??I/uq*# w:/͛nj:AĐˀݻ7k֬oN"&'OO͛7}O<pڢE P VXŋNz]\ =B̙?`B!C:t$XᛀGwsHϿ79w/y_ @oa^(q\$bU,K30jAL6z#Yf@rکmiBEҪ%L6y^ h# DI! T8#DVMq@E'1CdOĪ}zÆ AСnwHgxx\=rK˳/2lKz\\#EcS$QbI06ǎ+P fĪ7b1A:.);{FcN$FPl֯R9K*΢A @boЫ쵬%Ii8ߐҸ䛃e )VY-sBj>~l51P6(^.+r(OhlZ僡H]l<ڟڪ1he]FڪiMQ<B-O,4/=|Ѻd['ωǨS]@&hl8K'?Fdh*OwFdŅ~Uɓ'm dj׮7|K.ٺ}ʨIQXPiIUʮ> pFŹ\ɩ;2-lKiamF7ʈe،ХV_ 2EҦg ia]pvZ@J dB HlxE*I`$ d'-86(BYbSڍUBm H9@WWվ:Mب-y "5}رj6O~gqn:l=P þ:t1gt?=&=omSɳoVU9Sc˫9!  
DRWds)UEdSdz7˚xihq{Q#ۏ*F.(\u̙Ŋ+QĈ#>|#f/OORJٳ'GiӦJnݲhz5} ܸq#))$[JJJhhh|ƎJ1%^U_mT56޳wjwUOyxSIp<0.*&姖=uQhTD f#2xHK\3Jί[To٨5N&`mà_(?<#b؃(CT韌3*=tnō>Qˮѐl46Cl%F۪h% F$@TԒ[,n3Uށ&h\tf?ؓ!(6j=K6=̭T{&~n&bnanaG)Ip bO\.FV1,9\;  w3y*lme&~}nkϹLJ&&N yDX,qեmRԵ\T];r b2.WX]gNB lE`!~O@Nq@!F}i8s?xLugnذaFm;lg-w޵lO>5!~U$ ,bYĢI?M~)P'Ru|V3mIa?v#ZXuV=<"ſwn& 4)f, -1"&*X,t jDJ$~:bXߊ&KBB?.TQqXF)xAfGQġx> 45.oFP&UP괁j)ܑmvZSx RuH Mo- BQWG\Z)7Le˖ \^P ,Z =붂 G}֭'O8U$!Sb c’MK~<'~p.~N;w2e>aa1%utf4(ٿ?LC?~ʕ+ϟ?CCCK,y=NTҥKuԩT}7'FP L"HBU2165 Dv$"R :obA]uVT@7{VhY:ʽL36EZdw4va%_ɳbKo(P
  • A6Ѣ? YE|>bgLQ8:aP ؘJh}Y!rtIG]T~{Gzxt߇-!W4 O>;mǯ5=%O7ZY5O 5zp٠>{T{4 I)c7 Vr@ʢ<6ڡM%dQ ?dw_aDl 6"0|^gđ@!p pcރN*d$mTUرcEA߼y3𰷗%,+K-mذ*TDsA03f#F{w .ܾCPYPT233plXb͚5CVkppp%|eʔ뉘 VhD7ڵkJ9x`ƍB ><-M^0`ԣgΝ;fY`A77&ht3j<Oj;*;m#9Xò&!䰚h'nf_rlSrb@al7O8c߸)lrbߨ5o IDAT}{`pPu8M3Qm.zO1CУӧM>|"WdHE`Mə]׹LP\UC9 _*38g2=F.!4޳_,ƃ{.;}t`@XLW\rСC,\}ݹs',}V VCӭ[7-2)M?,*T9s& s}ժU*@DvFi@7?stOL$q@L \ԦDiOP8f(R@/ٶؚw Ux4ZpM,hqp10y2*ݲX=}"=<# "hHwoW้qfmT- M^3BVPY/6jء[o;46УdWw$>rl<Ƣk՗wXh) fiDmzd@LfށxWKEluj.U&)i6IQv-H;tEN~D.E?~*V? *˗/iAׯ_/Wܫxb|J,'~oⱱV Ŀ+o)** t2ϟHY}-[ɓєtY___x>aɓ& dhӦ |Ͼ0۷i0ߕ)]^hbȑ'1JP+kԨ4iҥ !PQFTZ4WDǎM hŒF;jԃ7< ~UPߡ=goPi1;ٚ*7D@Um[q،2u(uŦhF`4fO 8!H/Ν 6]ভ[^x8̙3 dݺuK.END"̟?fA"vR-ZR/̻z;@ػw/B0;=zJ_~͛7<~8[4[ڽ{7e˖Xȑ#+W ټy ={}:h ߾};,j<F[.n…k֬[gϞN°jCm۶!nO]xΝ rOJJҥ߰@#.G~@O8 7np">&}9 4& .h@:gcݴԝ_dOZvh@}M9ZbӔ&˃wtdr6i?nOK.kCm渴ߖgV [Ux &@P|QZ}sI͖|G2ٚe [SO#ݫ|\Ř!%*^@hԱ$T}{NZ[%JAj``?Y`eiRJ&AlV&꽖|82C:uTZP"``9~j_޺tB jažwbF~޳i`>nM(.3:x ~g :?sӧ AV#CvHޠ_+VgS`XAm(RO?$Lhي-Zx񐐐f̚5 hpΜ9Mp>>>m۶-_<|ڵCV+h0ok?T{QbE, [@@@@¨0Zrb8 'f׼eO.qa.2ZŞ+ YOuWpqIJ4}ĉw悔`҉1sʕ3{{^|:uKG&[p3PDHN {z{YxEsz-QְZkn7OKtY{ѻ#VD}S5ʎ;FR|0*i޾Z?*;b?@Ea^i~pd~% cM6ƻ PQ[L0;JgܣZ`ޅyH5Œ$ESkV(TrPa X yc@f6/s89o鈨 Lxt%  }ٳg~kرc[ qǎAYd !K.>M6%8&MOLzL$ mjK,H.K";x[zZXzCA5 v5Ep0Vf 7 b Dť"@K1!ӥKŬT%ZQ!`ўԍ Xin7/C| 4}D[o=$+p8VxTCg~{s4S1]xAEa̒)?3 6A[hH (b_bEbl=ZHOk׮7 ,믇:h  $7+'N[ի sAHݫ R,0)uȩU'%T``;L\@ N}%~gKršjTV ۬Y;v 0Qkɓ R>SȤI@p_z5BsB;Er]"`x뭷@! 
}ʕ+ { + &o$E ;Q/$~D*HhHҥ 4OF6ERRq0#% S䲝]Ɯ'Plj9i ygVepzX#`l1ڵkpM~P;)U;СC 7:AEhѢGF18ptw <}an`Xt r9`0p`U L2| 0,LA0 D7*(?CʣG`j*W޽{IcQ>F7#9\ KBԛ)2ǂ ɸ "I%R2MLA*)(4_I>^ te_$6=kQNnfU3ܽry)%:3LӖvwr}%-ߦ(מ(a-B|PP'4B[ݨ}qP*~T~̑h>gGpHKWlEKtWyqc+K`[+JAaLt>Zd~Ju R7ɎZ xYT'+FrF۽4y3$fm޽{-Z7MF),4XbH+HLJD9U B/˖-RnߖqL9S`0Y_^K9qAq@-~*3QЦS9[t3 ,J j@ٳg @W{   nW555 T] T]rq%sl;9jݟ (圧mz-jIgc~tlЖaY=[Ѷm[O}wE$G?X8۷t1R 봏sē .[nʕ/7$ U͚5ADH {kPPaEH|2<~uV($$VZ^ۇdtp&`F1R.{g <m6UNavL O6Nl(#交V;w_t;(V$xX̙LWDI8px]~f`1cRئM3fΘ1 .lZU:(tB$R (eNi ~4R/_>z)G L ఠ#A7,+" 3઻DRˊ M<X~8I9ЃV>gW:{rʍKR8̋ &Fit;SM6?uwqPbkI_+LH*Lz]~l;!%>€`dd'X AΟ?OYP+ `z('R=^š8}Ĉp'TZ*Ukd㐮@ ߿u>c -A(RPs[,99I& B_P{v 畛( h Tu@ w 8ً;y<×e㤙P{Γ<+#6sk׮;Oܿ}Zj &GʕA@d2RDPUAUQ4aQP ,@XڵAЁތ5 5߆ްaClLh="Q '`v ;lOO_]#\uRlʼnS7ْ'O& (ڵk˕/#Gr4 (ƃ2- g]l5l0`իnݚoǎۧXqMCRvڅrR` ͿN@A 6`)S(&H&*m m$V"A'%PbDdE{*P(@n$FpkPPHFk-fGg4O[[{G۲BM|+^_ʌ:+ Ms^X-)O#s8z]*dݦphaLhsnzY TNG>0.rNNhkqH65V Ȱp/@,O\%{ܪp8hjc"0#'2Z>uTXwaܲuQ;{UW_!(vNⶊjjjV4hp5yݷdɒPʦM5'(@AC% Ž`;w%sǏ{V" ,h"R_r'Ht&NT(P>zMT )$^vX{[LIFIw{yNU-ϙBG9qI+el* wWhMh@<4CQ)##2߼y3 Y&)) >>*UQV)w R 9ݺuKIIau"0&`Z`dA]XƃnErσNFS˄'ARxL#m!=|ZJ.mٮ$%9{6**DIAOJ/ޭ{7r0v… ja.SEGIW 'B%J@H0,,;jM|~GP{A8w-|>6Y|邿$5`EԳZ"צسyW8 'RDZ_cqmk)ڙB0dL &&AiA=w;F2m+ {c$a|&M aEޫP9X7ÇOLLfQ|H7>x ÇCd2!Ga?g͚?A%0_jj`TXс@E+n݂y-C +WB|lj*k`ivHm@"{Dd*F)O䃣K@C 5Ied\Ԣ0sK쐩8AK$-*UeU,X p1c*ThO3A† ZlG4Ӏ@U…  gcW?V(hի߿?>>XF#(=LrX {8 *?YU{~]Ylqs ֯T?AUEiHDd___Xݐ, " H͛5EYti"E@տz*_%weja.]h< ԩSnJ t? 
Pbu˗QFj&CrܴixRR'Ǐ96|@ {OwA-97XM yB`ґn^=C-f.J+1hG޵+#ۮGHa@qvMRHUYF.,vlwPbXk=иwIyt"Myfy(@(t7zj#ۏJ!<Ӎ<;w|HX/a.N'T H€: ʍ{`@UſJ׮]CBB`^(\E ׅ ) P-#眯\2)&N:A~@)0#ad}щs KJ IDATaرcͷl *U@)O>/jԨ?aB_**U~psv͡ h<ąwBN!UwM⑹ys6u\Fu)a/5jTΝA8~ܶmk޿Qf͚ 4hÆ Z [v!>+V̞={:t%>2eAy#7' g)@B f6ܼK@exg GhwaJUx_T#9'-F-̏_r"?, 5}wyСb]k^nE0+U\9l0'NԪU xzg"nsXN:MCjزE˄% \0 ݻW^7Ic)))'0<ӈ#AT8w̘ѰXϚ5+[:A8q"ޛ6m"Sܷo_iխ L|I&ԪfnJ5RN' {J}CL|M8rW::)( 23k~^Iw &@x`NnBJ5qXw5dŠ\[0ړq-6h?ʷCV[ ?|O[QTƠ%s^/܊7q+}x[$[boD]\|[{bŊ7|CAB|`` 駟t {n%`V]2,&& UVGs| tK.EHOOD'`ygƍDև@AX9d.T-hjӦiAC(@5DWǒn=f,ȩ%6ee8rC NN-SPFaYA%z>,VX F_A;v,ij?Gd1Х!4Bi; h0 -[>Ab`pBVy8 QM4!rNI۷om@RoܸΝ[Bb4 #Ó_ޢEKt)cl* [ B7Lə*'`|9Qb٩"ͬˉFz hB|H`7;P xY0q4l-(dd݌zo?Qs&IVt 8̿ kV7 /'DZ^݄*Ui3?ֈti[j}]Vc-ɜSElNW'`DhoV]CjFV@'VҔ 6ˤ`$C ?-Z{ ;ACH}f&= Tg%:f?  jP!Q`U=#GΞ=dp{XPE=zĉaO$& ]Gݽ{ܹsP H󀁨(聗Jj <GTp=:ԅNF{P4 ؠRddy $n 1٢}kzQ!}S_7 +rơ} O?ez^Ng)אh*!*8\b@E7LLϜfCuJEGOlD{`phZ@qAUz(  :H:`yDI@`8-N,h-3JȎxްl#MrzYO0dw$o+ؠD~AiH#:nK/%k6)òem :( W ĉo717tr;R,5Cu›fJTx# ~X#ʀTh-1[l#4NF3 ,&`n4S ~fQTy j@ PG-cEi ;O:+z| |7]Ei3f<{Pr;u3p (*dLvfS!f\] UķGYy(C]#F*~a.mr_Sh-\8qK%L&SBBH @lo`Tא0*9sx q،;g|!$ B"0" $A (""!d@(A!Uq]afg_Ug%s}]W{ޫѳ+Քϰ{VQ׿KpʫK{¿#&g ˗!xArz;@Eh@#8 | Zgb'6 G"}2yH ys(6eJ  0ˆTV fm R\~HB@0@-4+ab &!c74S]G8sxïNV<UYi\B~3grd_yT瞻&{|z;d#@ɿEo_;w\}{1!7}$ɱ;xIr2 H~#o sQxF9k1-IJVl?;[ZZ,X0k֬W_}uƌpzYUMz ᖽU [\03=<Β z莻 =歿Sݲ>=440eXem!"B4<5S!]Eh! B lTʨ2w!\u߇^7*p"Dnb{¹p jT9La?FF ЁsBg']Ho~8'moIkOk1Dk׮˗=* -[lΝ0ܷkPP÷՗/cGZxvƊP/|RTm_vٿ(eqx\x]udW_l+32SUw<ϷNjAI=Un猇l'>[7Α$KnַY˒cY՘Ӏ{x@lrr_ٛ/:f>6 v# BEhg 1/$B!4{p $q @BT.@pXLj8ćA^*c=H}{XB!@!1orBH<5+]p"Nʭ |MQ@u,IVuW lN惘e[.+Y tʝ/H=NƒM烸}ZLTlkqOkqO[ȾNdG)SH+bH&6n+v1y-5Wh h }ŠK !a ؉FD!IՅQvBNE,11U,!d ,P %Gdj8RfLtQޗteABa{X%IrU$E[Z,TΦ񤚞ynK\rzsl';$}iӮ7TNKNUgI_F`^PnIrU:W3s/2s 8K|"mU;H3Q~a`.=cWrΛA&H c0Yp2yp/<$B566^CK!e&^- B Lyh P 㭵aVtD+ 0[BKBi! 
h W̥-7eۙ oq//e9}p{ޱ#(hOsi,@?؀v"в.rZ@|`~&Y&Jw{;avBe3{  UnIMoLε % (D܅+=|bL0ĩL\fSVO;0r*MX ]Et'uKts= RC BByG 'o@n -Є$ڰnWE <«jl/4\X!j{ 1 ';(]Jc{x7GɉcY{1wmʑNvMi6kf׮`2s(I4_u戵HŽ~vZ})9̮qF#zb_P6b3ᡊ4uvh!r0צ$"m!85laQ` 1 JX'[EoBp oXSl0=d 1BB(1(:;N?`>艀n/-VwrRdLrG@ G3U2iM]!{\s!z0/eI@kz'nHaAA)rιK/;~O< R Hx;)Sxo>|86o| 7qb$N8pO0AeNuEqF#IjuC聠IwXLдiDW=\U9A6ןtc4i A!Ăf0 r"x3@B"ldޔ 6 !X@t( bfX@ [K "X~E}'m8۴*8Ivz n$+sy:~$O W<0>Ha)ntзv0 LOe$@VjC`b0M!A>+Q3qt#z]4AQ@Xikbu 5P:ktg0lq5$Ch0`7\mu?;H%jQ<OHiow5{oM ʏ_X.]T~閺l:y61|v<7a&;~MuWX8(H.G38#"Teee.]~o׿uH>xॗ^n/~7loo _SO?ޡ+V֭jk+d*+RPP︣?//@ ϛ7~ . JhC'?|6W^E\z٥_c$777q⋧z*"QxW7͒]C w IDAT+Z!si@=[->'2-K>Y,[low_ i֒ٿ,#WUX $=y*b96ܝ0@B0l,B[#ΰ/z!W{zːmO  %]M!E7*a*cKnfd8|zsWpVIn7}|9zS]ӮdlJ^2ҚHM(-@COowȆ4.!L5oӃ㰠_K'GEEE}K/t?x'u<$F?XnH=xGpUWr)Hϫ{9o:)oֿ뿢f;Ł#i~ر555H>}ĉ_}^{m}ʕ+vN۽{7K]r%;wܻw/"/ M;@9Ǐ~@RPhH+:u>s9Xm@8IҀb, 2o-MuZ\|r4i'ЮK8[NrL~"tZ(1 0>?%ېDqV AV />* dP ʨ/>Da)$Ch&H @&-M*3?@`NRp:&I{ Ae# |(aaC@bY#ZaaCsϟ=+B4ikmS(D WT~5R#TV[e>b=\EX^~Di|cLyUU: 6 q?ZE2/˾}*+7oެvT={,HjӟN_vR) H@SO=uڵO?#K]v~׿"E'u:rH ?H ѣ'p7U+HMWj-ju"&Kc6/(ؓ|H,\g\'c#HP&1V`cfHPä։Y@xc6'",9PZ#4>wY0[Ut N_B]wnŊ!)5: j2qZ0Z5Ӏ;A"u| VHk?1xxAeU>λ}I\8)Z,r:qg G<Ԝ=)$b峴RWxc/!MQcw|B=je  7 uգI zH4a q4ȸ"ip4IF& p7wWMKu@qV{-+j{VŲ v(.~I;B̷Vܵ-+)? Fǎ馛N=^x!L"9)ӟ^{w@/Df{.]߿t}5 zv Dy$ BW1//s>9IDù睋$x(+H!|3TK`K᜞p<fE`W. @ z"HMW~K;:d?|vrI޶WoU3gny}O} up8|$>w|w~pBn(hٲeַzqq$X.%^t38cƍ:uuu_°Os[pg }Couw"U@ 664 8?!]8tDtCزu+R<׿655W2l0̙3QD0w_W_W|J0ZuŸwW%w/Hm}lsi@[R`q`6@Mhb=D;y%϶ >E :3pj@ 8`2\!=4 p 0-BUPK xxӀr]Ms +n\>KYIՑ4tHYQ ׯXC 9"=i9S1477wy??ƿoH/--3f"Iz-7lH&H꫾۷2e^z9rD|;zT7ߜ:u;u(ŋW: 3ԃVD`(+P  `I@IR4$Wd&.Ov}1_rt}$@pIc D$rb)V+ з'BqQ|N7wѢ^t b 8ԘDЂEXڈ ,^ƹ(VY=娶cͯ Xv}$|aqL\ ކ)Wɸ(l,l!`8x킀-7df'`sk^Cu2a99Q!ޞ0`F't(ݰPˉ&N ϟ??iҤSNwމoߎh$馛H1bo?~_jU[p!" ~'0Ҁ.YO9O=ԡC=zt#tҥw =(Hm@ "/_k֬A E]+°32.S2 kL5!qp(bIu @ ez,݀ xsp R"`&%e   3ko. Bx07 *uڋny"@T>j%EAeN`N5vvm?]S*AjK$m !s4$5djZu)AMwM$Q#mۆD5W6O?lkk[t 9qDT v]:zVƍꫯ";4CPGZO$#l,!C G}dZK1.y:~{ [ПC G2_5/ . 
NeQB)M7缄XV' fI%dx# ,Ӕg%B6u,JCh4- 1@!oM!*OV ,!e0:gd9@tڀ}'!Ġ@W"JQA2W m`́2 [iP sOyHH0ӌ `8)-E$XW=bWX6hI;H_3Pڔ#`i۱J{q2_~:J(s_s3Iu)@/,u_\5$ Grkk CܜжF,@]t=NTD7{;PA(Di5.mZ!)D6uR PB y:xecH&+q+]_Kvm5go(zduc6|&]fewv|hח}es;-1a`06Ox{9/AnP"%ʘF$bQ@,JaAY@ 5d 0+7V| NM@bc{?$66=Loy=[3]h01u c88,@ǝQ=ܤH(y掹x\ϞeI:YA{<.`6u[lsE@GT˫pZ5nn,Բa#dtώ#ӯb])|G,RUUuW,@2]H-Pr[bO2ϣ~0V8*YZ,ǢSK .xQ#MNj6wĝw'ڑظ];c~IDZ_sv%,wI܂`e"MZ鶌9C[}'`"@\Z=ɽcx@uɦBK#& b0;DT>6 1kD67P%ccXB14&;! ZCU_(ĨfӋb Je"wwAŠkܴ~_kՋFl-:w})z6_V,(6ĭI-a2p48R+Ӳ|dIbuwC-enPSSӯ_=z.:x+bi0]+, JMU3-e(fmm t#dh$ˮ߽3 n߽* &9 5k&ok]XݨSHd2l@_Q2 ˆ KXU?h/ T% e'Šf#qKIh4OBS Q's\k t\ޗ8 3AxPGXH ,@2OT0zgڶPHGɯ45r:㹾6+|tBiWO}q/ ^3Yϻ%\K^Ϧ5[8ZNk6d]2qzp /1,@ID pSrf[J׉RHy-iݷoQ4:0G黨5s Y}{ZݖON%m†azuadXF'%!#`Y !zzgw1=vLL/al HbG$XB"p`|>xxeY`!9Cx!h#tRȢ:f@4]&(1abX3*7S4P, 2\G0QȊ6uԚNrr2=nELxiGTI3h#-ӆo L `W}~gFj!WPF|l߮m H}쾚v$Oe,io"[ld:PG 1QuQ}ɒV`I $I'z.䇖;>X8=NZ>LN7?<3@1gsi+}4eNM[GjqWd9=(2g,=|^:mgWm ۠љpۑoor4p_3Ɉ%S;cfo.fZp#KH^Pnߓ_OTZSXU0ZŒ"4of+em9 Ċ–@33 ^.<$d jY0$!`Ytr6ܝ :c {mʿ.|}O洯|OXӱ#P|IkH55Շk7~> zA-sa5#i4U,2B{06l &vb&p@X঎C'o@H(l@6T %DZ@ q`xȐBL6$ڀ!tAF B<8* fi2eD&pp@f.B ãi@I(MsD)q` NDY :wE"zzu}Eݶfn|ɲl;/׮Gi_IT*cq1rl%5[l2 9ΦWUB/)*@Á]~TTCL|VugmΎާ. [Af3tBخp>PR{r7owؑi%HJyc8QSp옖]b7 L ܎ڢ*엃Cj8C3Z:IQ5_"!iɑʛW,KFܦݔd^o-WNǡɼLNJ 8|׵e1'tI=|$aGˇ+r}XMԆ<2 vՀ^e1;Wnn{ v@9HJ|w9YRap҅K~lT@':L%]'1 ݲ[}wj,WHJɆ# U0Hnneh>X;Q})H\=X(1pF vEC)j pp1 ͏ `R&ߜ8BqɦZiYrS \Yz\sgvggP `HP%QȈԐ+e@iYdU^J@/$@{1?ddYʉ!@ <64d14 Xc@eDD^(X!CÒzMUB `C ou!%Dq8&jվ'^\Zǵ# $3YYº{a݃MӖ5o}AmD(jr.}`X&SP"N>Dvh@#D%&glƴXHc"Y9T¶38& T!b:V ODCBC58Ih#'C\pa*e1 L!s@Țc\: ud:?/@ouDʆ&XĴͪ 7qG%o!$2"`<b!A A]هH *B|{؁e #RRSќzIO0$!~oiOxbȲxWP9T  pvAŋT S-\-gzE2RWR}ugy?ʑ&뛓v]Cb,pwoTyhAnxU̢ :x`%+4S:.!P@,a{)>|;R}~Cza M`! 1 |;L1ĔKAP47p^Tew:@V2L3dhRQ0 ?.:윇1^^yjS ؑ'ni!~p.4 Wnhck+T!ӝm <_J[i  rV&L+!R ' /nMt+j .7 WXA/;D0C9`7StG#or f@ D D!D8@: @ +cL @b2S<:Y>fê)iIbmFѪ$i!o/eg>z~#;sUzUX:NX.1c7h'vz6ݖbG HؒU¹qk,P 0 @(|)sR-a~Ul &iL WWs_ZEqYC mjPuM/~V9t01xP6SM&waCk m$R"$d:~؇>޾kL@Yљab,W@yB6q7"$pVf8/>f}a@HL  {1(FF\C\`б$c ZAx"/\p|:,(8ۓT둣Zh}/w j1jifoӕ #uu' #랙mc8)ӬUA%|-x;pdž@]Jx[U4! 
H !$Y`99K x.pX PBb<JZXBY 0n_Ura[[U)LW%)mغCb߈M>=o-|UEGĮfV,@S]~)`+\ HY_Be_ VF of!S,ipbdQrK D-{<ο ]ÆFS0M# ɠJxo'BhG ZC bA? 渌["1CVS%0H5ₑ>⯸/zʑmIg64{&ݚcn6tpQ5d葤S \J6衎x6cqՑےvyrQ`_U(;P#:ArL3[<ZCX#z,20u%{$h8ԗBcjiU~OBBVeTtJ wxZA5_Ī#:U ~5*KPWicV!8aL0f-/g2#=a8u;h(VUFFKkUq/NAIurp$veie1 '+EA ~\ 2Cbzb'ĺTHPDpLB,$h{)Te4(e I9ϪUe0mo+q`Z}`zlms:O4YTo^<~_xUQt ԉ>%BVH "9@$8NuKv21={t*fޖl?iI5ݮVE_nl~G6G!BZFbL·V5>mՖ=|]0;CoKW'gI6= ¨ĠyP].2u`& {p!*@?dZQpxUG{ n8% PHxlmJ!yc'2A= Қ[.oo(]:,yo,忞4e ˷|#EF2f5OlISF [[H0v]FFFh*JҤ?0WyamkyhbskV-g[6ZKB N$Hx1"b˺bVb6(p/«p8a0|x0á  13MW*D&2#:|42f](~8Ž,(.YoO}֟M7-h]xvy{l:Z[-{_d}Wcϯu3>;ސ\${=h'veot P j`J}֩ =BHP7`{3D3azA% $;Xg9x(8: i/$q'~ApR7N|Em#_cGm-a*9&u˴pi! yQ PNȃ/r}c&>ymIy%(Fbˌ!9l$GAJEu6MEZ-a\'h q'^"d8;7OH <(2_K! `H@e½! Mu/'3L!,51c/CB؇,H!DBy+B[<+VG=5in!YWݸ<:m+B׎uύ뻥i]M/ڨ\4j]h#MWdJ#^VЗ7i6Vl[YɑTt$CZ;"x$ޡ6{B-qbB&P+̐+Hi Љd2Q 0D^0;pBF;$P w*~Y)hG$Lә C)+k2y PߴP˥_i.@M-$f6ϸd_Xx.YY_EuSnF|f.l1,f2QzS-{xMㄽm'8@>IS, )ӈ\(`u˻@ G 5?H% $f(z`L$'( $!L\%+ +K$^׾c Q^ Rfs0і?˓܍qygk&7o>$3aGf $shyçB, 鷻f W'Jx˔gТ~[s^qpX6k7^9cWܲwDY2JлM8!M9rA@$3zt%\ZI>`;aS: |cJc`&*2@!~S܌8J.TY09Ar$ءu"N o?)KkRQ{E/2$^[bM\0$Sᱯnn#`+ ,1vRl3voI>),i٬%ybȳ@Z$v-;[ë^X\O&Gd&qg g1Xz Q;?oefUd q'74KVoݽ37EVp.e  B,$CBAC-{!H0{Ap!c8 F]9H͢fpÑ2L0=p8oXrqmҿB45_*m?\  .tz[٦e@w+{5No=N zϡӶZڟ)f۫=ܔŇl\c)'s|Dxس0[̸@OLH+kG 8 BVѼ$Bi&!I q7m ƥh9ʀ _Q < _q⻚M(Ywo>'PՏ>詛&%2?}G&\vWE]t[CNᮆkL7}n ^[ '4'l۾+NɪʶEk|V?tAm9JoeA]btq^?˛;ЖIߚRy%cԧ #T6 I2EF[lm;`1t V%1xsSLx{?Pƥ37t)#FUkH#l"1ʌ[ A03$B!.dTt/w<i:kߑV;"ڳi$X|yWk?z@q>\+HEQG&"lMɒ&;cULPBT OBu Pm`lT%^D IV$8<<n9Q=0&Y8}y!twd7<9of<|X +qQ~/̻7Ny{Jo}in7|꣫\9`,->h+8jmĆCHxdh,> MY:avZ_''@N5--37CO8#E9"y F'$1R{ߪq;Z>8O5:u@g4٘8r-Ө3$Cbx(Sp8uyZjG:+h #ZԜ-6sYFzÎ /)N;uaQk0 IDAT&gy98YiO^vXT佽9)_<鶾|#DzkDD{-:ˆWqaѺ/;ҚDYj/ɗ,K˝fVjHé\?< -.^la a,@V{?a}} =$QݠAEB53p~0 , h(܇ \TF oj_Ei3>թnسC_)zrfT၏]xm f)qX"+& G]+n:ƉKϯM7CЖ$9ʍ 2% 5:R۟LJ_O7 Ǯ#K=c!]Cz@zbjNJ*uZa$ՔH+eW,CwxnW+߯ ٌW_C]VJhxb&''%ħCT a~De,ߦyc೐RkS}B 9 BcO$ ,  VhÅ j_ @ v^=G U6t+A=Sd}xkN)Vt~+xY0`uXSRf@x~oGϱ ܾ{pކd//{(cxږhY Pܴ~`y3{Z4d 
j:Y6RܷuM/ߋ2N4~u]A8"Id[xQfup-\6tAMk\7ʺ*زk%PЦt!׈7F&WxC33I-Z?pe]ζ!_<<--xIENc FWTayN/IP0o- R)B<]]&aI㝀 *Rx*JjWK8;6X: 0OZ1*x_ 5\%j@]r1AK0ZXͰbQ $WKDb5@p2{=xM!W zwZAur×U\PjS[R7ר$ v4\q_$Zwdcur<`s2sCG/v $#1Y=Rj[˻ZTМX_,td6NRG+lk{4._M-3ǒ]g2#r`eO0W_WĸsL?@(٤܈︘*_ {ud[qf`S+2mɟG:{xcOWLsy_Ʈ_繓c?̬(MöhyR7&|:9F}:xjQ}jSs`Q)|sU?mSs!᧌ k(D0`f{{q 1 >c-"x Mf8^d4]O=K`Ո)"=J*/핏]$ HeqDao\٘>x o>dmcvRJNZwWߑ\HtL8o{}Oe|W ܪypiXOIܴ~hyӳKՃT +܎1/2_߬T7̴S:\O 4nx)CW!Hew'=聏7\bCsOY8=PҨL0b±16 i%WDH'2 Uŗona|p$`8,A^~LAk9hK՛˩ "zbڵ4y9|ڻԨ/]{=skNPy XL]G[I8efAao=TKQwt)FWE:>x-}V7lAjԤ缨*9J>ϏW%m)ĈeX^R͇(٦/ϋ2mgB6gΰ߬]I;(O\h@%_>&a{F"glЯBX1i : ~ԚK?(ԝmQa5WWQxU^GB DC \x?ڠn 4; zWOt'~}_/kYcɝZl||Q-Z2ĞO0$dOStI0_oi ͯV'V׫PS$W< _ KnG#aIR tD鋆24bm-vs22L%Iy[Kߐjv;kZiWC*5 ίŝ{{({l'܍=VoLޭz@b\:~q ~hrs)<&3U#'|Ȧx/=k-djU^r!q@Az3VH@eeVL Q[#!r iu)Pc@36tR"xb.)c'S'o}AK 2E7MτwXRRhՃ/.W >: iժ mcҀzTgtSI`X]r/-2GoAqJK{rAMiHIVO'7y#w¡R%s&{dw;URG^EKFI#ouHxL yv>Z˦5ص#.Xo3GWzK~{oiMѐSqzHWɚ7*9J-8%ЮV<&$ %#Nt:/2s yaO*]­±}\Jc j*dc ; E Pۀ_n)P.U+Ad-T(9@:`\3^RpIQ{{xwJkR2j_k'f0nf46oK쪁O?`},|i*DkHY1+AwEF:3ޤ %'aGY&a?.K-wo8cV6wXmR]TzUcjI}j}szo[O˫CŃj3SqAk}IkA4nra$IxBPQ^LP^Hv RlSLek>ךiB7.Pa,o1DT_q0'pϼi@D('UT د,#hSd^H5)=m*_Xw&W۝7LQ2v0 /=7sI}ڐ>. 
>{ mo/4IUŗkt{Oj45pjI"?xnӕ+8v)FhBDžWXUU!\W^e4E?n!RdF*өIoui}\OX?komSVڛ ;uiL}Q 2OTgw3YX!#jafm!oBaTEs'R]IuKtf J$Z3{x:Z" _A)a$& sхE"}R+{)^H| DcRq+"Ac40k|h ̞0?[={%Gg]uG]p>2 D|SϼfW]{;AvBpI@NvO[s4.BrJTcmroUuMӗ?6ߐ&± Q~2a'+ ^(sa'|j*/m_y;AA5tidzmj~iᣎ']rFl0㽃cb$7zgY~)Cgh3 Ԛ޶m q'Mɾ|Gpޟ>%T L&d e`@CTjAQfKHqۑ# lub+AT"Ex8 !:ئ@%,xP U.:ez?D^Ypֵ.y$z):W{}c3i X+zZOX*5cRoI1{pnm2%|02WֶEه'[o~:>y5qugoTnK.fډe6bNw1ph6.بVu |~MWڦ3Qt8,[3inS_y׸M]GPG }1p]s2FB hbVה']ͳ 3??|`YVkH~q$ޞpJyKg[kjSI"ͿW>{H}h&ִd;ルhq;pwU|7%n\f>H YG0>]|[Qxϸ@eBM<3nls!?q#cS{,`whO-T{P{#ࢢugWn-lnwɲ&+ %75Z5kYw䭟nP6AՂGcxtvԦ+S05 ^P2Ҩ::ښwd13D|#ǖr@?]H3\\owr][/TA*8pg\h`"H\ vs0:D.UB 1q.aP]ޚQd!%g" BvŲ5uT*݊yj[~aWE ^Ӈ,GXhZ0!9rwEB\/ zOCѡŇ䘪$d~Cdxx",e ioL9՚C[[f}PqݑLΫjy{ʌ ޠ74Oj]8v{É-c)L^J-[ Ȱ$gך]5;ntue<]:2]ZI/W/:&ڢS maCunf'Hųڎ!v-p|LdwTӛ*[Ut˯;i˧֤%#|38?'*&Բn}f[7K9d-x];[l*t齭X#5ꓙb;yp0&E$X|,Lb0>s=;Z>\8aa݃"F,3 z\o8 0_mI-oE+2Jeb},vB]S=%YV;_kqk*qtX!q!p|7@Mfe  1}7cgvEilf^5 Ø\8j]{V5'y'BA2Q6v"Z{2)>QPEUX-ctóE7$z>@w.ޯZTz,N*(`#IX43$ j&`0)da}{o}! zr%" 3**"()r:7LtWYWfdVUVVu._wUFdddVVDdD-W8S/LdCffϦ yMĞS Aw=&BJ@b4!w݉P'@a/4JDAb %Bl1{Bce#SΖ=|W?h;埝f.S//}q.P'g6kcgY2c(15Sė+j8oa~lzV5aO>bU3 )}lu^tX.u\ˎgא!^M!HݨQ8nuL-_2\29P9e+^y;neK:c|-c,@W K#+wewz&*5'*>-6eeT:}m]S~sVW7H~uux՞K8VhUX.(Rҏ.ɖ^RjD) gRr7/JsrJ^'\v*-dKpع;l&*1$~{ %!bmlrgA$X^ck?a Hl[d5E=IZ?YS=QL:X?%].]r4-|g&YM xz$ϯ1G=VS) b׏3Qɕd(H'㳧2Kͧ)_u4GBehitKE([z-++W30sT$!iFnhE aY$-*h y.sQc-?3c*%1qH37T1X,RC`@=)bp2^4I$A$s"Zmw +[b#m S&]ƎٕAHF1tzr拷x5Vɹg*.﹕O-r9۪vo8Yn[ +DJ$We+n-*rjI[j6OZl>kW=Z,(GTXt\- IDATۅv/[wqgQ=~ީsê(iO$Z;,l(9NQmD8X ʝD-Jh& ;);=#­?(9ݻ0@$$D@(t[D@H!?z O9@0 THq'i`d3@㝯u0V0}N/-m4wV o%LNI*eO@3hIъHzZå|[._rNzӯK"ӆ!_9Q,r_V`&|ђ'f|[rlQbq%%f삀ϟ|v] 4U&,|)m~a V.6\uߒsm%d( Jq2"?[4rѩVn#}ZSwjCJ^aiD 7̄FƁ &oҾ1N9W\ؗ+*YG$`$3"#ytuk: Yd( ϙ"3k[uCL Hu+xR_hEcO$dwM |5ZA l8g' >Ar"HLגͺ+M5*?SN?:tf׹}i&]}RP5)r~nɲ3p^|/nOۮ빥`EJDܝVeySGOVHwSZvv7;| &GMTX܈k aAIdŮg^?rzfhwj޼_D4fA=24Dh0|Kl<Bi*D^Ţ2ѝY4 U\ dET lhΔ YE%TűYnfdx(O`;7xgM}.aALmmAmXOrnK?A~dx_q!SxkB$#8X2 S:/Mf&ޝ'!Zr_öS V"|M7xM N- [aQ{j^\$覄Kc*6$x%DRuه);57 JɹW@ <]ȉpԕEs`eqX2[3! 
D#fݲGJ?;R 1biK|C=آo cځ" QX54ǾodХ*+j+ ">r lgI?h>x]|dY.aqD+R27mޱ,)ɓ?P_Tb'GJNࢠKTHax5+rUx8=u֎fWwDh bW92]B@^AkXqJ+~ tyh,z!LrZ`60&&| 7Dq 9 툨OAv)E\*mJp(փ{-ݢ;/eV4}fCHdI h&3"H0?+Wo̴Ah."])]h~+:k%cEHZZce_dU-+i|&ߞބ[%jŸYb#O+䞑ءcO=P-ϗ9m4ɡ2ׄoё+j?lA=/^ˁXߘIyk3~8Yp21f2$_̯{蕕fn4QO9GgGp'N(Q%hVeHitk2WZ|Q~4+Sz^I?r?޾d_^(,*Kaʘ]~JcD `@waBxGס cehiE?[ JRX`22 Bq)YP 6~CL|Aـ$C z|a[#WQ?!ieVgDKKcg..o Y*Q/De8~BYiM޳[ Ƨs|KQ$%(WU\J*k_xYfC&CApZYtᔏ>2{iqȂmwH츀3WNI-Z)kL;P]}軁ީ~ZA1/Xt\$S 冣i%iEiD>0% [.Nk }炜!byKyҎ"Y ,6oWꋰxy +&Dn|=vo*@Xchul@ "! >K"L X03 p!IIkM,CI} ?o5fόocR.R/v$ӎ6Bl˘"-Fo+^d aP Ul9P`k᤯)2͕D(yQ֫Se+G<#zǹŬi|4&("Ȑ'E:;]|W>\= %~= 8^HM$8^0n$cM C5{׻Of*$.,N8muDRc.V"MfɖxlƁf~dt3>8hAQbliqn8Q3[wҥlJ2Kwrr}\)VJ$-PGRN!})Cz%Sztk|4^7_ +8c\"@CYRg*C%6E  mxm *V fxcY`=8U}[0t<cV#uұ|_zyЮS>V|2Ӏz[@b| "7y+e__⺼Ԅˈom5_vZ8ǒ?ἔߑ1,Ku,cy|HeW-^- 0f@pđx.xCL1TMsů >9=CpəǼ4=HqKNE .UKL ˞ [^5TR.eT9UtIK϶y}ĿOvjGdNu|iJ$?j9E8gS>Z392/g*r52Gk2*0zZE Am[BbP~) /H_HאrBYd1p{. Fu ba)A1<@i }J6Xn< ?)w̘p؋`<$pƛ"1 %p:| X@^X .!-C<ΐGx-@c)i2 ٽ)Zu{l29/i,5xo$Ornǯ_VZH(Ozr9ȊdY^HuARŋ]9 0Gl,Q1:rmvě;"])ˌtW7#a KR93Pa]sZP;OƦq* 8Gph6ad-݊! X̭$H8Ĵ iD Jo`Sp4'S<VsAL)vsFR);;|HG+uuv4/m5M^mGK?=_HRFiTdD_":(R,*rVn]F.:tEn0Au@ޫ5JqG-89㋴[oH=Zt*܊ yᢲHI{BҳS/,M9W@ޢMY/|6tg^;&.=Ft$=Ks8=DHMc_[]5Ġ KyW"/(?S>?/dHG:oWi@CA 8@G6%y6 n`!XfN@564ESˁ HfZۿA<%"!)d@ @)Lzݜm[tEVys.(]pڼ~z''5lBWV0P}OJKuK\o+R}QM4` Y^l]BY(BC' ?@N f%riwgw@J{Xa\.Qm4}=\}}?>?ڑs{Đ'GzҳO>M̨ŧ~<*tyS߫n8{ _VSSu5 J[w#Io;RJ>f=!EVUѼH4"zD^(/ˑ^PҏM J8k!ve1YRf @IJ A"1Wzk<'infpL'd bV !a @ZLp_  Y(mE+J ",gS  Bh|ѐdw!p?$h{`nBR.)+V8sګ6YHRsEǎmZ̆!^>SʃrQ$/Ԗ}yx-fsO r*xˑ]v c_R.rői\u-vpKvsLK plRe"~̿ş ;}GbsP:Ea|^3En0hGhD [xr eeo%)P-D3єyy % $#K3y祥j2=8ew@ iƥw _ }rEnz|[t,' =nZpʿZWKn>6Ypj: hu"$GqtOTv+a9tbޜWs)MySH1,> 7"s0fVg . 9ySgʾ[W 1dY .) *-Z뫂><;p.<:@SߏM7>IQ} '(FN>L{OVy'oc$%׎ xϾUpӊW’+\i'Z2-^s "1iL ANR e`vE髼hz4NAuHx@lIf 4"ك(zF~ڠFA" YL0 `|r+߉?oL@͐ۢ`9+hCRwM_f#z "rۮPO^8P]r>+YqnÕS#b[73mű)+ 'އ`CjʈS>UtSrs鋝npcּ5Wf,K}4eWZn}—;/siCJ.wtVR-*rp,ێ ˼CTD9`"kWɲ$Lע6eaWv\shYx9Jl-֧{ IL6Qފ_0,9_EvWLc1"G!-B4! 
$dӐn)7VH UZ6Nj Hp &A쐐25"c>!cAhD]pQu U%'˖Ul?p"ȉsSJVΫJm[jWh2J[E1}reӛ -yLZ9I9X> ceYceA Lg} bjW vv7TǩçdmKYfq;- qՒıjtV&c^:\>-ȚA? 8` 4T!V @K ]~Hp49%IrQ<"Lw|t\ 3iE2XC%Wvh`J-rpO6Gxdŕb4\j.ﵽP XyDڍ=7%rj)?1,@ڃ,a15?|>;+Z|_3x!XJ ;>H8xu UU該9 YҋY$3"VIv$=o9TB?gʣۋXD ߆HO w GZA) Y$bZYZ˾HC`(&H~ # ÏD.xׅQn5|`||NU*e(Yۮ{P 2u<_)~᪥8'g&4vڭﻵpBJɻ*6E"Uɉq騽 IDAT7/&K,9~ڪE 9S2}m؀J1[ K3IPC4 ]ϐfS+@  KG@L#=Dbh  ~,sS~A#|0c7s|Bl1*E@&EPc"gbpv[@:X{`*s[]ⲯۭVu|1T@{ 'z֢uUi]cHbe9hK#%RvykRwnMDʈ"5kR >U!8Bsb/"ru[-G$U^"j|- v౬#x5vӲ]km| n"B>p:CvDb%PD9x(Z*)'Gd-d5(Au4$agکΔdK_-U2F%JBg 'juhgIGJ 8q$ aϦ#SYbA(<@[# ē#7]s(/)- ";# A"u-'Ǘ='Mz{` 4npK6ZH0O,t2!C@|i  H>L GH€n_uo~_[+(™kz+o_wWS~ݿݪkl+rʝ%U5>\_^%~ Ŵm' ޛXGFd+oXq_U,*JQ->DDۗ2 rd=99i؀ cqL @'~;DF<>C6_"w4aV'@Byz- 1X3Ӱ5B<%b)p_p$$a^Ӯu~ OcQSָhÿu xuCIPQ1i~}iyd[d-Mxܯr!I˜'4,+f~5PU.i]7_2/|C^L\q|yjwEVfW/}~q| VۭPr`{SVvBfݛβ( 83pؒYn+WG:gM\HQ gRU K۽*G&% *Khƞ(|sf,UjU3fkC \}f,@P2 dx,ٖ;/"kܷjƖ G FKKK8c"P IJJJ"gq"jbyUY 0Eq݀2 V8\*?!YZ-c~ Gp_ >p1Ph؆+AZ4jZgec ؟x?rCw>N-{۴QS~ߊc?ECŎ6xn I" ,c AV:2HPE,߼62rDW[C Pt-X P (ݺx,9;|T$)r<8tͼ1㸚9z> W ޘ(NqFvrpP_Kuo>} 0q|98-u|صoxAܮvE3,k Bj)((@.#raaaVVVzz:.**BW ~dl4}͛/ZJee;ӪUG}t̙C'jwĈ'ND#sv1 [}]۶mWZr5k 6mb=zvO~~3~S,.+gJP$'-9/))?dao~9Z(C‹ cxZ:I%C>!,p>BK`jE5uNwýϔsVt~6RtӴQ၏vܙ -$ hfWjlfj&k7ɘ- <}Ѹ̀ziҭvcIf ;di'<33ޛ"y|UNan jphrlŮ]\h9>u#g9ci]3?+e7u***V\ٳg|e˖=H_zu>}M>_~]qq1sّ#Gv i?PCFm6MX߷o7??7Ȗ"N$I-ˑi xƖ.=//e$`~|< y~H dW߭P>d7xbj&ʻkoCz"ӣق>SRvm:kwv X1<ݭդi]=M)צP @H t@z-B58~m?;hQ_}s: =}z*vjaOܴJswr{jԝ-Vƾ3e_2#˙W+v*zt1 qx$ho-ꦧ@MGW!yٲeHEx=`uE:;n8$7y衇 ֨QoY !ٳgϟo駟P~ݻ^z;vAW"Ȭ_F잖q͛k RB~_>|On_*C {\Gz x뭷2gb,VWR#8_=Y=W:;db%"ߐ^̐V/dc HIZ!>7KiMX̎#|u5[jnq^@GCO>7V>k͝|B‹s|@xԉز ŕY,3}]νK$Jo?`B]$7McfgxUzy?P)|+{"7{+ǧEg^+<:});=>dZV |z RzΘ{ܸ HY@Ö_%=>vmUWgS; ƊȒ$$).:)‚7brZH]T~ӯ%tXNG tJ?=JbN|_]q[OPJeBx@гWU< y$ɸ|P)--:toҥHF'.X)}ٯ~o;nIIɧ~nEtiM7/jW¾}"D Jf% 5@cp"%wOӟԹӚsb)b١ԋ|l>QAv TI{OwNzêY]jΎ^~UZӤMQr|ϷEǷ*)K{svX y4G1uش cKsRJ,MyjYNl/GKhZ|?bʵP9'_+n Q.O_<"}ML5>=", 'L۷ ?ݧNĠv[o}f49TױC$#]d#?^6qs+^ZG*ݸ]^z)J +A 
?7Sv{@.CFUhJv&8Ժ$i-Pz>:MxXQEi8'voҦ_ެoۏJ۴{?Y2yhqt[@ȬtiAiO(#Y!-JP9~[^@s^d-u,uSGOHe>f5&5*Ӯq48A&q$38vƯc&:Q!?|hC%½؇= KC׎>ˆDV(s Hd堵yJ9X:ơ7$]w7nKLLDjرw̜cǎ!QAia; HFR'5qȑ#ҥH"G]Lm6$v#cH4G8lT;]w_}UW_o6vlvQVZֳgϛniŒ$UVV9i FJOOOMM}7_z%|ᇈ;cƍ8DaΜ7xٳ9sa&A_:@._$3Kd2ۜ ]%=¤'Nc H*5< {泝}G X|ڞ:9r٢{f ~|mZOO}5^1hG L@ 8Q0tMQN+̊+]+ݔ,={tēS+Jz KaRjyWPLnbW7ޥ"%H +:6YHh8GH/2&R3t)_i;:B~VM$4~ 8kY\9jN Ei/h$ZQ$e^(X8$~q=[v?8ؽ[-_dRo;]9Hr~C*:rHb7m`I9C%CD2vՃZ)]bwA 39g9ƹ (kժo#aIHV*55\@".@HG2z:upъ~!@$kun!)|ʕHΞT?Y_zYbzؗ! [Xݛj3L͌LO{LJ-w>4{1[I.t1E,jl'ϕ Uv#}1 FT&VQw="VӞMV95T9v÷[D4l3#)k`Q;q-{"w@09AJOnjP}߸%*Uٞva4J~LD 1b[yYU-Hyfci7^k?E'bA)#WC5B~Cuf"1jlr+HnFs=+V8v5KSRR&MhɂwTGܗ F 7oHݻwfn[d>g\ 3Bzoyƌԯ 6$&&"eDHpڵkn;ypu?̙3b{p뮻n'sTmatwFڏP!!W";.B!#4%Ѝ @Feck5I/|oއi%1Tyhfہj5Vxo}qJ4q:V"qJ9ɝ_EJpz,C=xtTc\MN_†нȚPYt—Zj"$ط6iO[3%'NӬ{6orώW;M(.@u`CrҁiDE6J[fFW^ pzP tbC}F }4|_׏>HX7Y z=z5k=d/ =7ڵk!)_+WnC ˳^ԩV:TGUrss̛7T7 ލM@;7)HѨCge"Nh

    V !S+D{ \.wU-zj9k't^v4 5 X?zbPp#}hh1I¾sk <m^.#DZ*xΖ6tFe4 @tw.@d߱5kӻV1ʁbVhs.m*EO ޵ttK|>:f i "+ |iNfinJݹ`` <_W1  Fm&d~=+D=]9375`- 1d;=[@3R}7{Zץo;hPX` ;uߎdq|.TVVN0As/))A_o:ufGoΝtʀn%K (t/Pƍw[XpĉQC-[<<Ԝ9saU_,qZ>VvnY4Z? -!)z2 @RK/DĂROxN#<1԰ݨ#9-1]7i*.Fmb@' #15z&>3AA ؠ{ڿzTpğ^1lZ TܵOW_nim6Xt1;O4ŘӕS9rٺ5xb`CDFa-F1mnrt<;EN=A1?@ftmM:kvXIL%1k6nӷzFOhS=qav(ہH&}ðUl7+X^|-#) {DO? wq7l؀j[o/Psǎ{CHU|e˖!EA sujLCH?Y8)@!qwY" Y@#ha0r$->CDA )$G}stIO}!)D?8hA؝1d?CoEE[11%s؃I!CI!8#2xgA zJ$!Ig;8H~$*74zwh$ɟt0k7}sx ieH:^onٕt#MNr,{)ىZS}wSeS%@96NH  ;\-v$4ʹ/XvaBT+$g#qn4iRV-$p tׯ߬Yڵk#iH߻woeeepB?m 2[nՈ ,27~_׭[wԨQ-ZQC| E:~^zO>_yE ;GݴiX@7nn馛-Z7brJJJun& : &ZVи "z,sqn 38p|B1D|'}A+ a_6! B@(Bxyj]^Kq|y.RbYbj+Xa}A`W|Ƒ{]J{-;Ç$7-3nۊW75H$quh߾m۶H}ײquuIHF̙3p[nHR?{ljj#¥i&H!jHLOLLԮ#|CE½VInPC ZzuBBQO4)!# !;w|@D"p$#Fk?w\ anݺ\RzB¦Mo~ƌΝCׯG]޲eF؆ :u /[GEi$`-t4\$C?0؇߇ebb'> CC>.@FJB9)6ҏoԖ8hAL{zCP6q#2=!!s)$%?vn$ܓlRj P LCN2FbAZwyo>)tUb z*.IKTٸرw' ^pVTl4SSUZ )rd8+e;W|)ZK$+l͑* 977w޼ywy_d$FWTTUc$HGX$ٗkZAP *oT57X-Nq**U@W]T' Z*{ t -c_383|2k&! !B~|n XC SĈ^%j;ЏILӕPXqOXD8 _2[{NJbAFDôEx SBO h&x(U/.f isպߞKPqA-?E); $rC[ +Wp+\gy:4?W?8{& v0R"L1T,*e$+gddZٳ8=|o}0xE쎝:o;y;) 8yC!W`AȚ@n6DФ<3!īl[! AlTXh:`7#'F͚|.L a>JH1yZA'KXQ?@,JyuPrveאyn7F|If8ޚAsu61-'d8*vWH=S pĕ~fMγW ]◔Fi:6+(ǩbiʁGydĉھ-1@ N׍`ٚr'R?6SvbgE%B7$MH4A")ēANgKh;'7dp7bBv*_5<@=,C;$V ![!7>K H: `Pv4[Y7>Vy\bְ{ߜ8%Z| {14 s1avb\9_:s)(紵K#|>wE'Pk$Ҟx{NQh^kDfe,⍁޸9 xF|p 5bL#q k1\Kʂ΀h:kF Ri hՉ- mH ct={fRD"y{gdf  s\Mݑ U[ӀG0bT썥x[W9zu5Nprlc},솸^nIr%LہV~bEp Wj33$l pBxϋyH$PXXxd(.tP 2 B30PLc⒉8 B~ v@`gA.$a9`Lxrm `*⬋BR*qգKXkZ1k;O%段M|K6 }+3%iݢ-c   1:u}ALe7lpi޲!ʞO= XpZVrZ,@W~rF(nKanUFj cXkHͷDj+XrL%"w-sݎE_ 6Hdv*+5dS\rp-[҅ߨeGGe:Cp"Y ٗIC6`:>A1V_4lMZa}$D ! 
dhYF,ڈ?OS'27<B6 B3?!ChnB  Vt/6Dt loK3]]|Xv>˻|uJnEkWᏻ#a-v[dzd% hΓsчVƎAdpQׅHCz7\-&Uz*`!&n4HAGʞMc[>c:0Bh7 Ew4pC|@ Ө~J:g֌o՝@Dӎ"{e˩I rGj-?In{xܽ}Tv]!2e|ȭ>T\6Vkż ,sz$F["E3|Sb#F.8~ 28t,MSLJ& xc5; g`,X bq["u c߆!f H'dXƟ9Mimsە.9!h NkNvzcM;1>oqh0_#g,.Nfv-YafG[N!>y_ǂC- g_KlYLRUҞ!!_ d @xAho| B,$h$6DIqS,y ~rYH;nh 6UBCa|~B+lux ٥<7ty.̏yZ5B۲|Net/;LA.;ÂCq?3eOoMF`xQdpyhQEm튥%".z pT ᪹O {5 |ArnuTV1>Ev8] Fռ}\5oˍ` R"uJpf$Z4 jf(oҺcQpO;"'m@ȃr4l+ !&4K2,.LD`=$+NĴ+elA=a 4oX`q/` IDAT'P<؉͟PhPY|Bz~R-7 7T2)DN8 sMq~lO6t͛\\ _p:"MV9Xꋠ{[G-X8;y)^c <r=wu6zlxY3Ou@y&1L eqr";P ">pĄ8 h'JT!eT( 7cS`kc찾N !WdZoYWkx]W[q|OyGTu|{&dbLh1m4x\+5)mWsE"G <+. @I3g*Z5ENA2U6wΑBJ=̡pVX:?U>-+5RnZzWҙZ Ki&lig !953L.Q0b?-US}`h15혘#4C:P166["961ݕ(n%@8n!M7kE8JU’ĵRÅ"I8/n%.U Hv];B~K-.c!Y:.H ?:^Xܢ]n)ncqq_Wb)X7cxi .`yom@|,F!Zc@  3 lAB#M@킮1 `&{h h)0JaĂi$ fzO{q=Dbp^Qr^ a⎗: dG]AYVp+!;,'4c"Q&NVUGeWۥ-+N\0ǂRȟq]gvT__#jAE:"B! $~ {?_a0xCĬ "\I Vt! ]>՚ڠ v'-17{?~DD ؃EEҺ30 [y}m/^݋cezNr# < X(خ9LE:]sW9i\nꋞ1xx4Tak~cW{h(TzM>B m*OBEp/}t֓egc# ܚ0C}w fXLXM%-xHYH9{xjf/An6^ey6}Yڵ$&weU/㓀8`Aؙv٧n,O<XZ W3gn[Eˎp9{įHcg-%xc/UXH/vx .7 -`߲8 Kl=bsB %]猾%*Γ52HF̫4l1\Y1ԢĖO` OC)P AΪ xsntGP18XaZ`_17vEsf] ESPPP* G ;XDA^|PՄ"( "c%s! 1 ;s::U}gS|sN>ؐS5!?Kl"+<`STg MĨ3ǢF7*&H:ν2mH/0<喜J}?&x&@CNN^*+O%_H]H𲲤pq p`wR/f Ѫ6Z IN-:ľcaI\4[?ݹMבtYIhw mlYM$_HHz̽{?#)>me9SgG#_)#Z/߲b><܊)>M jE(_Jv+ɕàR=YEITBRbaXm12%v(X y(ҸWo: "xGM,zroExPg5{wXGRH ,9m, tYc4CRl jڷuzl*!k H H3&',m0,i d0jכB9V,8*_Hdɷ5p`^35E iW f:@^mW`DfΏQg[hKU0+TCǜn&%{|afmW7-.AWg+7&)H~+ݴq}b;e~͆vAbVs*GJmbcRwKuӃHʲmaͣrqwȶ!_f8TWWO%WKJ h% |VSC.J{~׀.օ@.dXuxl6LT Uc*-c Mn6$ÚBU=W.:oHA@wN2k"٩BCnoԱnW>dU#4 xyiiTq$]6Ռ#1 ,ЅMVD N̠F՗Uꈇ4g+- 6^v 籿pI4;5bWw?/uzw䬙.m%ـSxp%V #@j͑c&rc6gSF`aC^ 1 beY\@Gp$ӦTĩyW< "Bm7^qۂ[qXo$fzU6֝FL\)؁}bui^3(Ls[Ñ/$ZFkFU< Vm ~27 TJќg*w>MYz~knhpL8YA<\ɁCN687  bO7t}Dɹtt(ʒBe,^cQb -@w8G\Qhu H.lk=}N+5TjpEy4=C'֍*? 
w5bd?)H#ŝ&u!iz$n-#)nKt6tY9rgM8 맵QE+(nFH!ΌΔt㍎r,53k5{*%9" O?;ʰ5`Vpv2W`=́bL_O6z7H}#74Vk=1lq'Kk7Feun[=d+I$xvJ`ۉ!r6xܣQXQll]/XȰ*o)hBKۋo @@;_P0HPdPq"w=VC^JZl{A%g2JzK`|'yuת{~n'_sū)cyx٬b&*țXOVsFD7F+Ls$+ nm x6~ww縷b[7IY7mN=!I|+L& ŬE]e&:CIrnv;FD,ۄ ^ٗ?\4dX?_E& 1`c0!!E+J?V4Y0C(I}{&[JM^r@$=@$Ln0 2pnX&I cD{*Iu R .7?88&!PU}n% AqI:d8%O-80lB=n݄yCYT'gS3Gq6jSA@KbVX|S6ӼhFE@CzdאvHb:+v,r,7>(MhD-eTS ݱҜIi/$L:$Oaj )SA O{sל58e^Y-vӘfܰ*NHXoHCĄ k~x^i4Y'sN:>rUnSF?)аr|C2hUfBGҰ%b+$i'}\qdG5QGI+{I/:)W??Ai M!("x!А{9XD`=ćHrp6Pp-9CAJ0YF\Dƣo\0JJn7`aNQJ jLQ398OfXqHm$GM '@wʸ_-pIm)RwVfbc_v<`o:ޮkm4 LydaHrFγv6] ȂǕmRR !Dʬ7ҵCR`3-;;2(<GM"wH +w+B۰BBX`T0NS2J,AQ.xC[UyHyC`3[6.%ǯ|ݬ*ʨ xcr^$+\cYldTh.mԗ5x8[q|be=uwydxZVo[rÉqb|d {qYoo(IEk#ljn8<gdk޷MҍfMec[:$ dEbo2wl1e5{܁-@Yr\!ɅqۡQX`ǮKmF0 3nY{d[[6 }Zn R4԰842%F%1EA+H#eq^3,\I `O`7`gg8*Xs*XD!<+[Ѫ*+.I`\?NZD 4Cbn .C4fN.=_lwRb-+S}A֭Uw)+>[/P*c!?eN%n@ SG?=Jʬ&5iz҂~+k]+ 1,{W/g,2ݽg޸[z䱣R$t^^3RHJ=E;Cq ; bw(G#sH{nA {UHh{H G6<IID'mɤc! נn\)kkjJT'oIM=spjrܞ8Þ ɥ[ ؔ)Y4cG3*q]#Y[PNwOp+q*ͨ8h%fNds*I `4, i[MH`pf![\ZR8eܱaE'hPMis#)9@ j!kLd, N:$$b %B7T1<ʜʷYbyK/?Zc8AmoA$Md~㑘5\Z|~}P@ME foHuz측C''M ;qI^#i&a3Y$lә,\]3#q3)0&ثRpT6sL "rr5ӬcgptB) r<5N c0+%׀ܟc-m\O#Qc>(Ki-8XeKS0c885x<};?}]$٠oRw=$VDHOHyHL[rr-)Kbu F rokXR"nHSh )>Uؼ& s 4cg-R\%UBLhꪮJ34sxuD'*uW HOdGʖAU1KUha}[ -3u˰5` 4 ]k{mMkNҴp"la=6lOoHm36AtKMKd=p0=W ,iSNxF3A=M8rE3P62MrL>~Oi^w`X *W~ͬ>TWؿKvNEmH'wH% (F)i/MCX`5 C}VcgFw}>0E<4!}zAY_Vk6Qoo W9$V=[BWO|z.pc^D2Z650/ر@lJJy@aZ 0ɬbUf<ׅeQS- ]lٲ^W|* C5hJAPbQlT8e1)zp$+&z%m.xz IEsbNhPgnIZ[#H*GAhhe 2OϠckpL̀_6tʬ&f3$E|,{9kp!Hxڿ_MHi:$=HY!H6Q]|:ͬUk\lj:$^X+=;s{Nf1;b_*y{s^f*2WUKm4'9y;N_ȖPjK,y7M /pEp Y ɰ 9zײ4?fcHDz94s\0-_?w}\/A}kX~ ^ZAA rcYozXSlAhϛW0k8"GmGR@Mf2 { [$޸aIٔoD39\ q6u[$))%8ȕFh:Ο isvҥ^{vKAY2~sµ(X' G,Tz>gf@RFX&i̇d#Wbg$YU;i2HW"qs6ߴ Niڕm^LU$4rCU$6rXYαo#zNӒ5j, ÆILfx !`GіlQ40%'3d:%rӂq[(-9y'>gV_}M6䓟?u]w 6x`&_bʔ)sέz{{?Q{[zg7xdUG$YIrT}q=ÞQ#$ZC'jqQNb @eTii^¨+ng.Q4sF=kQ̅}-͙pYw[඼M ?;$}$;h5$}䶌d ?MHf:$́KgHn 
ifLn[d$}>ɠn-`HD9$33.jf\l.ɢ1M2"a\YxP66hZ7zI.C>$epͽb*'X*Vm.\jM0ᮻzg}+rwzꩧ+rAm8vC`U@p?JVz׻uwx;luYoLJP%jnjsM7$^D^Ɂ2G*;#aFLXRAxdey6ǕXUf$#@DkFrCygD$E?+IX@IZ$Q`4FkΒԛ+ .0KDW(ADTMlK| ,*f?!.7pvbecv͛lٲ-@ $P /̜9Ƿv?5fy晗_~o'?? /YO>d&h?guYk1z oCb#7 IF}de7g?מ{f [d/K}[֝w޹xLӥKs=?d>7|ӯ 8xEA_*n>߽s={DYqM7qmo{g>3f7w_0Q/$Ώ}c >`m80aP*qYb,<|R)#e2RFH)á+rĔ=`p PqoI'lLa.$O_}k^漹snƹs_6~<^{-H~ԩO=u9s A0$!=VYo香3grwf͚կ~uuֹ⋳tXC(?s\,Y+_ H뮻nM6H?$-7ov?я~:.Rϟ '\r%=~;߁w~Sg̘cȣ ѐ|ӟq7@u$swOSH@/}K/W[m5>(u5׀RA'Ogu_WHK88{e@; N;Y?i%|ɣ> v,yʗSNAQB;J[ά`q+uX0U٧̲Ь+p%O6g+ =4rreCNgJzy6Cճ֩*:^{!v؍; |ix؍y䑬7nĬBtꫯB\;f̘/~$Ku]$lu]/6l=$Bt A3ݯ`D @{g /?j'pCJ`R~o?yx [Z8V`w_PA* 7!;d{ T}*K+@/?lfϞַ5]w\.`ѣGwqK.hOg8oCY|9j 6|{WWS @ġ!<QB.Ջ+Y8&vqQcBkH@VȰqQÐR*Q^߸^ IC2l3>, X̂r?}w5ր<$CKb@`>07߼{L9#b-6ldH %q-K̚53Θ2eC0gηv[0? ~_@^z饐AhV3 '<xP<3  Ɛ8O8mNۮ*sk%0ɸ]wu@}!KnFo1e< 4jTƑ+)gB(BOYfz\\3Y\ǟ숒E`w`2c\YI5o"4 =}ddBa<*獨1.H'Oix,y!IrSAMNP_^ QQฐFcl ]iNAyB~䟿ȹpI1!`~ArKCoz I&Aozh 7콛z術9d'~xӍ+=@`٣f x׾ j+H7tSs*{($/>S XĉNY•W^IŸq L{WC,o~rOɣ:hw 'ԩS3qK.܁g}6&&5nXlY@nj?*(܎B22 Z%^iYjH|_b-QHȳ:&> +3iܙ).$^ݓ$[*wx~Ї5"*? I|.h5s=͛g˂@Uٳg DSL1 @V c=6a„UVY /l!opCy'!Vg}3^ }վ;}wV8klя~w},X9$v0#HzI@;w޼w=K^|E0:!8cNhW]uѣ9 8Dn^qg aI_o}HS/~-ⲯ.l!$Yi'~+J+'> FG>;g) 4swZ lĉoYZk]veƌ_o 3fw|23@n'Y B|Slc+HI(?޹.>5c.K IAChZuT(. 
sK0feB\NlSSyX>H}e CoqM6bvx?=CȻ>dOB{@w7 ;{ K/{QGu” {"s=w5z T0@м냔:OBrYge[>CP/L&ooBy~AI' r3~8f;?Zsρ}Ccv|;>ςW_}uʉ'-裏>ۏ|g?D=vXH2qK.<3Y mu׽KW\ HNM_|q 6&Ȟ83;@ ,|I'ٯM󟬄y/-zwX8Jx2 !Ɋ)\]o}pA, 'KGȵ hxA xYL+ MqN'u2'Ed% }Pcyه3RԜПpU' f $%ֽ֢:D+9ydǎ0%gOPbVV)zGnC={6L0aԩY o[!Ƨq~#Ixh|뭷1xvdN88#[k?~|c>`;Y&M:c a~ ~CZrB z"x_fÑG9eH Gs0LnK@JA-)'Ow}_x̘1@tM`?W_i>Ov@ A~_K/4}c20Hپ_yHf+0p {45k$){  )Ќt$a8s;̈6_(룊lJ#I$J.IObKiW⩶D]UIJŎ}"W'% >~ S YɸP&!X)/pE=/\0cB xH,X/\hы&2/^q68C 4V:+½,d1nDSj%`,(CX ?S dϾ /l, Z rh 7|k_~w2Fm~e !dKoCf5>@qm ϞT6]Rch8SLlXwUan OB OC[dHI+ݲ+s&JE6 .r7?#r ^{+"UJR>T8rK֯j%Cm?F|ruG]%7eWKIZ:l+K8%![S*Oq]öudW%:|F ^Gn$-e8;Wv(V.@N5(߸B[[=˿&U?.s^JÿácHڤ F52\0<ķ%_T:~E[Ϊ[F/_QpƖsó2'iJܷ^T69(rIVYko%1 gRlrN 'HqLPΤcaΤ^J8N/tY91ѫ`V)Y` YqLшdAHD*N 0g\JJUrc% +B2LQvp4%őHCqH]H"H[c` ǍD8q,U[J2LS+T.{47b 6G=}cycVƛy,TlHEG"K6w< IDATHu.X ˙U}HEH]걾QrqcbȘ=p/prZR. %ْ+΀<ɓ3ӊ%a`A;pB'Uٱ0F{& Yc\3Na O6-D,19  (nXMPWTrIUU_73 '%yx(}'|I[U&{V>Aww6X1Ovn1ٞă REAq\&V5 Zd' ظ1`(*lFXM=1 s҃iX'w ֵI9$OSbu SjHkY r H\\vRw?YI+U: rI'D>P5P.?Y5_" ń6R5R";ٳGT Ǟ=H 2g1Bڑvc1p͂IZJO̓l#OrNY Vȱd&PNAҪ9 +Qy-I` XnEP(cdPqZxF.HA32S-ʶ֮%apR~ (THf+SX"rgJc^\`Klc4!גT_.r3/`'r]mH*NZRVsK5ԑ;WӏHV'D\݂dHzGbAZeeHZ+fAb包cc5qϑ;rck)ex9$B-2 LMJz8ɇɲV& 9h2 "};%~1xbA6g?`Z̿$Bf,Aѱ/kǖ`g169nԅ;.i X}d1MȖJQm& a819L4I09Y Ҡ[7+]Jck+`Ae[ 9iFMkWMo( RiHyD6\v2<ڠ[|࣬ Ri[x0450dzՄ\)L9 9#pc瓴_an)pkk'8D-Y# <%vFQ^KN(ג' )P.JЊQG#~ RY(r{ed LIC{0 gI;]5bk٠k+Ǒ,ˌL;T5T% Q<9$`zK³E 4D%Nb ˃Bv$g>ONeg\[8J="0ܰC;9QH10nOZ Jn~ |Hd=@/W|-5$x%ܿ$<\obY0*$, <-eφBKY̟T3>DD׀D.! 9qH7# ku+lOjד r)`|T$N<`3 [[pK1;+w8>2P+c%w+G0l5J|K׭Z3ō9F1O|$)7P4@wB? 6]Pnl'qR"zMƣǣiyZܨf 6nȩ5Jc-(W?0 B? 
IͼFJ\ 44~(I<X|I$5xw@њQTzHhi1S^l4٭!0J!-)(#X@i}o)8VVcs,TRmH-d]0Oj0*g0Y08jp`whc3- select name, setting, unit, short_desc from pg_settings where name ~ 'pgautofailover.'; -[ RECORD 1 ]---------------------------------------------------------------------------------------------------- name | pgautofailover.enable_sync_wal_log_threshold setting | 16777216 unit | short_desc | Don't enable synchronous replication until secondary xlog is within this many bytes of the primary's -[ RECORD 2 ]---------------------------------------------------------------------------------------------------- name | pgautofailover.health_check_max_retries setting | 2 unit | short_desc | Maximum number of re-tries before marking a node as failed. -[ RECORD 3 ]---------------------------------------------------------------------------------------------------- name | pgautofailover.health_check_period setting | 5000 unit | ms short_desc | Duration between each check (in milliseconds). -[ RECORD 4 ]---------------------------------------------------------------------------------------------------- name | pgautofailover.health_check_retry_delay setting | 2000 unit | ms short_desc | Delay between consecutive retries. -[ RECORD 5 ]---------------------------------------------------------------------------------------------------- name | pgautofailover.health_check_timeout setting | 5000 unit | ms short_desc | Connect timeout (in milliseconds). 
-[ RECORD 6 ]---------------------------------------------------------------------------------------------------- name | pgautofailover.node_considered_unhealthy_timeout setting | 20000 unit | ms short_desc | Mark node unhealthy if last ping was over this long ago -[ RECORD 7 ]---------------------------------------------------------------------------------------------------- name | pgautofailover.primary_demote_timeout setting | 30000 unit | ms short_desc | Give the primary this long to drain before promoting the secondary -[ RECORD 8 ]---------------------------------------------------------------------------------------------------- name | pgautofailover.promote_wal_log_threshold setting | 16777216 unit | short_desc | Don't promote secondary unless xlog is with this many bytes of the master -[ RECORD 9 ]---------------------------------------------------------------------------------------------------- name | pgautofailover.startup_grace_period setting | 10000 unit | ms short_desc | Wait for at least this much time after startup before initiating a failover. You can edit the parameters as usual with PostgreSQL, either in the ``postgresql.conf`` file or using ``ALTER DATABASE pg_auto_failover SET parameter = value;`` commands, then issuing a reload. pg_auto_failover Keeper Service ------------------------------- For an introduction to the ``pg_autoctl`` commands relevant to the pg_auto_failover Keeper configuration, please see :ref:`pg_autoctl_config`. 
An example configuration file looks like the following:: [pg_autoctl] role = keeper monitor = postgres://autoctl_node@192.168.1.34:6000/pg_auto_failover formation = default group = 0 hostname = node1.db nodekind = standalone [postgresql] pgdata = /data/pgsql/ pg_ctl = /usr/pgsql-10/bin/pg_ctl dbname = postgres host = /tmp port = 5000 [replication] slot = pgautofailover_standby maximum_backup_rate = 100M backup_directory = /data/backup/node1.db [timeout] network_partition_timeout = 20 postgresql_restart_failure_timeout = 20 postgresql_restart_failure_max_retries = 3 To output, edit and check entries of the configuration, the following commands are provided:: pg_autoctl config check [--pgdata ] pg_autoctl config get [--pgdata ] section.option pg_autoctl config set [--pgdata ] section.option value The ``[postgresql]`` section is discovered automatically by the ``pg_autoctl`` command and is not intended to be changed manually. **pg_autoctl.monitor** PostgreSQL service URL of the pg_auto_failover monitor, as given in the output of the ``pg_autoctl show uri`` command. **pg_autoctl.formation** A single pg_auto_failover monitor may handle several postgres formations. The default formation name `default` is usually fine. **pg_autoctl.group** This information is retrieved by the pg_auto_failover keeper when registering a node to the monitor, and should not be changed afterwards. Use at your own risk. **pg_autoctl.hostname** Node `hostname` used by all the other nodes in the cluster to contact this node. In particular, if this node is a primary then its standby uses that address to setup streaming replication. **replication.slot** Name of the PostgreSQL replication slot used in the streaming replication setup automatically deployed by pg_auto_failover. Replication slots can't be renamed in PostgreSQL. 
**replication.maximum_backup_rate** When pg_auto_failover (re-)builds a standby node using the ``pg_basebackup`` command, this parameter is given to ``pg_basebackup`` to throttle the network bandwidth used. Defaults to 100Mbps. **replication.backup_directory** When pg_auto_failover (re-)builds a standby node using the ``pg_basebackup`` command, this parameter is the target directory where to copy the bits from the primary server. When the copy has been successful, then the directory is renamed to **postgresql.pgdata**. The default value is computed from ``${PGDATA}/../backup/${hostname}`` and can be set to any value of your preference. Remember that the directory renaming is an atomic operation only when both the source and the target of the copy are in the same filesystem, at least in Unix systems. **timeout** This section allows to setup the behavior of the pg_auto_failover keeper in interesting scenarios. **timeout.network_partition_timeout** Timeout in seconds before we consider failure to communicate with other nodes indicates a network partition. This check is only done on a PRIMARY server, so other nodes mean both the monitor and the standby. When a PRIMARY node is detected to be on the losing side of a network partition, the pg_auto_failover keeper enters the DEMOTE state and stops the PostgreSQL instance in order to protect against split brain situations. The default is 20s. .. would be better not to have to do this, but that'll have to do for now .. raw:: latex \newpage **timeout.postgresql_restart_failure_timeout** **timeout.postgresql_restart_failure_max_retries** When PostgreSQL is not running, the first thing the pg_auto_failover keeper does is try to restart it. In case of a transient failure (e.g. file system is full, or other dynamic OS resource constraint), the best course of action is to try again for a little while before reaching out to the monitor and ask for a failover. 
The pg_auto_failover keeper tries to restart PostgreSQL ``timeout.postgresql_restart_failure_max_retries`` times in a row (default 3) or up to ``timeout.postgresql_restart_failure_timeout`` (defaults 20s) since it detected that PostgreSQL is not running, whichever comes first. pg_auto_failover-1.6.3/docs/ref/manual.rst000066400000000000000000000010001414244367200205570ustar00rootroot00000000000000.. _manual: Manual Pages ============ The ``pg_autoctl`` tool hosts many commands and sub-commands. Each of them have their own manual page. .. toctree:: :maxdepth: 1 :caption: Manual Pages: pg_autoctl pg_autoctl_create pg_autoctl_drop pg_autoctl_config pg_autoctl_show pg_autoctl_enable pg_autoctl_disable pg_autoctl_get pg_autoctl_set pg_autoctl_perform pg_autoctl_do pg_autoctl_run pg_autoctl_watch pg_autoctl_stop pg_autoctl_reload pg_autoctl_status pg_auto_failover-1.6.3/docs/ref/pg_autoctl.rst000066400000000000000000000120531414244367200214550ustar00rootroot00000000000000.. _pg_autoctl: pg_autoctl ========== pg_autoctl - control a pg_auto_failover node Synopsis -------- pg_autoctl provides the following commands:: + create Create a pg_auto_failover node, or formation + drop Drop a pg_auto_failover node, or formation + config Manages the pg_autoctl configuration + show Show pg_auto_failover information + enable Enable a feature on a formation + disable Disable a feature on a formation + get Get a pg_auto_failover node, or formation setting + set Set a pg_auto_failover node, or formation setting + perform Perform an action orchestrated by the monitor run Run the pg_autoctl service (monitor or keeper) watch Display a dashboard to watch monitor's events and state stop signal the pg_autoctl service for it to stop reload signal the pg_autoctl for it to reload its configuration status Display the current status of the pg_autoctl service help print help message version print pg_autoctl version pg_autoctl create monitor Initialize a pg_auto_failover monitor node postgres 
Initialize a pg_auto_failover standalone postgres node formation Create a new formation on the pg_auto_failover monitor pg_autoctl drop monitor Drop the pg_auto_failover monitor node Drop a node from the pg_auto_failover monitor formation Drop a formation on the pg_auto_failover monitor pg_autoctl config check Check pg_autoctl configuration get Get the value of a given pg_autoctl configuration variable set Set the value of a given pg_autoctl configuration variable pg_autoctl show uri Show the postgres uri to use to connect to pg_auto_failover nodes events Prints monitor's state of nodes in a given formation and group state Prints monitor's state of nodes in a given formation and group settings Print replication settings for a formation from the monitor standby-names Prints synchronous_standby_names for a given group file List pg_autoctl internal files (config, state, pid) systemd Print systemd service file for this node pg_autoctl enable secondary Enable secondary nodes on a formation maintenance Enable Postgres maintenance mode on this node ssl Enable SSL configuration on this node pg_autoctl disable secondary Disable secondary nodes on a formation maintenance Disable Postgres maintenance mode on this node ssl Disable SSL configuration on this node pg_autoctl get + node get a node property from the pg_auto_failover monitor + formation get a formation property from the pg_auto_failover monitor pg_autoctl get node replication-quorum get replication-quorum property from the monitor candidate-priority get candidate property from the monitor pg_autoctl get formation settings get replication settings for a formation from the monitor number-sync-standbys get number_sync_standbys for a formation from the monitor pg_autoctl set + node set a node property on the monitor + formation set a formation property on the monitor pg_autoctl set node metadata set metadata on the monitor replication-quorum set replication-quorum property on the monitor candidate-priority set candidate 
property on the monitor pg_autoctl set formation number-sync-standbys set number-sync-standbys for a formation on the monitor pg_autoctl perform failover Perform a failover for given formation and group switchover Perform a switchover for given formation and group promotion Perform a failover that promotes a target node Description ----------- The pg_autoctl tool is the client tool provided by pg_auto_failover to create and manage Postgres nodes and the pg_auto_failover monitor node. The command is built with many sub-commands that each have their own manual page. Help ---- To get the full recursive list of supported commands, use:: pg_autoctl help Version ------- To grab the version of pg_autoctl that you're using, use:: pg_autoctl --version pg_autoctl version A typical output would be:: pg_autoctl version 1.4.2 pg_autoctl extension version 1.4 compiled with PostgreSQL 12.3 on x86_64-apple-darwin16.7.0, compiled by Apple LLVM version 8.1.0 (clang-802.0.42), 64-bit compatible with Postgres 10, 11, 12, and 13 The version is also available as a JSON document when using the ``--json`` option:: pg_autoctl --version --json pg_autoctl version --json A typical JSON output would be:: { "pg_autoctl": "1.4.2", "pgautofailover": "1.4", "pg_major": "12", "pg_version": "12.3", "pg_version_str": "PostgreSQL 12.3 on x86_64-apple-darwin16.7.0, compiled by Apple LLVM version 8.1.0 (clang-802.0.42), 64-bit", "pg_version_num": 120003 } This is for version 1.4.2 of pg_auto_failover. This particular version of the pg_autoctl client tool has been compiled using ``libpq`` for PostgreSQL 12.3 and is compatible with Postgres 10, 11, 12, and 13. pg_auto_failover-1.6.3/docs/ref/pg_autoctl_config.rst000066400000000000000000000003421414244367200230000ustar00rootroot00000000000000.. _pg_autoctl_config: pg_autoctl config ================= pg_autoctl config - Manages the pg_autoctl configuration .. 
toctree:: :maxdepth: 1 pg_autoctl_config_get pg_autoctl_config_set pg_autoctl_config_check pg_auto_failover-1.6.3/docs/ref/pg_autoctl_config_check.rst000066400000000000000000000035611414244367200241430ustar00rootroot00000000000000.. _pg_autoctl_config_check: pg_autoctl config check ======================= pg_autoctl config check - Check pg_autoctl configuration Synopsis -------- This command implements a very basic list of sanity checks for a pg_autoctl node setup:: usage: pg_autoctl config check [ --pgdata ] [ --json ] --pgdata path to data directory --json output data in the JSON format Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --json Output JSON formated data. Examples -------- :: $ pg_autoctl config check --pgdata node1 18:37:27 63749 INFO Postgres setup for PGDATA "/Users/dim/dev/MS/pg_auto_failover/tmux/node1" is ok, running with PID 5501 and port 99698 18:37:27 63749 INFO Connection to local Postgres ok, using "port=5501 dbname=demo host=/tmp" 18:37:27 63749 INFO Postgres configuration settings required for pg_auto_failover are ok 18:37:27 63749 WARN Postgres 12.1 does not support replication slots on a standby node 18:37:27 63749 INFO Connection to monitor ok, using "postgres://autoctl_node@localhost:5500/pg_auto_failover?sslmode=prefer" 18:37:27 63749 INFO Monitor is running version "1.5.0.1", as expected pgdata: /Users/dim/dev/MS/pg_auto_failover/tmux/node1 pg_ctl: /Applications/Postgres.app/Contents/Versions/12/bin/pg_ctl pg_version: 12.3 pghost: /tmp pgport: 5501 proxyport: 0 pid: 99698 is in recovery: no Control Version: 1201 Catalog Version: 201909212 System Identifier: 6941034382470571312 Latest checkpoint LSN: 0/6000098 Postmaster status: ready 
pg_auto_failover-1.6.3/docs/ref/pg_autoctl_config_get.rst000066400000000000000000000053121414244367200236410ustar00rootroot00000000000000.. _pg_autoctl_config_get: pg_autoctl config get ===================== pg_autoctl config get - Get the value of a given pg_autoctl configuration variable Synopsis -------- This command prints a ``pg_autoctl`` configuration setting:: usage: pg_autoctl config get [ --pgdata ] [ --json ] [ section.option ] --pgdata path to data directory Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --json Output JSON formated data. Description ----------- When the argument ``section.option`` is used, this is the name of a configuration ooption. The configuration file for ``pg_autoctl`` is stored using the INI format. When no argument is given to ``pg_autoctl config get`` the entire configuration file is given in the output. To figure out where the configuration file is stored, see :ref:`pg_autoctl_show_file` and use ``pg_autoctl show file --config``. 
Examples -------- Without arguments, we get the entire file:: $ pg_autoctl config get --pgdata node1 [pg_autoctl] role = keeper monitor = postgres://autoctl_node@localhost:5500/pg_auto_failover?sslmode=prefer formation = default group = 0 name = node1 hostname = localhost nodekind = standalone [postgresql] pgdata = /Users/dim/dev/MS/pg_auto_failover/tmux/node1 pg_ctl = /Applications/Postgres.app/Contents/Versions/12/bin/pg_ctl dbname = demo host = /tmp port = 5501 proxyport = 0 listen_addresses = * auth_method = trust hba_level = app [ssl] active = 1 sslmode = require cert_file = /Users/dim/dev/MS/pg_auto_failover/tmux/node1/server.crt key_file = /Users/dim/dev/MS/pg_auto_failover/tmux/node1/server.key [replication] maximum_backup_rate = 100M backup_directory = /Users/dim/dev/MS/pg_auto_failover/tmux/backup/node_1 [timeout] network_partition_timeout = 20 prepare_promotion_catchup = 30 prepare_promotion_walreceiver = 5 postgresql_restart_failure_timeout = 20 postgresql_restart_failure_max_retries = 3 It is possible to pipe JSON formated output to the ``jq`` command line and filter the result down to a specific section of the file:: $ pg_autoctl config get --pgdata node1 --json | jq .pg_autoctl { "role": "keeper", "monitor": "postgres://autoctl_node@localhost:5500/pg_auto_failover?sslmode=prefer", "formation": "default", "group": 0, "name": "node1", "hostname": "localhost", "nodekind": "standalone" } Finally, a single configuration element can be listed:: $ pg_autoctl config get --pgdata node1 ssl.sslmode --json require pg_auto_failover-1.6.3/docs/ref/pg_autoctl_config_set.rst000066400000000000000000000164601414244367200236630ustar00rootroot00000000000000.. 
_pg_autoctl_config_set: pg_autoctl config set ===================== pg_autoctl config set - Set the value of a given pg_autoctl configuration variable Synopsis -------- This command prints a ``pg_autoctl`` configuration setting:: usage: pg_autoctl config set [ --pgdata ] [ --json ] section.option [ value ] --pgdata path to data directory Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --json Output JSON formated data. Description ----------- This commands allows to set a pg_autoctl configuration setting to a new value. Most settings are possible to change and can be reloaded online. Some of those commands can then be applied with a ``pg_autoctl reload`` command to an already running process. Settings -------- pg_autoctl.role This setting can not be changed. It can be either ``monitor`` or ``keeper`` and the rest of the configuration file is read depending on this value. pg_autoctl.monitor URI of the pg_autoctl monitor Postgres service. Can be changed with a reload. To register an existing node to a new monitor, use ``pg_autoctl disable monitor`` and then ``pg_autoctl enable monitor``. pg_autoctl.formation Formation to which this node has been registered. Changing this setting is not supported. pg_autoctl.group Group in which this node has been registered. Changing this setting is not supported. pg_autoctl.name Name of the node as known to the monitor and listed in ``pg_autoctl show state``. Can be changed with a reload. pg_autoctl.hostname Hostname or IP address of the node, as known to the monitor. Can be changed with a reload. pg_autoctl.nodekind This setting can not be changed and depends on the command that has been used to create this pg_autoctl node. 
postgresql.pgdata Directory where the managed Postgres instance is to be created (or found) and managed. Can't be changed. postgresql.pg_ctl Path to the ``pg_ctl`` tool used to manage this Postgres instance. Absolute path depends on the major version of Postgres and looks like ``/usr/lib/postgresql/13/bin/pg_ctl`` when using a debian or ubuntu OS. Can be changed after a major upgrade of Postgres. postgresql.dbname Name of the database that is used to connect to Postgres. Can be changed, but then must be changed manually on the monitor's ``pgautofailover.formation`` table with a SQL command. .. warning:: When using pg_auto_failover enterprise edition with Citus support, this is the database where pg_autoctl maintains the list of Citus nodes on the coordinator. Using the same database name as your application that uses Citus is then crucial. postgresql.host Hostname to use in connection strings when connecting from the local ``pg_autoctl`` process to the local Postgres database. Defaults to using the Operating System default value for the Unix Domain Socket directory, either ``/tmp`` or when using debian or ubuntu ``/var/run/postgresql``. Can be changed with a reload. postgresql.port Port on which Postgres should be managed. Can be changed offline, between a ``pg_autoctl stop`` and a subsequent ``pg_autoctl start``. postgresql.listen_addresses Value to set to Postgres parameter of the same name. At the moment ``pg_autoctl`` only supports a single address for this parameter. postgresql.auth_method Authentication method to use when editing HBA rules to allow the Postgres nodes of a formation to connect to each other, and to the monitor, and to allow the monitor to connect to the nodes. Can be changed online with a reload, but actually adding new HBA rules requires a restart of the "node-active" service. postgresql.hba_level This setting reflects the choice of ``--skip-pg-hba`` or ``--pg-hba-lan`` that has been used when creating this pg_autoctl node. 
Can be changed with a reload, though the HBA rules that have been previously added will not get removed. ssl.active, ssl.sslmode, ssl.cert_file, ssl.key_file, etc Please use the command ``pg_autoctl enable ssl`` or ``pg_autoctl disable ssl`` to manage the SSL settings in the ``ssl`` section of the configuration. Using those commands, the settings can be changed online. replication.maximum_backup_rate Used as a parameter to ``pg_basebackup``, defaults to ``100M``. Can be changed with a reload. Changing this value does not affect an already running ``pg_basebackup`` command. Limiting the bandwidth used by ``pg_basebackup`` makes the operation slower, and still has the advantage of limiting the impact on the disks of the primary server. replication.backup_directory Target location of the ``pg_basebackup`` command used by pg_autoctl when creating a secondary node. When done with fetching the data over the network, then pg_autoctl uses the *rename(2)* system-call to rename the temporary download location to the target PGDATA location. The *rename(2)* system-call is known to be atomic when both the source and the target of the operation are using the same file system / mount point. Can be changed online with a reload, will not affect already running ``pg_basebackup`` sub-processes. replication.password Used as a parameter in the connection string to the upstream Postgres node. The "replication" connection uses the password set-up in the pg_autoctl configuration file. Changing the ``replication.password`` of a pg_autoctl configuration has no effect on the Postgres database itself. The password must match what the Postgres upstream node expects, which can be set with the following SQL command run on the upstream server (primary or other standby node):: alter user pgautofailover_replicator password 'h4ckm3m0r3'; The ``replication.password`` can be changed online with a reload, but requires restarting the Postgres service to be activated. 
Postgres only reads the ``primary_conninfo`` connection string at start-up, up to and including Postgres 12. With Postgres 13 and following, it is possible to *reload* this Postgres paramater. timeout.network_partition_timeout Timeout (in seconds) that pg_autoctl waits before deciding that it is on the losing side of a network partition. When pg_autoctl fails to connect to the monitor and when the local Postgres instance ``pg_stat_replication`` system view is empty, and after this many seconds have passed, then pg_autoctl demotes itself. Can be changed with a reload. timeout.prepare_promotion_catchup Currently not used in the source code. Can be changed with a reload. timeout.prepare_promotion_walreceiver Currently not used in the source code. Can be changed with a reload. timeout.postgresql_restart_failure_timeout When pg_autoctl fails to start Postgres for at least this duration from the first attempt, then it starts reporting that Postgres is not running to the monitor, which might then decide to implement a failover. Can be changed with a reload. timeout.postgresql_restart_failure_max_retries When pg_autoctl fails to start Postgres for at least this many times then it starts reporting that Postgres is not running to the monitor, which them might decide to implement a failover. Can be changed with a reload. pg_auto_failover-1.6.3/docs/ref/pg_autoctl_create.rst000066400000000000000000000003671414244367200230050ustar00rootroot00000000000000.. _pg_autoctl_create: pg_autoctl create ================= pg_autoctl create - Create a pg_auto_failover node, or formation .. toctree:: :maxdepth: 1 pg_autoctl_create_monitor pg_autoctl_create_postgres pg_autoctl_create_formation pg_auto_failover-1.6.3/docs/ref/pg_autoctl_create_formation.rst000066400000000000000000000060121414244367200250540ustar00rootroot00000000000000.. 
_pg_autoctl_create_formation: pg_autoctl create formation =========================== pg_autoctl create formation - Create a new formation on the pg_auto_failover monitor Synopsis -------- This command registers a new formation on the monitor, with the specified kind:: usage: pg_autoctl create formation [ --pgdata --monitor --formation --kind --dbname --with-secondary --without-secondary ] --pgdata path to data directory --monitor pg_auto_failover Monitor Postgres URL --formation name of the formation to create --kind formation kind, either "pgsql" or "citus" --dbname name for postgres database to use in this formation --enable-secondary create a formation that has multiple nodes that can be used for fail over when others have issues --disable-secondary create a citus formation without nodes to fail over to --number-sync-standbys minimum number of standbys to confirm write Description ----------- A single pg_auto_failover monitor may manage any number of formations, each composed of at least one Postgres service group. This commands creates a new formation so that it is then possible to register Postgres nodes in the new formation. Options ------- The following options are available to ``pg_autoctl create formation``: --pgdata Location where to initialize a Postgres database cluster, using either ``pg_ctl initdb`` or ``pg_basebackup``. Defaults to the environment variable ``PGDATA``. --monitor Postgres URI used to connect to the monitor. Must use the ``autoctl_node`` username and target the ``pg_auto_failover`` database name. It is possible to show the Postgres URI from the monitor node using the command :ref:`pg_autoctl_show_uri`. --formation Name of the formation to create. --kind A pg_auto_failover formation could be of kind ``pgsql`` or of kind ``citus``. At the moment ``citus`` formation kinds are not managed in the Open Source version of pg_auto_failover. 
--dbname Name of the database to use in the formation, mostly useful to formation kinds ``citus`` where the Citus extension is only installed in a single target database. --enable-secondary The formation to be created allows using standby nodes. Defaults to ``true``. Mostly useful for Citus formations. --disable-secondary See ``--enable-secondary`` above. --number-sync-standby Postgres streaming replication uses ``synchronous_standby_names`` to setup how many standby nodes should have received a copy of the transaction data. When using pg_auto_failover this setup is handled at the formation level. Defaults to zero when creating the first two Postgres nodes in a formation in the same group. When set to zero pg_auto_failover uses synchronous replication only when a standby node is available: the idea is to allow failover, this setting does not allow proper HA for Postgres. When adding a third node that participates in the quorum (one primary, two secondaries), the setting is automatically changed from zero to one. pg_auto_failover-1.6.3/docs/ref/pg_autoctl_create_monitor.rst000066400000000000000000000122451414244367200245520ustar00rootroot00000000000000.. 
_pg_autoctl_create_monitor: pg_autoctl create monitor ========================= pg_autoctl create monitor - Initialize a pg_auto_failover monitor node Synopsis -------- This command initializes a PostgreSQL cluster and installs the `pgautofailover` extension so that it's possible to use the new instance to monitor PostgreSQL services:: usage: pg_autoctl create monitor [ --pgdata --pgport --pgctl --hostname ] --pgctl path to pg_ctl --pgdata path to data directory --pgport PostgreSQL's port number --hostname hostname by which postgres is reachable --auth authentication method for connections from data nodes --skip-pg-hba skip editing pg_hba.conf rules --run create node then run pg_autoctl service --ssl-self-signed setup network encryption using self signed certificates (does NOT protect against MITM) --ssl-mode use that sslmode in connection strings --ssl-ca-file set the Postgres ssl_ca_file to that file path --ssl-crl-file set the Postgres ssl_crl_file to that file path --no-ssl don't enable network encryption (NOT recommended, prefer --ssl-self-signed) --server-key set the Postgres ssl_key_file to that file path --server-cert set the Postgres ssl_cert_file to that file path Description ----------- The pg_autoctl tool is the client tool provided by pg_auto_failover to create and manage Postgres nodes and the pg_auto_failover monitor node. The command is built with many sub-commands that each have their own manual page. Options ------- The following options are available to ``pg_autoctl create monitor``: --pgctl Path to the ``pg_ctl`` tool to use for the version of PostgreSQL you want to use. Defaults to the ``pg_ctl`` found in the PATH when there is a single entry for ``pg_ctl`` in the PATH. Check your setup using ``which -a pg_ctl``. When using an RPM based distribution such as RHEL or CentOS, the path would usually be ``/usr/pgsql-13/bin/pg_ctl`` for Postgres 13. 
When using a debian based distribution such as debian or ubuntu, the path would usually be ``/usr/lib/postgresql/13/bin/pg_ctl`` for Postgres 13. Those distributions also use the package ``postgresql-common`` which provides ``/usr/bin/pg_config``. This tool can be automatically used by ``pg_autoctl`` to discover the default version of Postgres to use on your setup. --pgdata Location where to initialize a Postgres database cluster, using either ``pg_ctl initdb`` or ``pg_basebackup``. Defaults to the environment variable ``PGDATA``. --pgport Postgres port to use, defaults to 5432. --hostname Hostname or IP address (both v4 and v6 are supported) to use from any other node to connect to this node. When not provided, a default value is computed by running the following algorithm. 1. We get this machine's "public IP" by opening a connection to the 8.8.8.8:53 public service. Then we get TCP/IP client address that has been used to make that connection. 2. We then do a reverse DNS lookup on the IP address found in the previous step to fetch a hostname for our local machine. 3. If the reverse DNS lookup is successful , then ``pg_autoctl`` does a forward DNS lookup of that hostname. When the forward DNS lookup response in step 3. is an IP address found in one of our local network interfaces, then ``pg_autoctl`` uses the hostname found in step 2. as the default ``--hostname``. Otherwise it uses the IP address found in step 1. You may use the ``--hostname`` command line option to bypass the whole DNS lookup based process and force the local node name to a fixed value. --auth Authentication method used by ``pg_autoctl`` when editing the Postgres HBA file to open connections to other nodes. No default value, must be provided by the user. The value ``--trust`` is only a good choice for testing and evaluation of pg_auto_failover, see :ref:`security` for more information. --skip-pg-hba When this option is used then ``pg_autoctl`` refrains from any editing of the Postgres HBA file. 
Please note that editing the HBA file is still needed so that other nodes can connect using either read privileges or replication streaming privileges. When ``--skip-pg-hba`` is used, ``pg_autoctl`` still outputs the HBA entries it needs in the logs, it only skips editing the HBA file. --run Immediately run the ``pg_autoctl`` service after having created this node. --ssl-self-signed Generate SSL self-signed certificates to provide network encryption. This does not protect against man-in-the-middle kinds of attacks. See :ref:`security` for more about our SSL settings. --ssl-mode SSL Mode used by ``pg_autoctl`` when connecting to other nodes, including when connecting for streaming replication. --ssl-ca-file Set the Postgres ``ssl_ca_file`` to that file path. --ssl-crl-file Set the Postgres ``ssl_crl_file`` to that file path. --no-ssl Don't enable network encryption. This is not recommended, prefer ``--ssl-self-signed``. --server-key Set the Postgres ``ssl_key_file`` to that file path. --server-cert Set the Postgres ``ssl_cert_file`` to that file path. pg_auto_failover-1.6.3/docs/ref/pg_autoctl_create_postgres.rst000066400000000000000000000272231414244367200247330ustar00rootroot00000000000000.. _pg_autoctl_create_postgres: pg_autoctl create postgres ========================== pg_autoctl create postgres - Initialize a pg_auto_failover postgres node Synopsis -------- The command ``pg_autoctl create postgres`` initializes a standalone Postgres node to a pg_auto_failover monitor. The monitor is then handling auto-failover for this Postgres node (as soon as a secondary has been registered too, and is known to be healthy). 
:: usage: pg_autoctl create postgres --pgctl path to pg_ctl --pgdata path to data directory --pghost PostgreSQL's hostname --pgport PostgreSQL's port number --listen PostgreSQL's listen_addresses --username PostgreSQL's username --dbname PostgreSQL's database name --name pg_auto_failover node name --hostname hostname used to connect from the other nodes --formation pg_auto_failover formation --monitor pg_auto_failover Monitor Postgres URL --auth authentication method for connections from monitor --skip-pg-hba skip editing pg_hba.conf rules --pg-hba-lan edit pg_hba.conf rules for --dbname in detected LAN --ssl-self-signed setup network encryption using self signed certificates (does NOT protect against MITM) --ssl-mode use that sslmode in connection strings --ssl-ca-file set the Postgres ssl_ca_file to that file path --ssl-crl-file set the Postgres ssl_crl_file to that file path --no-ssl don't enable network encryption (NOT recommended, prefer --ssl-self-signed) --server-key set the Postgres ssl_key_file to that file path --server-cert set the Postgres ssl_cert_file to that file path --candidate-priority priority of the node to be promoted to become primary --replication-quorum true if node participates in write quorum --maximum-backup-rate maximum transfer rate of data transferred from the server during initial sync Description ----------- Three different modes of initialization are supported by this command, corresponding to as many implementation strategies. 1. Initialize a primary node from scratch This happens when ``--pgdata`` (or the environment variable ``PGDATA``) points to an non-existing or empty directory. Then the given ``--hostname`` is registered to the pg_auto_failover ``--monitor`` as a member of the ``--formation``. The monitor answers to the registration call with a state to assign to the new member of the group, either *SINGLE* or *WAIT_STANDBY*. 
When the assigned state is *SINGLE*, then ``pg_autoctl create postgres`` proceedes to initialize a new PostgreSQL instance. 2. Initialize an already existing primary server This happens when ``--pgdata`` (or the environment variable ``PGDATA``) points to an already existing directory that belongs to a PostgreSQL instance. The standard PostgreSQL tool ``pg_controldata`` is used to recognize whether the directory belongs to a PostgreSQL instance. In that case, the given ``--hostname`` is registered to the monitor in the tentative *SINGLE* state. When the given ``--formation`` and ``--group`` is currently empty, then the monitor accepts the registration and the ``pg_autoctl create`` prepares the already existing primary server for pg_auto_failover. 3. Initialize a secondary node from scratch This happens when ``--pgdata`` (or the environment variable ``PGDATA``) points to a non-existing or empty directory, and when the monitor registration call assigns the state *WAIT_STANDBY* in step 1. In that case, the ``pg_autoctl create`` command steps through the initial states of registering a secondary server, which includes preparing the primary server PostgreSQL HBA rules and creating a replication slot. When the command ends successfully, a PostgreSQL secondary server has been created with ``pg_basebackup`` and is now started, catching-up to the primary server. 4. Initialize a secondary node from an existing data directory When the data directory pointed to by the option ``--pgdata`` or the environment variable ``PGDATA`` already exists, then pg_auto_failover verifies that the system identifier matches the one of the other nodes already existing in the same group. The system identifier can be obtained with the command ``pg_controldata``. All nodes in a physical replication setting must have the same system identifier, and so in pg_auto_failover all the nodes in a same group have that constraint too. 
When the system identifier matches the already registered system identifier of other nodes in the same group, then the node is set-up as a standby and Postgres is started with the primary conninfo pointed at the current primary. The ``--auth`` option allows setting up authentication method to be used when monitor node makes a connection to data node with ``pgautofailover_monitor`` user. As with the :ref:`pg_autoctl_create_monitor` command, you could use ``--auth trust`` when playing with pg_auto_failover at first and consider something production grade later. Also, consider using ``--skip-pg-hba`` if you already have your own provisioning tools with a security compliance process. See :ref:`security` for notes on `.pgpass` Options ------- The following options are available to ``pg_autoctl create postgres``: --pgctl Path to the ``pg_ctl`` tool to use for the version of PostgreSQL you want to use. Defaults to the ``pg_ctl`` found in the PATH when there is a single entry for ``pg_ctl`` in the PATH. Check your setup using ``which -a pg_ctl``. When using an RPM based distribution such as RHEL or CentOS, the path would usually be ``/usr/pgsql-13/bin/pg_ctl`` for Postgres 13. When using a debian based distribution such as debian or ubuntu, the path would usually be ``/usr/lib/postgresql/13/bin/pg_ctl`` for Postgres 13. Those distributions also use the package ``postgresql-common`` which provides ``/usr/bin/pg_config``. This tool can be automatically used by ``pg_autoctl`` to discover the default version of Postgres to use on your setup. --pgdata Location where to initialize a Postgres database cluster, using either ``pg_ctl initdb`` or ``pg_basebackup``. Defaults to the environment variable ``PGDATA``. --pghost Hostname to use when connecting to the local Postgres instance from the ``pg_autoctl`` process. 
By default, this field is left blank in the connection string, allowing to use Unix Domain Sockets with the default path compiled in your ``libpq`` version, usually provided by the Operating System. That would be ``/var/run/postgresql`` when using debian or ubuntu. --pgport Postgres port to use, defaults to 5432. --listen PostgreSQL's ``listen_addresses`` to setup. At the moment only one address is supported in this command line option. --username PostgreSQL's username to use when connecting to the local Postgres instance to manage it. --dbname PostgreSQL's database name to use in your application. Defaults to being the same as the ``--username``, or to ``postgres`` when none of those options are used. --name Node name used on the monitor to refer to this node. The hostname is a technical information, and given Postgres requirements on the HBA setup and DNS resolution (both forward and reverse lookups), IP addresses are often used for the hostname. The ``--name`` option allows using a user-friendly name for your Postgres nodes. --hostname Hostname or IP address (both v4 and v6 are supported) to use from any other node to connect to this node. When not provided, a default value is computed by running the following algorithm. 1. We get this machine's "public IP" by opening a connection to the given monitor hostname or IP address. Then we get TCP/IP client address that has been used to make that connection. 2. We then do a reverse DNS lookup on the IP address found in the previous step to fetch a hostname for our local machine. 3. If the reverse DNS lookup is successful , then ``pg_autoctl`` does a forward DNS lookup of that hostname. When the forward DNS lookup response in step 3. is an IP address found in one of our local network interfaces, then ``pg_autoctl`` uses the hostname found in step 2. as the default ``--hostname``. Otherwise it uses the IP address found in step 1. 
You may use the ``--hostname`` command line option to bypass the whole DNS lookup based process and force the local node name to a fixed value. --formation Formation to register the node into on the monitor. Defaults to the ``default`` formation, that is automatically created in the monitor in the :ref:`pg_autoctl_create_monitor` command. --monitor Postgres URI used to connect to the monitor. Must use the ``autoctl_node`` username and target the ``pg_auto_failover`` database name. It is possible to show the Postgres URI from the monitor node using the command :ref:`pg_autoctl_show_uri`. --auth Authentication method used by ``pg_autoctl`` when editing the Postgres HBA file to open connections to other nodes. No default value, must be provided by the user. The value ``--trust`` is only a good choice for testing and evaluation of pg_auto_failover, see :ref:`security` for more information. --skip-pg-hba When this option is used then ``pg_autoctl`` refrains from any editing of the Postgres HBA file. Please note that editing the HBA file is still needed so that other nodes can connect using either read privileges or replication streaming privileges. When ``--skip-pg-hba`` is used, ``pg_autoctl`` still outputs the HBA entries it needs in the logs, it only skips editing the HBA file. --pg-hba-lan When this option is used ``pg_autoctl`` determines the local IP address used to connect to the monitor, and retrieves its netmask, and uses that to compute your local area network CIDR. This CIDR is then opened for connections in the Postgres HBA rules. For instance, when the monitor resolves to ``192.168.0.1`` and your local Postgres node uses an inferface with IP address ``192.168.0.2/255.255.255.0`` to connect to the monitor, then the LAN CIDR is computed to be ``192.168.0.0/24``. --candidate-priority Sets this node replication setting for candidate priority to the given value (between 0 and 100) at node registration on the monitor. Defaults to 50. 
--replication-quorum Sets this node replication setting for replication quorum to the given value (either ``true`` or ``false``) at node registration on the monitor. Defaults to ``true``, which enables synchronous replication. --maximum-backup-rate Sets the maximum transfer rate of data transferred from the server during initial sync. This is used by ``pg_basebackup``. Defaults to ``100M``. --run Immediately run the ``pg_autoctl`` service after having created this node. --ssl-self-signed Generate SSL self-signed certificates to provide network encryption. This does not protect against man-in-the-middle kinds of attacks. See :ref:`security` for more about our SSL settings. --ssl-mode SSL Mode used by ``pg_autoctl`` when connecting to other nodes, including when connecting for streaming replication. --ssl-ca-file Set the Postgres ``ssl_ca_file`` to that file path. --ssl-crl-file Set the Postgres ``ssl_crl_file`` to that file path. --no-ssl Don't enable network encryption. This is not recommended, prefer ``--ssl-self-signed``. --server-key Set the Postgres ``ssl_key_file`` to that file path. --server-cert Set the Postgres ``ssl_cert_file`` to that file path. pg_auto_failover-1.6.3/docs/ref/pg_autoctl_disable.rst000066400000000000000000000004171414244367200231410ustar00rootroot00000000000000.. _pg_autoctl_disable: pg_autoctl disable ================== pg_autoctl disable - Disable a feature on a formation .. toctree:: :maxdepth: 1 pg_autoctl_disable_secondary pg_autoctl_disable_maintenance pg_autoctl_disable_ssl pg_autoctl_disable_monitor pg_auto_failover-1.6.3/docs/ref/pg_autoctl_disable_maintenance.rst000066400000000000000000000055761414244367200255160ustar00rootroot00000000000000.. _pg_autoctl_disable_maintenance: pg_autoctl disable maintenance ============================== pg_autoctl disable maintenance - Disable Postgres maintenance mode on this node Synopsis -------- A pg_auto_failover can be put to a maintenance state. 
The Postgres node is then still registered to the monitor, and is known to be unreliable until maintenance is disabled. A node in the maintenance state is not a candidate for promotion. Typical use of the maintenance state include Operating System or Postgres reboot, e.g. when applying security upgrades. :: usage: pg_autoctl disable maintenance [ --pgdata --allow-failover ] --pgdata path to data directory Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --formation Target formation where to disable secondary feature. Examples -------- :: $ pg_autoctl show state Name | Node | Host:Port | LSN | Connection | Current State | Assigned State ------+-------+----------------+-----------+--------------+---------------------+-------------------- node1 | 1 | localhost:5501 | 0/4000810 | read-write | primary | primary node2 | 2 | localhost:5502 | 0/4000810 | read-only | secondary | secondary node3 | 3 | localhost:5503 | 0/4000810 | none | maintenance | maintenance $ pg_autoctl disable maintenance --pgdata node3 12:06:37 47542 INFO Listening monitor notifications about state changes in formation "default" and group 0 12:06:37 47542 INFO Following table displays times when notifications are received Time | Name | Node | Host:Port | Current State | Assigned State ---------+-------+-------+----------------+---------------------+-------------------- 12:06:37 | node3 | 3 | localhost:5503 | maintenance | catchingup 12:06:37 | node3 | 3 | localhost:5503 | catchingup | catchingup 12:06:37 | node3 | 3 | localhost:5503 | catchingup | secondary 12:06:37 | node3 | 3 | localhost:5503 | secondary | secondary $ pg_autoctl show state Name | Node | Host:Port | LSN | Connection | Current State | Assigned State 
------+-------+----------------+-----------+--------------+---------------------+-------------------- node1 | 1 | localhost:5501 | 0/4000848 | read-write | primary | primary node2 | 2 | localhost:5502 | 0/4000848 | read-only | secondary | secondary node3 | 3 | localhost:5503 | 0/4000000 | read-only | secondary | secondary pg_auto_failover-1.6.3/docs/ref/pg_autoctl_disable_monitor.rst000066400000000000000000000055121414244367200247110ustar00rootroot00000000000000.. _pg_autoctl_disable_monitor: pg_autoctl disable monitor ========================== pg_autoctl disable monitor - Disable the monitor for this node Synopsis -------- It is possible to disable the pg_auto_failover monitor and enable it again online in a running pg_autoctl Postgres node. The main use-cases where this operation is useful is when the monitor node has to be replaced, either after a full crash of the previous monitor node, of for migrating to a new monitor node (hardware replacement, region or zone migration, etc). :: usage: pg_autoctl disable monitor [ --pgdata --force ] --pgdata path to data directory --force force unregistering from the monitor Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --force The ``--force`` covers the two following situations: 1. By default, the command expects to be able to connect to the current monitor. When the current known monitor in the setup is not running anymore, use ``--force`` to skip this step. 2. When ``pg_autoctl`` could connect to the monitor and the node is found there, this is normally an error that prevents from disabling the monitor. Using ``--force`` allows the command to drop the node from the monitor and continue with disabling the monitor. 
Examples -------- :: $ pg_autoctl show state Name | Node | Host:Port | LSN | Connection | Current State | Assigned State ------+-------+----------------+-----------+--------------+---------------------+-------------------- node1 | 1 | localhost:5501 | 0/4000148 | read-write | primary | primary node2 | 2 | localhost:5502 | 0/4000148 | read-only | secondary | secondary node3 | 3 | localhost:5503 | 0/4000148 | read-only | secondary | secondary $ pg_autoctl disable monitor --pgdata node3 12:41:21 43039 INFO Found node 3 "node3" (localhost:5503) on the monitor 12:41:21 43039 FATAL Use --force to remove the node from the monitor $ pg_autoctl disable monitor --pgdata node3 --force 12:41:32 43219 INFO Removing node 3 "node3" (localhost:5503) from monitor $ pg_autoctl show state Name | Node | Host:Port | LSN | Connection | Current State | Assigned State ------+-------+----------------+-----------+--------------+---------------------+-------------------- node1 | 1 | localhost:5501 | 0/4000760 | read-write | primary | primary node2 | 2 | localhost:5502 | 0/4000760 | read-only | secondary | secondary pg_auto_failover-1.6.3/docs/ref/pg_autoctl_disable_secondary.rst000066400000000000000000000020541414244367200252070ustar00rootroot00000000000000.. _pg_autoctl_disable_secondary: pg_autoctl disable secondary ============================ pg_autoctl disable secondary - Disable secondary nodes on a formation Synopsis -------- This feature makes the most sense when using the Enterprise Edition of pg_auto_failover, which is fully compatible with Citus formations. When ``secondary`` are disabled, then Citus workers creation policy is to assign a primary node then a standby node for each group. When ``secondary`` is disabled the Citus workers creation policy is to assign only the primary nodes. 
:: usage: pg_autoctl disable secondary [ --pgdata --formation ] --pgdata path to data directory --formation Formation to disable secondary on Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --formation Target formation where to disable secondary feature. pg_auto_failover-1.6.3/docs/ref/pg_autoctl_disable_ssl.rst000066400000000000000000000037451414244367200240310ustar00rootroot00000000000000.. _pg_autoctl_disable_ssl: pg_autoctl disable ssl ====================== pg_autoctl disable ssl - Disable SSL configuration on this node Synopsis -------- It is possible to manage Postgres SSL settings with the ``pg_autoctl`` command, both at :ref:`pg_autoctl_create_postgres` time and then again to change your mind and update the SSL settings at run-time. :: usage: pg_autoctl disable ssl [ --pgdata ] [ --json ] --pgdata path to data directory --ssl-self-signed setup network encryption using self signed certificates (does NOT protect against MITM) --ssl-mode use that sslmode in connection strings --ssl-ca-file set the Postgres ssl_ca_file to that file path --ssl-crl-file set the Postgres ssl_crl_file to that file path --no-ssl don't disable network encryption (NOT recommended, prefer --ssl-self-signed) --server-key set the Postgres ssl_key_file to that file path --server-cert set the Postgres ssl_cert_file to that file path Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --ssl-self-signed Generate SSL self-signed certificates to provide network encryption. This does not protect against man-in-the-middle kinds of attacks. 
See :ref:`security` for more about our SSL settings. --ssl-mode SSL Mode used by ``pg_autoctl`` when connecting to other nodes, including when connecting for streaming replication. --ssl-ca-file Set the Postgres ``ssl_ca_file`` to that file path. --ssl-crl-file Set the Postgres ``ssl_crl_file`` to that file path. --no-ssl Don't disable network encryption. This is not recommended, prefer ``--ssl-self-signed``. --server-key Set the Postgres ``ssl_key_file`` to that file path. --server-cert Set the Postgres ``ssl_cert_file`` to that file path. pg_auto_failover-1.6.3/docs/ref/pg_autoctl_do.rst000066400000000000000000000157371414244367200221530ustar00rootroot00000000000000.. _pg_autoctl_do: pg_autoctl do ============= pg_autoctl do - Internal commands and internal QA tooling The debug commands for ``pg_autoctl`` are only available when the environment variable ``PG_AUTOCTL_DEBUG`` is set (to any value). When testing pg_auto_failover, it is helpful to be able to play with the local nodes using the same lower-level API as used by the pg_auto_failover Finite State Machine transitions. Some commands could be useful in contexts other than pg_auto_failover development and QA work, so some documentation has been made available. .. 
toctree:: :maxdepth: 1 pg_autoctl_do_tmux pg_autoctl_do_demo pg_autoctl_do_service_restart pg_autoctl_do_show pg_autoctl_do_pgsetup The low-level API is made available through the following ``pg_autoctl do`` commands, only available in debug environments:: pg_autoctl do + monitor Query a pg_auto_failover monitor + fsm Manually manage the keeper's state + primary Manage a PostgreSQL primary server + standby Manage a PostgreSQL standby server + show Show some debug level information + pgsetup Manage a local Postgres setup + pgctl Signal the pg_autoctl postgres service + service Run pg_autoctl sub-processes (services) + tmux Set of facilities to handle tmux interactive sessions + azure Manage a set of Azure resources for a pg_auto_failover demo + demo Use a demo application for pg_auto_failover pg_autoctl do monitor + get Get information from the monitor register Register the current node with the monitor active Call in the pg_auto_failover Node Active protocol version Check that monitor version is 1.5.0.1; alter extension update if not parse-notification parse a raw notification message pg_autoctl do monitor get primary Get the primary node from pg_auto_failover in given formation/group others Get the other nodes from the pg_auto_failover group of hostname/port coordinator Get the coordinator node from the pg_auto_failover formation pg_autoctl do fsm init Initialize the keeper's state on-disk state Read the keeper's state from disk and display it list List reachable FSM states from current state gv Output the FSM as a .gv program suitable for graphviz/dot assign Assign a new goal state to the keeper step Make a state transition if instructed by the monitor + nodes Manually manage the keeper's nodes list pg_autoctl do fsm nodes get Get the list of nodes from file (see --disable-monitor) set Set the list of nodes to file (see --disable-monitor) pg_autoctl do primary + slot Manage replication slot on the primary server + adduser Create users on primary defaults Add 
default settings to postgresql.conf identify Run the IDENTIFY_SYSTEM replication command on given host pg_autoctl do primary slot create Create a replication slot on the primary server drop Drop a replication slot on the primary server pg_autoctl do primary adduser monitor add a local user for queries from the monitor replica add a local user with replication privileges pg_autoctl do standby init Initialize the standby server using pg_basebackup rewind Rewind a demoted primary server using pg_rewind promote Promote a standby server to become writable pg_autoctl do show ipaddr Print this node's IP address information cidr Print this node's CIDR information lookup Print this node's DNS lookup information hostname Print this node's default hostname reverse Lookup given hostname and check reverse DNS setup pg_autoctl do pgsetup pg_ctl Find a non-ambiguous pg_ctl program and Postgres version discover Discover local PostgreSQL instance, if any ready Return true is the local Postgres server is ready wait Wait until the local Postgres server is ready logs Outputs the Postgres startup logs tune Compute and log some Postgres tuning options pg_autoctl do pgctl on Signal pg_autoctl postgres service to ensure Postgres is running off Signal pg_autoctl postgres service to ensure Postgres is stopped pg_autoctl do service + getpid Get the pid of pg_autoctl sub-processes (services) + restart Restart pg_autoctl sub-processes (services) pgcontroller pg_autoctl supervised postgres controller postgres pg_autoctl service that start/stop postgres when asked listener pg_autoctl service that listens to the monitor notifications node-active pg_autoctl service that implements the node active protocol pg_autoctl do service getpid postgres Get the pid of the pg_autoctl postgres controller service listener Get the pid of the pg_autoctl monitor listener service node-active Get the pid of the pg_autoctl keeper node-active service pg_autoctl do service restart postgres Restart the pg_autoctl 
postgres controller service listener Restart the pg_autoctl monitor listener service node-active Restart the pg_autoctl keeper node-active service pg_autoctl do tmux script Produce a tmux script for a demo or a test case (debug only) session Run a tmux session for a demo or a test case stop Stop pg_autoctl processes that belong to a tmux session wait Wait until a given node has been registered on the monitor clean Clean-up a tmux session processes and root dir pg_autoctl do azure + provision provision azure resources for a pg_auto_failover demo + tmux Run a tmux session with an Azure setup for QA/testing + show show azure resources for a pg_auto_failover demo deploy Deploy a pg_autoctl VMs, given by name create Create an azure QA environment drop Drop an azure QA environment: resource group, network, VMs ls List resources in a given azure region ssh Runs ssh -l ha-admin for a given VM name sync Rsync pg_auto_failover sources on all the target region VMs pg_autoctl do azure provision region Provision an azure region: resource group, network, VMs nodes Provision our pre-created VM with pg_autoctl Postgres nodes pg_autoctl do azure tmux session Create or attach a tmux session for the created Azure VMs kill Kill an existing tmux session for Azure VMs pg_autoctl do azure show ips Show public and private IP addresses for selected VMs state Connect to the monitor node to show the current state pg_autoctl do demo run Run the pg_auto_failover demo application uri Grab the application connection string from the monitor ping Attempt to connect to the application URI summary Display a summary of the previous demo app run pg_auto_failover-1.6.3/docs/ref/pg_autoctl_do_demo.rst000066400000000000000000000116351414244367200231500ustar00rootroot00000000000000.. 
_pg_autoctl_do_demo: pg_autoctl do demo ================== pg_autoctl do demo - Use a demo application for pg_auto_failover Synopsis -------- pg_autoctl do demo provides the following commands:: pg_autoctl do demo run Run the pg_auto_failover demo application uri Grab the application connection string from the monitor ping Attempt to connect to the application URI summary Display a summary of the previous demo app run To run a demo, use ``pg_autoctl do demo run``:: usage: pg_autoctl do demo run [option ...] --monitor Postgres URI of the pg_auto_failover monitor --formation Formation to use (default) --group Group Id to failover (0) --username PostgreSQL's username --clients How many client processes to use (1) --duration Duration of the demo app, in seconds (30) --first-failover Timing of the first failover (10) --failover-freq Seconds between subsequent failovers (45) Description ----------- The ``pg_autoctl`` debug tooling includes a demo application. The demo prepare its Postgres schema on the target database, and then starts several clients (see ``--clients``) that concurrently connect to the target application URI and record the time it took to establish the Postgres connection to the current read-write node, with information about the retry policy metrics. 
Example ------- :: $ pg_autoctl do demo run --monitor 'postgres://autoctl_node@localhost:5500/pg_auto_failover?sslmode=prefer' --clients 10 14:43:35 19660 INFO Using application connection string "postgres://localhost:5502,localhost:5503,localhost:5501/demo?target_session_attrs=read-write&sslmode=prefer" 14:43:35 19660 INFO Using Postgres user PGUSER "dim" 14:43:35 19660 INFO Preparing demo schema: drop schema if exists demo cascade 14:43:35 19660 WARN NOTICE: schema "demo" does not exist, skipping 14:43:35 19660 INFO Preparing demo schema: create schema demo 14:43:35 19660 INFO Preparing demo schema: create table demo.tracking(ts timestamptz default now(), client integer, loop integer, retries integer, us bigint, recovery bool) 14:43:36 19660 INFO Preparing demo schema: create table demo.client(client integer, pid integer, retry_sleep_ms integer, retry_cap_ms integer, failover_count integer) 14:43:36 19660 INFO Starting 10 concurrent clients as sub-processes 14:43:36 19675 INFO Failover client is started, will failover in 10s and every 45s after that ... 
$ pg_autoctl do demo summary --monitor 'postgres://autoctl_node@localhost:5500/pg_auto_failover?sslmode=prefer' --clients 10 14:44:27 22789 INFO Using application connection string "postgres://localhost:5503,localhost:5501,localhost:5502/demo?target_session_attrs=read-write&sslmode=prefer" 14:44:27 22789 INFO Using Postgres user PGUSER "dim" 14:44:27 22789 INFO Summary for the demo app running with 10 clients for 30s Client | Connections | Retries | Min Connect Time (ms) | max | p95 | p99 ----------------------+-------------+---------+-----------------------+----------+---------+--------- Client 1 | 136 | 14 | 58.318 | 2601.165 | 244.443 | 261.809 Client 2 | 136 | 5 | 55.199 | 2514.968 | 242.362 | 259.282 Client 3 | 134 | 6 | 55.815 | 2974.247 | 241.740 | 262.908 Client 4 | 135 | 7 | 56.542 | 2970.922 | 238.995 | 251.177 Client 5 | 136 | 8 | 58.339 | 2758.106 | 238.720 | 252.439 Client 6 | 134 | 9 | 58.679 | 2813.653 | 244.696 | 254.674 Client 7 | 134 | 11 | 58.737 | 2795.974 | 243.202 | 253.745 Client 8 | 136 | 12 | 52.109 | 2354.952 | 242.664 | 254.233 Client 9 | 137 | 19 | 59.735 | 2628.496 | 235.668 | 253.582 Client 10 | 133 | 6 | 57.994 | 3060.489 | 242.156 | 256.085 All Clients Combined | 1351 | 97 | 52.109 | 3060.489 | 241.848 | 258.450 (11 rows) Min Connect Time (ms) | max | freq | bar -----------------------+----------+------+----------------------------------------------- 52.109 | 219.105 | 1093 | ▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ 219.515 | 267.168 | 248 | ▒▒▒▒▒▒▒▒▒▒ 2354.952 | 2354.952 | 1 | 2514.968 | 2514.968 | 1 | 2601.165 | 2628.496 | 2 | 2758.106 | 2813.653 | 3 | 2970.922 | 2974.247 | 2 | 3060.489 | 3060.489 | 1 | (8 rows) pg_auto_failover-1.6.3/docs/ref/pg_autoctl_do_pgsetup.rst000066400000000000000000000173521414244367200237150ustar00rootroot00000000000000.. 
_pg_autoctl_do_pgsetup: pg_autoctl do pgsetup ===================== pg_autoctl do pgsetup - Manage a local Postgres setup Synopsis -------- The main ``pg_autoctl`` commands implement low-level management tooling for a local Postgres instance. Some of the low-level Postgres commands can be used as their own tool in some cases. pg_autoctl do pgsetup provides the following commands:: pg_autoctl do pgsetup pg_ctl Find a non-ambiguous pg_ctl program and Postgres version discover Discover local PostgreSQL instance, if any ready Return true is the local Postgres server is ready wait Wait until the local Postgres server is ready logs Outputs the Postgres startup logs tune Compute and log some Postgres tuning options pg_autoctl do pgsetup pg_ctl ---------------------------- In a similar way to ``which -a``, this commands scans your PATH for ``pg_ctl`` commands. Then it runs the ``pg_ctl --version`` command and parses the output to determine the version of Postgres that is available in the path. :: $ pg_autoctl do pgsetup pg_ctl --pgdata node1 16:49:18 69684 INFO Environment variable PG_CONFIG is set to "/Applications/Postgres.app//Contents/Versions/12/bin/pg_config" 16:49:18 69684 INFO `pg_autoctl create postgres` would use "/Applications/Postgres.app/Contents/Versions/12/bin/pg_ctl" for Postgres 12.3 16:49:18 69684 INFO `pg_autoctl create monitor` would use "/Applications/Postgres.app/Contents/Versions/12/bin/pg_ctl" for Postgres 12.3 pg_autoctl do pgsetup discover ------------------------------ Given a PGDATA or ``--pgdata`` option, the command discovers if a running Postgres service matches the pg_autoctl setup, and prints the information that ``pg_autoctl`` typically needs when managing a Postgres instance. 
:: $ pg_autoctl do pgsetup discover --pgdata node1 pgdata: /Users/dim/dev/MS/pg_auto_failover/tmux/node1 pg_ctl: /Applications/Postgres.app/Contents/Versions/12/bin/pg_ctl pg_version: 12.3 pghost: /tmp pgport: 5501 proxyport: 0 pid: 21029 is in recovery: no Control Version: 1201 Catalog Version: 201909212 System Identifier: 6942422768095393833 Latest checkpoint LSN: 0/4059C18 Postmaster status: ready pg_autoctl do pgsetup ready --------------------------- Similar to the `pg_isready`__ command, though uses the Postgres specifications found in the pg_autoctl node setup. __ https://www.postgresql.org/docs/current/app-pg-isready.html :: $ pg_autoctl do pgsetup ready --pgdata node1 16:50:08 70582 INFO Postgres status is: "ready" pg_autoctl do pgsetup wait -------------------------- When ``pg_autoctl do pgsetup ready`` would return false because Postgres is not ready yet, this command continues probing every second for 30 seconds, and exists as soon as Postgres is ready. :: $ pg_autoctl do pgsetup wait --pgdata node1 16:50:22 70829 INFO Postgres is now serving PGDATA "/Users/dim/dev/MS/pg_auto_failover/tmux/node1" on port 5501 with pid 21029 16:50:22 70829 INFO Postgres status is: "ready" pg_autoctl do pgsetup logs -------------------------- Outputs the Postgres logs from the most recent log file in the ``PGDATA/log`` directory. 
:: $ pg_autoctl do pgsetup logs --pgdata node1 16:50:39 71126 WARN Postgres logs from "/Users/dim/dev/MS/pg_auto_failover/tmux/node1/startup.log": 16:50:39 71126 INFO 2021-03-22 14:43:48.911 CET [21029] LOG: starting PostgreSQL 12.3 on x86_64-apple-darwin16.7.0, compiled by Apple LLVM version 8.1.0 (clang-802.0.42), 64-bit 16:50:39 71126 INFO 2021-03-22 14:43:48.913 CET [21029] LOG: listening on IPv6 address "::", port 5501 16:50:39 71126 INFO 2021-03-22 14:43:48.913 CET [21029] LOG: listening on IPv4 address "0.0.0.0", port 5501 16:50:39 71126 INFO 2021-03-22 14:43:48.913 CET [21029] LOG: listening on Unix socket "/tmp/.s.PGSQL.5501" 16:50:39 71126 INFO 2021-03-22 14:43:48.931 CET [21029] LOG: redirecting log output to logging collector process 16:50:39 71126 INFO 2021-03-22 14:43:48.931 CET [21029] HINT: Future log output will appear in directory "log". 16:50:39 71126 WARN Postgres logs from "/Users/dim/dev/MS/pg_auto_failover/tmux/node1/log/postgresql-2021-03-22_144348.log": 16:50:39 71126 INFO 2021-03-22 14:43:48.937 CET [21033] LOG: database system was shut down at 2021-03-22 14:43:46 CET 16:50:39 71126 INFO 2021-03-22 14:43:48.937 CET [21033] LOG: entering standby mode 16:50:39 71126 INFO 2021-03-22 14:43:48.942 CET [21033] LOG: consistent recovery state reached at 0/4022E88 16:50:39 71126 INFO 2021-03-22 14:43:48.942 CET [21033] LOG: invalid record length at 0/4022E88: wanted 24, got 0 16:50:39 71126 INFO 2021-03-22 14:43:48.946 CET [21029] LOG: database system is ready to accept read only connections 16:50:39 71126 INFO 2021-03-22 14:43:49.032 CET [21038] LOG: fetching timeline history file for timeline 4 from primary server 16:50:39 71126 INFO 2021-03-22 14:43:49.037 CET [21038] LOG: started streaming WAL from primary at 0/4000000 on timeline 3 16:50:39 71126 INFO 2021-03-22 14:43:49.046 CET [21038] LOG: replication terminated by primary server 16:50:39 71126 INFO 2021-03-22 14:43:49.046 CET [21038] DETAIL: End of WAL reached on timeline 3 at 0/4022E88. 
16:50:39 71126 INFO 2021-03-22 14:43:49.047 CET [21033] LOG: new target timeline is 4 16:50:39 71126 INFO 2021-03-22 14:43:49.049 CET [21038] LOG: restarted WAL streaming at 0/4000000 on timeline 4 16:50:39 71126 INFO 2021-03-22 14:43:49.210 CET [21033] LOG: redo starts at 0/4022E88 16:50:39 71126 INFO 2021-03-22 14:52:06.692 CET [21029] LOG: received SIGHUP, reloading configuration files 16:50:39 71126 INFO 2021-03-22 14:52:06.906 CET [21029] LOG: received SIGHUP, reloading configuration files 16:50:39 71126 FATAL 2021-03-22 15:34:24.920 CET [21038] FATAL: terminating walreceiver due to timeout 16:50:39 71126 INFO 2021-03-22 15:34:24.973 CET [21033] LOG: invalid record length at 0/4059CC8: wanted 24, got 0 16:50:39 71126 INFO 2021-03-22 15:34:25.105 CET [35801] LOG: started streaming WAL from primary at 0/4000000 on timeline 4 16:50:39 71126 FATAL 2021-03-22 16:12:56.918 CET [35801] FATAL: terminating walreceiver due to timeout 16:50:39 71126 INFO 2021-03-22 16:12:57.086 CET [38741] LOG: started streaming WAL from primary at 0/4000000 on timeline 4 16:50:39 71126 FATAL 2021-03-22 16:23:39.349 CET [38741] FATAL: terminating walreceiver due to timeout 16:50:39 71126 INFO 2021-03-22 16:23:39.497 CET [41635] LOG: started streaming WAL from primary at 0/4000000 on timeline 4 pg_autoctl do pgsetup tune -------------------------- Outputs the pg_autoclt automated tuning options. Depending on the number of CPU and amount of RAM detected in the environment where it is run, ``pg_autoctl`` can adjust some very basic Postgres tuning knobs to get started. 
:: $ pg_autoctl do pgsetup tune --pgdata node1 -vv 13:25:25 77185 DEBUG pgtuning.c:85: Detected 12 CPUs and 16 GB total RAM on this server 13:25:25 77185 DEBUG pgtuning.c:225: Setting autovacuum_max_workers to 3 13:25:25 77185 DEBUG pgtuning.c:228: Setting shared_buffers to 4096 MB 13:25:25 77185 DEBUG pgtuning.c:231: Setting work_mem to 24 MB 13:25:25 77185 DEBUG pgtuning.c:235: Setting maintenance_work_mem to 512 MB 13:25:25 77185 DEBUG pgtuning.c:239: Setting effective_cache_size to 12 GB # basic tuning computed by pg_auto_failover track_functions = pl shared_buffers = '4096 MB' work_mem = '24 MB' maintenance_work_mem = '512 MB' effective_cache_size = '12 GB' autovacuum_max_workers = 3 autovacuum_vacuum_scale_factor = 0.08 autovacuum_analyze_scale_factor = 0.02 pg_auto_failover-1.6.3/docs/ref/pg_autoctl_do_service_restart.rst000066400000000000000000000020061414244367200254200ustar00rootroot00000000000000.. _pg_autoctl_do_service_restart: pg_autoctl do service restart ============================= pg_autoctl do service restart - Run pg_autoctl sub-processes (services) Synopsis -------- pg_autoctl do service restart provides the following commands:: pg_autoctl do service restart postgres Restart the pg_autoctl postgres controller service listener Restart the pg_autoctl monitor listener service node-active Restart the pg_autoctl keeper node-active service Description ----------- It is possible to restart the ``pg_autoctl`` or the Postgres service without affecting the other running service. Typically, to restart the ``pg_autoctl`` parts without impacting Postgres:: $ pg_autoctl do service restart node-active --pgdata node1 14:52:06 31223 INFO Sending the TERM signal to service "node-active" with pid 26626 14:52:06 31223 INFO Service "node-active" has been restarted with pid 31230 31230 The Postgres service has not been impacted by the restart of the ``pg_autoctl`` process. 
pg_auto_failover-1.6.3/docs/ref/pg_autoctl_do_show.rst000066400000000000000000000135661414244367200232110ustar00rootroot00000000000000.. _pg_autoctl_do_show: pg_autoctl do show ================== pg_autoctl do show - Show some debug level information Synopsis -------- The commands :ref:`pg_autoctl_create_monitor` and :ref:`pg_autoctl_create_postgres` both implement some level of automated detection of the node network settings when the option ``--hostname`` is not used. Adding to those commands, when a new node is registered to the monitor, other nodes also edit their Postgres HBA rules to allow the new node to connect, unless the option ``--skip-pg-hba`` has been used. The debug sub-commands for ``pg_autoctl do show`` can be used to see in details the network discovery done by ``pg_autoctl``. pg_autoctl do show provides the following commands:: pg_autoctl do show ipaddr Print this node's IP address information cidr Print this node's CIDR information lookup Print this node's DNS lookup information hostname Print this node's default hostname reverse Lookup given hostname and check reverse DNS setup pg_autoctl do show ipaddr ------------------------- Connects to an external IP address and uses ``getsockname(2)`` to retrieve the current address to which the socket is bound. The external IP address defaults to ``8.8.8.8``, the IP address of a Google provided public DNS server, or to the monitor IP address or hostname in the context of :ref:`pg_autoctl_create_postgres`. :: $ pg_autoctl do show ipaddr 16:42:40 62631 INFO ipaddr.c:107: Connecting to 8.8.8.8 (port 53) 192.168.1.156 pg_autoctl do show cidr ----------------------- Connects to an external IP address in the same way as the previous command ``pg_autoctl do show ipaddr`` and then matches the local socket name with the list of local network interfaces. When a match is found, uses the netmask of the interface to compute the CIDR notation from the IP address. The computed CIDR notation is then used in HBA rules. 
:: $ pg_autoctl do show cidr 16:43:19 63319 INFO Connecting to 8.8.8.8 (port 53) 192.168.1.0/24 pg_autoctl do show hostname --------------------------- Uses either its first (and only) argument or the result of ``gethostname(2)`` as the candidate hostname to use in HBA rules, and then check that the hostname resolves to an IP address that belongs to one of the machine network interfaces. When the hostname forward-dns lookup resolves to an IP address that is local to the node where the command is run, then a reverse-lookup from the IP address is made to see if it matches with the candidate hostname. :: $ pg_autoctl do show hostname DESKTOP-IC01GOOS.europe.corp.microsoft.com $ pg_autoctl -vv do show hostname 'postgres://autoctl_node@localhost:5500/pg_auto_failover' 13:45:00 93122 INFO cli_do_show.c:256: Using monitor hostname "localhost" and port 5500 13:45:00 93122 INFO ipaddr.c:107: Connecting to ::1 (port 5500) 13:45:00 93122 DEBUG cli_do_show.c:272: cli_show_hostname: ip ::1 13:45:00 93122 DEBUG cli_do_show.c:283: cli_show_hostname: host localhost 13:45:00 93122 DEBUG cli_do_show.c:294: cli_show_hostname: ip ::1 localhost pg_autoctl do show lookup ------------------------- Checks that the given argument is an hostname that resolves to a local IP address, that is an IP address associated with a local network interface. :: $ pg_autoctl do show lookup DESKTOP-IC01GOOS.europe.corp.microsoft.com DESKTOP-IC01GOOS.europe.corp.microsoft.com: 192.168.1.156 pg_autoctl do show reverse -------------------------- Implements the same DNS checks as Postgres HBA matching code: first does a forward DNS lookup of the given hostname, and then a reverse-lookup from all the IP addresses obtained. Success is reached when at least one of the IP addresses from the forward lookup resolves back to the given hostname (as the first answer to the reverse DNS lookup). 
:: $ pg_autoctl do show reverse DESKTOP-IC01GOOS.europe.corp.microsoft.com 16:44:49 64910 FATAL Failed to find an IP address for hostname "DESKTOP-IC01GOOS.europe.corp.microsoft.com" that matches hostname again in a reverse-DNS lookup. 16:44:49 64910 INFO Continuing with IP address "192.168.1.156" $ pg_autoctl -vv do show reverse DESKTOP-IC01GOOS.europe.corp.microsoft.com 16:44:45 64832 DEBUG ipaddr.c:719: DESKTOP-IC01GOOS.europe.corp.microsoft.com has address 192.168.1.156 16:44:45 64832 DEBUG ipaddr.c:733: reverse lookup for "192.168.1.156" gives "desktop-ic01goos.europe.corp.microsoft.com" first 16:44:45 64832 DEBUG ipaddr.c:719: DESKTOP-IC01GOOS.europe.corp.microsoft.com has address 192.168.1.156 16:44:45 64832 DEBUG ipaddr.c:733: reverse lookup for "192.168.1.156" gives "desktop-ic01goos.europe.corp.microsoft.com" first 16:44:45 64832 DEBUG ipaddr.c:719: DESKTOP-IC01GOOS.europe.corp.microsoft.com has address 2a01:110:10:40c::2ad 16:44:45 64832 DEBUG ipaddr.c:728: Failed to resolve hostname from address "192.168.1.156": nodename nor servname provided, or not known 16:44:45 64832 DEBUG ipaddr.c:719: DESKTOP-IC01GOOS.europe.corp.microsoft.com has address 2a01:110:10:40c::2ad 16:44:45 64832 DEBUG ipaddr.c:728: Failed to resolve hostname from address "192.168.1.156": nodename nor servname provided, or not known 16:44:45 64832 DEBUG ipaddr.c:719: DESKTOP-IC01GOOS.europe.corp.microsoft.com has address 100.64.34.213 16:44:45 64832 DEBUG ipaddr.c:728: Failed to resolve hostname from address "192.168.1.156": nodename nor servname provided, or not known 16:44:45 64832 DEBUG ipaddr.c:719: DESKTOP-IC01GOOS.europe.corp.microsoft.com has address 100.64.34.213 16:44:45 64832 DEBUG ipaddr.c:728: Failed to resolve hostname from address "192.168.1.156": nodename nor servname provided, or not known 16:44:45 64832 FATAL cli_do_show.c:333: Failed to find an IP address for hostname "DESKTOP-IC01GOOS.europe.corp.microsoft.com" that matches hostname again in a reverse-DNS lookup. 
16:44:45 64832 INFO cli_do_show.c:334: Continuing with IP address "192.168.1.156" pg_auto_failover-1.6.3/docs/ref/pg_autoctl_do_tmux.rst000066400000000000000000000032711414244367200232160ustar00rootroot00000000000000.. _pg_autoctl_do_tmux: pg_autoctl do tmux ================== pg_autoctl do tmux - Set of facilities to handle tmux interactive sessions Synopsis -------- pg_autoctl do tmux provides the following commands:: pg_autoctl do tmux script Produce a tmux script for a demo or a test case (debug only) session Run a tmux session for a demo or a test case stop Stop pg_autoctl processes that belong to a tmux session wait Wait until a given node has been registered on the monitor clean Clean-up a tmux session processes and root dir Description ----------- An easy way to get started with pg_auto_failover in a localhost only formation with three nodes is to run the following command:: $ PG_AUTOCTL_DEBUG=1 pg_autoctl do tmux session \ --root /tmp/pgaf \ --first-pgport 9000 \ --nodes 4 \ --layout tiled This requires the command ``tmux`` to be available in your PATH. The ``pg_autoctl do tmux session`` commands prepares a self-contained root directory where to create pg_auto_failover nodes and their configuration, then prepares a tmux script, and then runs the script with a command such as:: /usr/local/bin/tmux -v start-server ; source-file /tmp/pgaf/script-9000.tmux The tmux session contains a single tmux window multiple panes: - one pane for the monitor - one pane per Postgres nodes, here 4 of them - one pane for running ``watch pg_autoctl show state`` - one extra pane for an interactive shell. Usually the first two commands to run in the interactive shell, once the formation is stable (one node is primary, the other ones are all secondary), are the following:: $ pg_autoctl get formation settings $ pg_autoctl perform failover pg_auto_failover-1.6.3/docs/ref/pg_autoctl_drop.rst000066400000000000000000000003431414244367200225000ustar00rootroot00000000000000.. 
_pg_autoctl_drop: pg_autoctl drop =============== pg_autoctl drop - Drop a pg_auto_failover node, or formation .. toctree:: :maxdepth: 1 pg_autoctl_drop_monitor pg_autoctl_drop_node pg_autoctl_drop_formation pg_auto_failover-1.6.3/docs/ref/pg_autoctl_drop_formation.rst000066400000000000000000000020331414244367200245540ustar00rootroot00000000000000.. _pg_autoctl_drop_formation: pg_autoctl drop formation ========================= pg_autoctl drop formation - Drop a formation on the pg_auto_failover monitor Synopsis -------- This command drops an existing formation on the monitor:: usage: pg_autoctl drop formation [ --pgdata --formation ] --pgdata path to data directory --monitor pg_auto_failover Monitor Postgres URL --formation name of the formation to drop Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --monitor Postgres URI used to connect to the monitor. Must use the ``autoctl_node`` username and target the ``pg_auto_failover`` database name. It is possible to show the Postgres URI from the monitor node using the command :ref:`pg_autoctl_show_uri`. --formation Name of the formation to drop from the monitor. pg_auto_failover-1.6.3/docs/ref/pg_autoctl_drop_monitor.rst000066400000000000000000000016001414244367200242440ustar00rootroot00000000000000.. _pg_autoctl_drop_monitor: pg_autoctl drop monitor ======================= pg_autoctl drop monitor - Drop the pg_auto_failover monitor Synopsis -------- This command allows to review all the replication settings of a given formation (defaults to `'default'` as usual):: usage: pg_autoctl drop monitor [ --pgdata --destroy ] --pgdata path to data directory --destroy also destroy Postgres database Options ------- --pgdata Location of the Postgres node being managed locally. 
Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --destroy By default the ``pg_autoctl drop monitor`` commands does not remove the Postgres database for the monitor. When using ``--destroy``, the Postgres installation is also deleted. pg_auto_failover-1.6.3/docs/ref/pg_autoctl_drop_node.rst000066400000000000000000000120731414244367200235100ustar00rootroot00000000000000.. _pg_autoctl_drop_node: pg_autoctl drop node ==================== pg_autoctl drop node - Drop a node from the pg_auto_failover monitor Synopsis -------- This command drops a Postgres node from the pg_auto_failover monitor:: usage: pg_autoctl drop node [ [ [ --pgdata ] [ --destroy ] ] | [ --monitor [ [ --hostname --pgport ] | [ --formation --name ] ] ] ] --pgdata path to data directory --monitor pg_auto_failover Monitor Postgres URL --formation pg_auto_failover formation --name drop the node with the given node name --hostname drop the node with given hostname and pgport --pgport drop the node with given hostname and pgport --destroy also destroy Postgres database --force force dropping the node from the monitor --wait how many seconds to wait, default to 60 Description ----------- Two modes of operations are implemented in the ``pg_autoctl drop node`` command. When removing a node that still exists, it is possible to use ``pg_autoctl drop node --destroy`` to remove the node both from the monitor and also delete the local Postgres instance entirely. When removing a node that doesn't exist physically anymore, or when the VM that used to host the node has been lost entirely, use either the pair of options ``--hostname`` and ``--pgport`` or the pair of options ``--formation`` and ``--name`` to match the node registration record on the monitor database, and get it removed from the known list of nodes on the monitor. 
Then option ``--force`` can be used when the target node to remove does not exist anymore. When a node has been lost entirely, it's not going to be able to finish the procedure itself, and it is then possible to instruct the monitor of the situation. Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --monitor Postgres URI used to connect to the monitor. Must use the ``autoctl_node`` username and target the ``pg_auto_failover`` database name. It is possible to show the Postgres URI from the monitor node using the command :ref:`pg_autoctl_show_uri`. --hostname Hostname of the Postgres node to remove from the monitor. Use either ``--name`` or ``--hostname --pgport``, but not both. --pgport Port of the Postgres node to remove from the monitor. Use either ``--name`` or ``--hostname --pgport``, but not both. --name Name of the node to remove from the monitor. Use either ``--name`` or ``--hostname --pgport``, but not both. --destroy By default the ``pg_autoctl drop monitor`` commands does not remove the Postgres database for the monitor. When using ``--destroy``, the Postgres installation is also deleted. --force By default a node is expected to reach the assigned state DROPPED when it is removed from the monitor, and has the opportunity to implement clean-up actions. When the target node to remove is not available anymore, it is possible to use the option ``--force`` to immediately remove the node from the monitor. --wait How many seconds to wait for the node to be dropped entirely. The command stops when the target node is not to be found on the monitor anymore, or when the timeout has elapsed, whichever comes first. The value 0 (zero) disables the timeout and disables waiting entirely, making the command async. 
Examples -------- :: $ pg_autoctl drop node --destroy --pgdata ./node3 17:52:21 54201 INFO Reaching assigned state "secondary" 17:52:21 54201 INFO Removing node with name "node3" in formation "default" from the monitor 17:52:21 54201 WARN Postgres is not running and we are in state secondary 17:52:21 54201 WARN Failed to update the keeper's state from the local PostgreSQL instance, see above for details. 17:52:21 54201 INFO Calling node_active for node default/4/0 with current state: PostgreSQL is running is false, sync_state is "", latest WAL LSN is 0/0. 17:52:21 54201 INFO FSM transition to "dropped": This node is being dropped from the monitor 17:52:21 54201 INFO Transition complete: current state is now "dropped" 17:52:21 54201 INFO This node with id 4 in formation "default" and group 0 has been dropped from the monitor 17:52:21 54201 INFO Stopping PostgreSQL at "/Users/dim/dev/MS/pg_auto_failover/tmux/node3" 17:52:21 54201 INFO /Applications/Postgres.app/Contents/Versions/12/bin/pg_ctl --pgdata /Users/dim/dev/MS/pg_auto_failover/tmux/node3 --wait stop --mode fast 17:52:21 54201 INFO /Applications/Postgres.app/Contents/Versions/12/bin/pg_ctl status -D /Users/dim/dev/MS/pg_auto_failover/tmux/node3 [3] 17:52:21 54201 INFO pg_ctl: no server running 17:52:21 54201 INFO pg_ctl stop failed, but PostgreSQL is not running anyway 17:52:21 54201 INFO Removing "/Users/dim/dev/MS/pg_auto_failover/tmux/node3" 17:52:21 54201 INFO Removing "/Users/dim/dev/MS/pg_auto_failover/tmux/config/pg_autoctl/Users/dim/dev/MS/pg_auto_failover/tmux/node3/pg_autoctl.cfg" pg_auto_failover-1.6.3/docs/ref/pg_autoctl_enable.rst000066400000000000000000000004061414244367200227620ustar00rootroot00000000000000.. _pg_autoctl_enable: pg_autoctl enable ================= pg_autoctl enable - Enable a feature on a formation .. 
toctree:: :maxdepth: 1 pg_autoctl_enable_secondary pg_autoctl_enable_maintenance pg_autoctl_enable_ssl pg_autoctl_enable_monitor pg_auto_failover-1.6.3/docs/ref/pg_autoctl_enable_maintenance.rst000066400000000000000000000061771414244367200253370ustar00rootroot00000000000000.. _pg_autoctl_enable_maintenance: pg_autoctl enable maintenance ============================= pg_autoctl enable maintenance - Enable Postgres maintenance mode on this node Synopsis -------- A pg_auto_failover can be put to a maintenance state. The Postgres node is then still registered to the monitor, and is known to be unreliable until maintenance is disabled. A node in the maintenance state is not a candidate for promotion. Typical use of the maintenance state include Operating System or Postgres reboot, e.g. when applying security upgrades. :: usage: pg_autoctl enable maintenance [ --pgdata --allow-failover ] --pgdata path to data directory Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --formation Target formation where to enable secondary feature. 
Examples -------- :: pg_autoctl show state Name | Node | Host:Port | LSN | Connection | Current State | Assigned State ------+-------+----------------+-----------+--------------+---------------------+-------------------- node1 | 1 | localhost:5501 | 0/4000760 | read-write | primary | primary node2 | 2 | localhost:5502 | 0/4000760 | read-only | secondary | secondary node3 | 3 | localhost:5503 | 0/4000760 | read-only | secondary | secondary $ pg_autoctl enable maintenance --pgdata node3 12:06:12 47086 INFO Listening monitor notifications about state changes in formation "default" and group 0 12:06:12 47086 INFO Following table displays times when notifications are received Time | Name | Node | Host:Port | Current State | Assigned State ---------+-------+-------+----------------+---------------------+-------------------- 12:06:12 | node1 | 1 | localhost:5501 | primary | join_primary 12:06:12 | node3 | 3 | localhost:5503 | secondary | wait_maintenance 12:06:12 | node3 | 3 | localhost:5503 | wait_maintenance | wait_maintenance 12:06:12 | node1 | 1 | localhost:5501 | join_primary | join_primary 12:06:12 | node3 | 3 | localhost:5503 | wait_maintenance | maintenance 12:06:12 | node1 | 1 | localhost:5501 | join_primary | primary 12:06:13 | node3 | 3 | localhost:5503 | maintenance | maintenance $ pg_autoctl show state Name | Node | Host:Port | LSN | Connection | Current State | Assigned State ------+-------+----------------+-----------+--------------+---------------------+-------------------- node1 | 1 | localhost:5501 | 0/4000810 | read-write | primary | primary node2 | 2 | localhost:5502 | 0/4000810 | read-only | secondary | secondary node3 | 3 | localhost:5503 | 0/4000810 | none | maintenance | maintenance pg_auto_failover-1.6.3/docs/ref/pg_autoctl_enable_monitor.rst000066400000000000000000000050271414244367200245350ustar00rootroot00000000000000.. 
_pg_autoctl_enable_monitor: pg_autoctl enable monitor ========================== pg_autoctl enable monitor - Enable a monitor for this node to be orchestrated from Synopsis -------- It is possible to disable the pg_auto_failover monitor and enable it again online in a running pg_autoctl Postgres node. The main use-cases where this operation is useful is when the monitor node has to be replaced, either after a full crash of the previous monitor node, of for migrating to a new monitor node (hardware replacement, region or zone migration, etc). :: usage: pg_autoctl enable monitor [ --pgdata --allow-failover ] postgres://autoctl_node@new.monitor.add.ress/pg_auto_failover --pgdata path to data directory Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. Examples -------- :: $ pg_autoctl show state Name | Node | Host:Port | LSN | Connection | Current State | Assigned State ------+-------+----------------+-----------+--------------+---------------------+-------------------- node1 | 1 | localhost:5501 | 0/4000760 | read-write | primary | primary node2 | 2 | localhost:5502 | 0/4000760 | read-only | secondary | secondary $ pg_autoctl enable monitor --pgdata node3 'postgres://autoctl_node@localhost:5500/pg_auto_failover?sslmode=require' 12:42:07 43834 INFO Registered node 3 (localhost:5503) with name "node3" in formation "default", group 0, state "wait_standby" 12:42:07 43834 INFO Successfully registered to the monitor with nodeId 3 12:42:08 43834 INFO Still waiting for the monitor to drive us to state "catchingup" 12:42:08 43834 WARN Please make sure that the primary node is currently running `pg_autoctl run` and contacting the monitor. 
$ pg_autoctl show state Name | Node | Host:Port | LSN | Connection | Current State | Assigned State ------+-------+----------------+-----------+--------------+---------------------+-------------------- node1 | 1 | localhost:5501 | 0/4000810 | read-write | primary | primary node2 | 2 | localhost:5502 | 0/4000810 | read-only | secondary | secondary node3 | 3 | localhost:5503 | 0/4000810 | read-only | secondary | secondary pg_auto_failover-1.6.3/docs/ref/pg_autoctl_enable_secondary.rst000066400000000000000000000020431414244367200250300ustar00rootroot00000000000000.. _pg_autoctl_enable_secondary: pg_autoctl enable secondary =========================== pg_autoctl enable secondary - Enable secondary nodes on a formation Synopsis -------- This feature makes the most sense when using the Enterprise Edition of pg_auto_failover, which is fully compatible with Citus formations. When ``secondary`` are enabled, then Citus workers creation policy is to assign a primary node then a standby node for each group. When ``secondary`` is disabled the Citus workers creation policy is to assign only the primary nodes. :: usage: pg_autoctl enable secondary [ --pgdata --formation ] --pgdata path to data directory --formation Formation to enable secondary on Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --formation Target formation where to enable secondary feature. pg_auto_failover-1.6.3/docs/ref/pg_autoctl_enable_ssl.rst000066400000000000000000000037351414244367200236530ustar00rootroot00000000000000.. 
_pg_autoctl_enable_ssl: pg_autoctl enable ssl ===================== pg_autoctl enable ssl - Enable SSL configuration on this node Synopsis -------- It is possible to manage Postgres SSL settings with the ``pg_autoctl`` command, both at :ref:`pg_autoctl_create_postgres` time and then again to change your mind and update the SSL settings at run-time. :: usage: pg_autoctl enable ssl [ --pgdata ] [ --json ] --pgdata path to data directory --ssl-self-signed setup network encryption using self signed certificates (does NOT protect against MITM) --ssl-mode use that sslmode in connection strings --ssl-ca-file set the Postgres ssl_ca_file to that file path --ssl-crl-file set the Postgres ssl_crl_file to that file path --no-ssl don't enable network encryption (NOT recommended, prefer --ssl-self-signed) --server-key set the Postgres ssl_key_file to that file path --server-cert set the Postgres ssl_cert_file to that file path Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --ssl-self-signed Generate SSL self-signed certificates to provide network encryption. This does not protect against man-in-the-middle kinds of attacks. See :ref:`security` for more about our SSL settings. --ssl-mode SSL Mode used by ``pg_autoctl`` when connecting to other nodes, including when connecting for streaming replication. --ssl-ca-file Set the Postgres ``ssl_ca_file`` to that file path. --ssl-crl-file Set the Postgres ``ssl_crl_file`` to that file path. --no-ssl Don't enable network encryption. This is not recommended, prefer ``--ssl-self-signed``. --server-key Set the Postgres ``ssl_key_file`` to that file path. --server-cert Set the Postgres ``ssl_cert_file`` to that file path. 
pg_auto_failover-1.6.3/docs/ref/pg_autoctl_get.rst000066400000000000000000000005001414244367200223060ustar00rootroot00000000000000.. _pg_autoctl_get: pg_autoctl get ============== pg_autoctl get - Get a pg_auto_failover node, or formation setting .. toctree:: :maxdepth: 1 pg_autoctl_get_formation_settings pg_autoctl_get_formation_number_sync_standbys pg_autoctl_get_node_replication_quorum pg_autoctl_get_node_candidate_priority pg_auto_failover-1.6.3/docs/ref/pg_autoctl_get_formation_number_sync_standbys.rst000066400000000000000000000024031414244367200307030ustar00rootroot00000000000000.. _pg_autoctl_get_formation_number_sync_standbys: pg_autoctl get formation number-sync-standbys ============================================= pg_autoctl get formation number-sync-standbys - get number_sync_standbys for a formation from the monitor Synopsis -------- This command prints a ``pg_autoctl`` replication settings for number sync standbys:: usage: pg_autoctl get formation number-sync-standbys [ --pgdata ] [ --json ] [ --formation ] --pgdata path to data directory --json output data in the JSON format --formation pg_auto_failover formation Description ----------- See also :ref:`pg_autoctl_show_settings` for the full list of replication settings. Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --json Output JSON formated data. --formation Show replication settings for given formation. Defaults to ``default``. Examples -------- :: $ pg_autoctl get formation number-sync-standbys 1 $ pg_autoctl get formation number-sync-standbys --json { "number-sync-standbys": 1 } pg_auto_failover-1.6.3/docs/ref/pg_autoctl_get_formation_settings.rst000066400000000000000000000073111414244367200263130ustar00rootroot00000000000000.. 
_pg_autoctl_get_formation_settings: pg_autoctl get formation settings ================================= pg_autoctl get formation settings - get replication settings for a formation from the monitor Synopsis -------- This command prints a ``pg_autoctl`` replication settings:: usage: pg_autoctl get formation settings [ --pgdata ] [ --json ] [ --formation ] --pgdata path to data directory --json output data in the JSON format --formation pg_auto_failover formation Description ----------- See also :ref:`pg_autoctl_show_settings` which is a synonym. Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --json Output JSON formated data. --formation Show replication settings for given formation. Defaults to ``default``. Examples -------- :: $ pg_autoctl get formation settings Context | Name | Setting | Value ----------+---------+---------------------------+------------------------------------------------------------- formation | default | number_sync_standbys | 1 primary | node1 | synchronous_standby_names | 'ANY 1 (pgautofailover_standby_2, pgautofailover_standby_3)' node | node1 | candidate priority | 50 node | node2 | candidate priority | 50 node | node3 | candidate priority | 50 node | node1 | replication quorum | true node | node2 | replication quorum | true node | node3 | replication quorum | true $ pg_autoctl get formation settings --json { "nodes": [ { "value": "true", "context": "node", "node_id": 1, "setting": "replication quorum", "group_id": 0, "nodename": "node1" }, { "value": "true", "context": "node", "node_id": 2, "setting": "replication quorum", "group_id": 0, "nodename": "node2" }, { "value": "true", "context": "node", "node_id": 3, "setting": "replication quorum", "group_id": 0, "nodename": "node3" }, { "value": "50", "context": "node", 
"node_id": 1, "setting": "candidate priority", "group_id": 0, "nodename": "node1" }, { "value": "50", "context": "node", "node_id": 2, "setting": "candidate priority", "group_id": 0, "nodename": "node2" }, { "value": "50", "context": "node", "node_id": 3, "setting": "candidate priority", "group_id": 0, "nodename": "node3" } ], "primary": [ { "value": "'ANY 1 (pgautofailover_standby_2, pgautofailover_standby_3)'", "context": "primary", "node_id": 1, "setting": "synchronous_standby_names", "group_id": 0, "nodename": "node1" } ], "formation": { "value": "1", "context": "formation", "node_id": null, "setting": "number_sync_standbys", "group_id": null, "nodename": "default" } } pg_auto_failover-1.6.3/docs/ref/pg_autoctl_get_node_candidate_priority.rst000066400000000000000000000025471414244367200272650ustar00rootroot00000000000000.. _pg_autoctl_get_node_candidate_priority: pg_autoctl get node candidate-priority ====================================== pg_autoctl get candidate-priority - get candidate-priority property from the monitor Synopsis -------- This command prints ``pg_autoctl`` candidate priority for a given node:: usage: pg_autoctl get node candidate-priority [ --pgdata ] [ --json ] [ --formation ] [ --name ] --pgdata path to data directory --formation pg_auto_failover formation --name pg_auto_failover node name --json output data in the JSON format Description ----------- See also :ref:`pg_autoctl_show_settings` for the full list of replication settings. Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --json Output JSON formated data. --formation Show replication settings for given formation. Defaults to ``default``. --name Show replication settings for given node, selected by name. 
Examples -------- :: $ pg_autoctl get node candidate-priority --name node1 50 $ pg_autoctl get node candidate-priority --name node1 --json { "name": "node1", "candidate-priority": 50 } pg_auto_failover-1.6.3/docs/ref/pg_autoctl_get_node_replication_quorum.rst000066400000000000000000000025531414244367200273260ustar00rootroot00000000000000.. _pg_autoctl_get_node_replication_quorum: pg_autoctl get node replication-quorum ====================================== pg_autoctl get replication-quorum - get replication-quorum property from the monitor Synopsis -------- This command prints ``pg_autoctl`` replication quorun for a given node:: usage: pg_autoctl get node replication-quorum [ --pgdata ] [ --json ] [ --formation ] [ --name ] --pgdata path to data directory --formation pg_auto_failover formation --name pg_auto_failover node name --json output data in the JSON format Description ----------- See also :ref:`pg_autoctl_show_settings` for the full list of replication settings. Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --json Output JSON formated data. --formation Show replication settings for given formation. Defaults to ``default``. --name Show replication settings for given node, selected by name. Examples -------- :: $ pg_autoctl get node replication-quorum --name node1 true $ pg_autoctl get node replication-quorum --name node1 --json { "name": "node1", "replication-quorum": true } pg_auto_failover-1.6.3/docs/ref/pg_autoctl_perform.rst000066400000000000000000000004021414244367200232020ustar00rootroot00000000000000.. _pg_autoctl_perform: pg_autoctl perform ================== pg_autoctl perform - Perform an action orchestrated by the monitor .. 
toctree:: :maxdepth: 1 pg_autoctl_perform_failover pg_autoctl_perform_switchover pg_autoctl_perform_promotion pg_auto_failover-1.6.3/docs/ref/pg_autoctl_perform_failover.rst000066400000000000000000000116661414244367200251070ustar00rootroot00000000000000.. _pg_autoctl_perform_failover: pg_autoctl perform failover =========================== pg_autoctl perform failover - Perform a failover for given formation and group Synopsis -------- This command starts a Postgres failover orchestration from the pg_auto_failover monitor:: usage: pg_autoctl perform failover [ --pgdata --formation --group ] --pgdata path to data directory --formation formation to target, defaults to 'default' --group group to target, defaults to 0 --wait how many seconds to wait, default to 60 Description ----------- The pg_auto_failover monitor can be used to orchestrate a manual failover, sometimes also known as a switchover. When doing so, split-brain are prevented thanks to intermediary states being used in the Finite State Machine. The ``pg_autoctl perform failover`` command waits until the failover is known complete on the monitor, or until the hard-coded 60s timeout has passed. The failover orchestration is done in the background by the monitor, so even if the ``pg_autoctl perform failover`` stops on the timeout, the failover orchestration continues at the monitor. Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --formation Formation to target for the operation. Defaults to ``default``. --group Postgres group to target for the operation. Defaults to ``0``, only Citus formations may have more than one group. --wait How many seconds to wait for notifications about the promotion. 
The command stops when the promotion is finished (a node is primary), or when the timeout has elapsed, whichever comes first. The value 0 (zero) disables the timeout and allows the command to wait forever. Examples -------- :: $ pg_autoctl perform failover 12:57:30 3635 INFO Listening monitor notifications about state changes in formation "default" and group 0 12:57:30 3635 INFO Following table displays times when notifications are received Time | Name | Node | Host:Port | Current State | Assigned State ---------+-------+-------+----------------+---------------------+-------------------- 12:57:30 | node1 | 1 | localhost:5501 | primary | draining 12:57:30 | node1 | 1 | localhost:5501 | draining | draining 12:57:30 | node2 | 2 | localhost:5502 | secondary | report_lsn 12:57:30 | node3 | 3 | localhost:5503 | secondary | report_lsn 12:57:36 | node3 | 3 | localhost:5503 | report_lsn | report_lsn 12:57:36 | node2 | 2 | localhost:5502 | report_lsn | report_lsn 12:57:36 | node2 | 2 | localhost:5502 | report_lsn | prepare_promotion 12:57:36 | node2 | 2 | localhost:5502 | prepare_promotion | prepare_promotion 12:57:36 | node2 | 2 | localhost:5502 | prepare_promotion | stop_replication 12:57:36 | node1 | 1 | localhost:5501 | draining | demote_timeout 12:57:36 | node3 | 3 | localhost:5503 | report_lsn | join_secondary 12:57:36 | node1 | 1 | localhost:5501 | demote_timeout | demote_timeout 12:57:36 | node3 | 3 | localhost:5503 | join_secondary | join_secondary 12:57:37 | node2 | 2 | localhost:5502 | stop_replication | stop_replication 12:57:37 | node2 | 2 | localhost:5502 | stop_replication | wait_primary 12:57:37 | node1 | 1 | localhost:5501 | demote_timeout | demoted 12:57:37 | node1 | 1 | localhost:5501 | demoted | demoted 12:57:37 | node2 | 2 | localhost:5502 | wait_primary | wait_primary 12:57:37 | node3 | 3 | localhost:5503 | join_secondary | secondary 12:57:37 | node1 | 1 | localhost:5501 | demoted | catchingup 12:57:38 | node3 | 3 | localhost:5503 | secondary | 
secondary 12:57:38 | node2 | 2 | localhost:5502 | wait_primary | primary 12:57:38 | node1 | 1 | localhost:5501 | catchingup | catchingup 12:57:38 | node2 | 2 | localhost:5502 | primary | primary $ pg_autoctl show state Name | Node | Host:Port | LSN | Connection | Current State | Assigned State ------+-------+----------------+-----------+--------------+---------------------+-------------------- node1 | 1 | localhost:5501 | 0/4000F50 | read-only | secondary | secondary node2 | 2 | localhost:5502 | 0/4000F50 | read-write | primary | primary node3 | 3 | localhost:5503 | 0/4000F50 | read-only | secondary | secondary pg_auto_failover-1.6.3/docs/ref/pg_autoctl_perform_promotion.rst000066400000000000000000000131051414244367200253140ustar00rootroot00000000000000.. _pg_autoctl_perform_promotion: pg_autoctl perform promotion ============================ pg_autoctl perform promotion - Perform a failover that promotes a target node Synopsis -------- This command starts a Postgres failover orchestration from the pg_auto_promotion monitor and targets given node:: usage: pg_autoctl perform promotion [ --pgdata --formation --group ] --pgdata path to data directory --formation formation to target, defaults to 'default' --name node name to target, defaults to current node --wait how many seconds to wait, default to 60 Description ----------- The pg_auto_promotion monitor can be used to orchestrate a manual promotion, sometimes also known as a switchover. When doing so, split-brain are prevented thanks to intermediary states being used in the Finite State Machine. The ``pg_autoctl perform promotion`` command waits until the promotion is known complete on the monitor, or until the hard-coded 60s timeout has passed. The promotion orchestration is done in the background by the monitor, so even if the ``pg_autoctl perform promotion`` stops on the timeout, the promotion orchestration continues at the monitor. Options ------- --pgdata Location of the Postgres node being managed locally. 
Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --formation Formation to target for the operation. Defaults to ``default``. --name Name of the node that should be elected as the new primary node. --wait How many seconds to wait for notifications about the promotion. The command stops when the promotion is finished (a node is primary), or when the timeout has elapsed, whichever comes first. The value 0 (zero) disables the timeout and allows the command to wait forever. Examples -------- :: $ pg_autoctl show state Name | Node | Host:Port | LSN | Connection | Current State | Assigned State ------+-------+----------------+-----------+--------------+---------------------+-------------------- node1 | 1 | localhost:5501 | 0/4000F88 | read-only | secondary | secondary node2 | 2 | localhost:5502 | 0/4000F88 | read-write | primary | primary node3 | 3 | localhost:5503 | 0/4000F88 | read-only | secondary | secondary $ pg_autoctl perform promotion --name node1 13:08:13 15297 INFO Listening monitor notifications about state changes in formation "default" and group 0 13:08:13 15297 INFO Following table displays times when notifications are received Time | Name | Node | Host:Port | Current State | Assigned State ---------+-------+-------+----------------+---------------------+-------------------- 13:08:13 | node1 | 0/1 | localhost:5501 | secondary | secondary 13:08:13 | node2 | 0/2 | localhost:5502 | primary | draining 13:08:13 | node2 | 0/2 | localhost:5502 | draining | draining 13:08:13 | node1 | 0/1 | localhost:5501 | secondary | report_lsn 13:08:13 | node3 | 0/3 | localhost:5503 | secondary | report_lsn 13:08:19 | node3 | 0/3 | localhost:5503 | report_lsn | report_lsn 13:08:19 | node1 | 0/1 | localhost:5501 | report_lsn | report_lsn 13:08:19 | node1 | 0/1 | localhost:5501 | report_lsn | prepare_promotion 13:08:19 | node1 | 0/1 | 
localhost:5501 | prepare_promotion | prepare_promotion 13:08:19 | node1 | 0/1 | localhost:5501 | prepare_promotion | stop_replication 13:08:19 | node2 | 0/2 | localhost:5502 | draining | demote_timeout 13:08:19 | node3 | 0/3 | localhost:5503 | report_lsn | join_secondary 13:08:19 | node2 | 0/2 | localhost:5502 | demote_timeout | demote_timeout 13:08:19 | node3 | 0/3 | localhost:5503 | join_secondary | join_secondary 13:08:20 | node1 | 0/1 | localhost:5501 | stop_replication | stop_replication 13:08:20 | node1 | 0/1 | localhost:5501 | stop_replication | wait_primary 13:08:20 | node2 | 0/2 | localhost:5502 | demote_timeout | demoted 13:08:20 | node1 | 0/1 | localhost:5501 | wait_primary | wait_primary 13:08:20 | node3 | 0/3 | localhost:5503 | join_secondary | secondary 13:08:20 | node2 | 0/2 | localhost:5502 | demoted | demoted 13:08:20 | node2 | 0/2 | localhost:5502 | demoted | catchingup 13:08:21 | node3 | 0/3 | localhost:5503 | secondary | secondary 13:08:21 | node1 | 0/1 | localhost:5501 | wait_primary | primary 13:08:21 | node2 | 0/2 | localhost:5502 | catchingup | catchingup 13:08:21 | node1 | 0/1 | localhost:5501 | primary | primary $ pg_autoctl show state Name | Node | Host:Port | LSN | Connection | Current State | Assigned State ------+-------+----------------+-----------+--------------+---------------------+-------------------- node1 | 1 | localhost:5501 | 0/40012F0 | read-write | primary | primary node2 | 2 | localhost:5502 | 0/40012F0 | read-only | secondary | secondary node3 | 3 | localhost:5503 | 0/40012F0 | read-only | secondary | secondary pg_auto_failover-1.6.3/docs/ref/pg_autoctl_perform_switchover.rst000066400000000000000000000032271414244367200254670ustar00rootroot00000000000000.. 
_pg_autoctl_perform_switchover: pg_autoctl perform switchover ============================= pg_autoctl perform switchover - Perform a switchover for given formation and group Synopsis -------- This command starts a Postgres switchover orchestration from the pg_auto_switchover monitor:: usage: pg_autoctl perform switchover [ --pgdata --formation --group ] --pgdata path to data directory --formation formation to target, defaults to 'default' --group group to target, defaults to 0 Description ----------- The pg_auto_switchover monitor can be used to orchestrate a manual switchover, sometimes also known as a switchover. When doing so, split-brain are prevented thanks to intermediary states being used in the Finite State Machine. The ``pg_autoctl perform switchover`` command waits until the switchover is known complete on the monitor, or until the hard-coded 60s timeout has passed. The switchover orchestration is done in the background by the monitor, so even if the ``pg_autoctl perform switchover`` stops on the timeout, the switchover orchestration continues at the monitor. See also :ref:`pg_autoctl_perform_failover`, a synonym for this command. Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --formation Formation to target for the operation. Defaults to ``default``. --group Postgres group to target for the operation. Defaults to ``0``, only Citus formations may have more than one group. pg_auto_failover-1.6.3/docs/ref/pg_autoctl_reload.rst000066400000000000000000000016161414244367200230060ustar00rootroot00000000000000.. 
_pg_autoctl_reload: pg_autoctl reload ================= pg_autoctl reload - signal the pg_autoctl for it to reload its configuration Synopsis -------- This commands signals a running ``pg_autoctl`` process to reload its configuration from disk, and also signal the managed Postgres service to reload its configuration. :: usage: pg_autoctl reload [ --pgdata ] [ --json ] --pgdata path to data directory Description ----------- The ``pg_autoctl reload`` commands finds the PID of the running service for the given ``--pgdata``, and if the process is still running, sends a ``SIGHUP`` signal to the process. Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. pg_auto_failover-1.6.3/docs/ref/pg_autoctl_run.rst000066400000000000000000000057051414244367200223470ustar00rootroot00000000000000.. _pg_autoctl_run: pg_autoctl run ============== pg_autoctl run - Run the pg_autoctl service (monitor or keeper) Synopsis -------- This commands starts the processes needed to run a monitor node or a keeper node, depending on the configuration file that belongs to the ``--pgdata`` option or ``PGDATA`` environment variable. :: usage: pg_autoctl run [ --pgdata --name --hostname --pgport ] --pgdata path to data directory --name pg_auto_failover node name --hostname hostname used to connect from other nodes --pgport PostgreSQL's port number Description ----------- When registering Postgres nodes to the pg_auto_failover monitor using the :ref:`pg_autoctl_create_postgres` command, the nodes are registered with metadata: the node name, hostname and Postgres port. The node name is used mostly in the logs and :ref:`pg_autoctl_show_state` commands and helps human administrators of the formation. 
The node hostname and pgport are used by other nodes, including the pg_auto_failover monitor, to open a Postgres connection. Both the node name and the node hostname and port can be changed after the node registration by using either this command (``pg_autoctl run``) or the :ref:`pg_autoctl_config_set` command. Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --name Node name used on the monitor to refer to this node. The hostname is a technical information, and given Postgres requirements on the HBA setup and DNS resolution (both forward and reverse lookups), IP addresses are often used for the hostname. The ``--name`` option allows using a user-friendly name for your Postgres nodes. --hostname Hostname or IP address (both v4 and v6 are supported) to use from any other node to connect to this node. When not provided, a default value is computed by running the following algorithm. 1. We get this machine's "public IP" by opening a connection to the given monitor hostname or IP address. Then we get TCP/IP client address that has been used to make that connection. 2. We then do a reverse DNS lookup on the IP address found in the previous step to fetch a hostname for our local machine. 3. If the reverse DNS lookup is successful , then ``pg_autoctl`` does a forward DNS lookup of that hostname. When the forward DNS lookup response in step 3. is an IP address found in one of our local network interfaces, then ``pg_autoctl`` uses the hostname found in step 2. as the default ``--hostname``. Otherwise it uses the IP address found in step 1. You may use the ``--hostname`` command line option to bypass the whole DNS lookup based process and force the local node name to a fixed value. --pgport Postgres port to use, defaults to 5432. 
pg_auto_failover-1.6.3/docs/ref/pg_autoctl_set.rst000066400000000000000000000004331414244367200223270ustar00rootroot00000000000000.. _pg_autoctl_set: pg_autoctl set ============== pg_autoctl set - Set a pg_auto_failover node, or formation setting .. toctree:: :maxdepth: 1 pg_autoctl_set_formation_number_sync_standbys pg_autoctl_set_node_replication_quorum pg_autoctl_set_node_candidate_priority pg_auto_failover-1.6.3/docs/ref/pg_autoctl_set_formation_number_sync_standbys.rst000066400000000000000000000026561414244367200307310ustar00rootroot00000000000000.. _pg_autoctl_set_formation_number_sync_standbys: pg_autoctl set formation number-sync-standbys ============================================= pg_autoctl set formation number-sync-standbys - set number_sync_standbys for a formation from the monitor Synopsis -------- This command set a ``pg_autoctl`` replication settings for number sync standbys:: usage: pg_autoctl set formation number-sync-standbys [ --pgdata ] [ --json ] [ --formation ] --pgdata path to data directory --formation pg_auto_failover formation --json output data in the JSON format Description ----------- The pg_auto_failover monitor ensures that at least N+1 candidate standby nodes are registered when number-sync-standbys is N. This means that to be able to run the following command, at least 3 standby nodes with a non-zero candidate priority must be registered to the monitor:: $ pg_autoctl set formation number-sync-standbys 2 See also :ref:`pg_autoctl_show_settings` for the full list of replication settings. Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --json Output JSON formated data. --formation Show replication settings for given formation. Defaults to ``default``. 
pg_auto_failover-1.6.3/docs/ref/pg_autoctl_set_node_candidate_priority.rst000066400000000000000000000043061414244367200272740ustar00rootroot00000000000000.. _pg_autoctl_set_node_candidate_priority: pg_autoctl set node candidate-priority ====================================== pg_autoctl set candidate-priority - set candidate-priority property from the monitor Synopsis -------- This command sets the ``pg_autoctl`` candidate priority for a given node:: usage: pg_autoctl set node candidate-priority [ --pgdata ] [ --json ] [ --formation ] [ --name ] --pgdata path to data directory --formation pg_auto_failover formation --name pg_auto_failover node name --json output data in the JSON format Description ----------- See also :ref:`pg_autoctl_show_settings` for the full list of replication settings. Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --json Output JSON formated data. --formation Show replication settings for given formation. Defaults to ``default``. --name Show replication settings for given node, selected by name. Examples -------- :: $ pg_autoctl set node candidate-priority --name node1 65 12:47:59 92326 INFO Waiting for the settings to have been applied to the monitor and primary node 12:47:59 92326 INFO New state is reported by node 1 "node1" (localhost:5501): "apply_settings" 12:47:59 92326 INFO Setting goal state of node 1 "node1" (localhost:5501) to primary after it applied replication properties change. 
12:47:59 92326 INFO New state is reported by node 1 "node1" (localhost:5501): "primary" 65 $ pg_autoctl set node candidate-priority --name node1 50 --json 12:48:05 92450 INFO Waiting for the settings to have been applied to the monitor and primary node 12:48:05 92450 INFO New state is reported by node 1 "node1" (localhost:5501): "apply_settings" 12:48:05 92450 INFO Setting goal state of node 1 "node1" (localhost:5501) to primary after it applied replication properties change. 12:48:05 92450 INFO New state is reported by node 1 "node1" (localhost:5501): "primary" { "candidate-priority": 50 } pg_auto_failover-1.6.3/docs/ref/pg_autoctl_set_node_replication_quorum.rst000066400000000000000000000043061414244367200273400ustar00rootroot00000000000000.. _pg_autoctl_set_node_replication_quorum: pg_autoctl set node replication-quorum ====================================== pg_autoctl set replication-quorum - set replication-quorum property from the monitor Synopsis -------- This command sets ``pg_autoctl`` replication quorum for a given node:: usage: pg_autoctl set node replication-quorum [ --pgdata ] [ --json ] [ --formation ] [ --name ] --pgdata path to data directory --formation pg_auto_failover formation --name pg_auto_failover node name --json output data in the JSON format Description ----------- See also :ref:`pg_autoctl_show_settings` for the full list of replication settings. Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --json Output JSON formated data. --formation Show replication settings for given formation. Defaults to ``default``. --name Show replication settings for given node, selected by name. 
Examples -------- :: $ pg_autoctl set node replication-quorum --name node1 false 12:49:37 94092 INFO Waiting for the settings to have been applied to the monitor and primary node 12:49:37 94092 INFO New state is reported by node 1 "node1" (localhost:5501): "apply_settings" 12:49:37 94092 INFO Setting goal state of node 1 "node1" (localhost:5501) to primary after it applied replication properties change. 12:49:37 94092 INFO New state is reported by node 1 "node1" (localhost:5501): "primary" false $ pg_autoctl set node replication-quorum --name node1 true --json 12:49:42 94199 INFO Waiting for the settings to have been applied to the monitor and primary node 12:49:42 94199 INFO New state is reported by node 1 "node1" (localhost:5501): "apply_settings" 12:49:42 94199 INFO Setting goal state of node 1 "node1" (localhost:5501) to primary after it applied replication properties change. 12:49:43 94199 INFO New state is reported by node 1 "node1" (localhost:5501): "primary" { "replication-quorum": true } pg_auto_failover-1.6.3/docs/ref/pg_autoctl_show.rst000066400000000000000000000005041414244367200225130ustar00rootroot00000000000000.. _pg_autoctl_show: pg_autoctl show =============== pg_autoctl show - Show pg_auto_failover information .. toctree:: :maxdepth: 1 pg_autoctl_show_uri pg_autoctl_show_events pg_autoctl_show_state pg_autoctl_show_settings pg_autoctl_show_standby_names pg_autoctl_show_file pg_autoctl_show_systemd pg_auto_failover-1.6.3/docs/ref/pg_autoctl_show_events.rst000066400000000000000000000062131414244367200241020ustar00rootroot00000000000000.. 
_pg_autoctl_show_events: pg_autoctl show events ====================== pg_autoctl show events - Prints monitor's state of nodes in a given formation and group Synopsis -------- This command outputs the events that the pg_auto_failover events records about state changes of the pg_auto_failover nodes managed by the monitor:: usage: pg_autoctl show events [ --pgdata --formation --group --count ] --pgdata path to data directory --monitor pg_auto_failover Monitor Postgres URL --formation formation to query, defaults to 'default' --group group to query formation, defaults to all --count how many events to fetch, defaults to 10 --watch display an auto-updating dashboard --json output data in the JSON format Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --monitor Postgres URI used to connect to the monitor. Must use the ``autoctl_node`` username and target the ``pg_auto_failover`` database name. It is possible to show the Postgres URI from the monitor node using the command :ref:`pg_autoctl_show_uri`. --formation List the events recorded for nodes in the given formation. Defaults to ``default``. --count By default only the last 10 events are printed. --watch Take control of the terminal and display the current state of the system and the last events from the monitor. The display is updated automatically every 500 milliseconds (half a second) and reacts properly to window size change. Depending on the terminal window size, a different set of columns is visible in the state part of the output. See :ref:`pg_autoctl_watch`. --json Output a JSON formated data instead of a table formatted list. 
Examples -------- :: $ pg_autoctl show events --count 2 --json [ { "nodeid": 1, "eventid": 15, "groupid": 0, "nodehost": "localhost", "nodename": "node1", "nodeport": 5501, "eventtime": "2021-03-18T12:32:36.103467+01:00", "goalstate": "primary", "description": "Setting goal state of node 1 \"node1\" (localhost:5501) to primary now that at least one secondary candidate node is healthy.", "formationid": "default", "reportedlsn": "0/4000060", "reportedstate": "wait_primary", "reportedrepstate": "async", "candidatepriority": 50, "replicationquorum": true }, { "nodeid": 1, "eventid": 16, "groupid": 0, "nodehost": "localhost", "nodename": "node1", "nodeport": 5501, "eventtime": "2021-03-18T12:32:36.215494+01:00", "goalstate": "primary", "description": "New state is reported by node 1 \"node1\" (localhost:5501): \"primary\"", "formationid": "default", "reportedlsn": "0/4000110", "reportedstate": "primary", "reportedrepstate": "quorum", "candidatepriority": 50, "replicationquorum": true } ] pg_auto_failover-1.6.3/docs/ref/pg_autoctl_show_file.rst000066400000000000000000000154351414244367200235230ustar00rootroot00000000000000.. 
_pg_autoctl_show_file: pg_autoctl show file ============================= pg_autoctl show file - List pg_autoctl internal files (config, state, pid) Synopsis -------- This command the files that ``pg_autoctl`` uses internally for its own configuration, state, and pid:: usage: pg_autoctl show file [ --pgdata --all --config | --state | --init | --pid --contents ] --pgdata path to data directory --all show all pg_autoctl files --config show pg_autoctl configuration file --state show pg_autoctl state file --init show pg_autoctl initialisation state file --pid show pg_autoctl PID file --contents show selected file contents --json output data in the JSON format Description ----------- The ``pg_autoctl`` command follows the `XDG Base Directory Specification`__ and places its internal and configuration files by default in places such as ``~/.config/pg_autoctl`` and ``~/.local/share/pg_autoctl``. __ https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html It is possible to change the default XDG locations by using the environment variables ``XDG_CONFIG_HOME``, ``XDG_DATA_HOME``, and ``XDG_RUNTIME_DIR``. Also, ``pg_config`` uses sub-directories that are specific to a given ``PGDATA``, making it possible to run several Postgres nodes on the same machine, which is very practical for testing and development purposes, though not advised for production setups. Configuration File ^^^^^^^^^^^^^^^^^^ The ``pg_autoctl`` configuration file for an instance serving the data directory at ``/data/pgsql`` is found at ``~/.config/pg_autoctl/data/pgsql/pg_autoctl.cfg``, written in the INI format. It is possible to get the location of the configuration file by using the command ``pg_autoctl show file --config --pgdata /data/pgsql`` and to output its content by using the command ``pg_autoctl show file --config --contents --pgdata /data/pgsql``. See also :ref:`pg_autoctl_config_get` and :ref:`pg_autoctl_config_set`. 
State File ^^^^^^^^^^ The ``pg_autoctl`` state file for an instance serving the data directory at ``/data/pgsql`` is found at ``~/.local/share/pg_autoctl/data/pgsql/pg_autoctl.state``, written in a specific binary format. This file is not intended to be written by anything else than ``pg_autoctl`` itself. In case of state corruption, see the trouble shooting section of the documentation. It is possible to get the location of the state file by using the command ``pg_autoctl show file --state --pgdata /data/pgsql`` and to output its content by using the command ``pg_autoctl show file --state --contents --pgdata /data/pgsql``. Init State File ^^^^^^^^^^^^^^^ The ``pg_autoctl`` init state file for an instance serving the data directory at ``/data/pgsql`` is found at ``~/.local/share/pg_autoctl/data/pgsql/pg_autoctl.init``, written in a specific binary format. This file is not intended to be written by anything else than ``pg_autoctl`` itself. In case of state corruption, see the trouble shooting section of the documentation. This initialization state file only exists during the initialization of a pg_auto_failover node. In normal operations, this file does not exist. It is possible to get the location of the state file by using the command ``pg_autoctl show file --init --pgdata /data/pgsql`` and to output its content by using the command ``pg_autoctl show file --init --contents --pgdata /data/pgsql``. PID File ^^^^^^^^ The ``pg_autoctl`` PID file for an instance serving the data directory at ``/data/pgsql`` is found at ``/tmp/pg_autoctl/data/pgsql/pg_autoctl.pid``, written in a specific text format. The PID file is located in a temporary directory by default, or in the ``XDG_RUNTIME_DIR`` directory when this is setup. Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. 
Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --all List all the files that belong to this `pg_autoctl` node. --config Show only the configuration file. --state Show only the state file. --init Show only the init state file, which only exists while the command ``pg_autoctl create postgres`` or the command ``pg_autoctl create monitor`` is running, or when than command failed (and can then be retried). --pid Show only the pid file. --contents When one of the options to show a specific file is in use, then ``--contents`` shows the contents of the selected file instead of showing its absolute file path. --json Output JSON formated data. Examples -------- The following examples are taken from a QA environment that has been prepared thanks to the ``make cluster`` command made available to the pg_auto_failover contributors. As a result, the XDG environment variables have been tweaked to obtain a self-contained test:: $ tmux show-env | grep XDG XDG_CONFIG_HOME=/Users/dim/dev/MS/pg_auto_failover/tmux/config XDG_DATA_HOME=/Users/dim/dev/MS/pg_auto_failover/tmux/share XDG_RUNTIME_DIR=/Users/dim/dev/MS/pg_auto_failover/tmux/run Within that self-contained test location, we can see the following examples. 
:: $ pg_autoctl show file --pgdata ./node1 File | Path --------+---------------- Config | /Users/dim/dev/MS/pg_auto_failover/tmux/config/pg_autoctl/Users/dim/dev/MS/pg_auto_failover/tmux/node1/pg_autoctl.cfg State | /Users/dim/dev/MS/pg_auto_failover/tmux/share/pg_autoctl/Users/dim/dev/MS/pg_auto_failover/tmux/node1/pg_autoctl.state Init | /Users/dim/dev/MS/pg_auto_failover/tmux/share/pg_autoctl/Users/dim/dev/MS/pg_auto_failover/tmux/node1/pg_autoctl.init Pid | /Users/dim/dev/MS/pg_auto_failover/tmux/run/pg_autoctl/Users/dim/dev/MS/pg_auto_failover/tmux/node1/pg_autoctl.pid 'ANY 1 (pgautofailover_standby_2, pgautofailover_standby_3)' $ pg_autoctl show file --pgdata node1 --state /Users/dim/dev/MS/pg_auto_failover/tmux/share/pg_autoctl/Users/dim/dev/MS/pg_auto_failover/tmux/node1/pg_autoctl.state $ pg_autoctl show file --pgdata node1 --state --contents Current Role: primary Assigned Role: primary Last Monitor Contact: Thu Mar 18 17:32:25 2021 Last Secondary Contact: 0 pg_autoctl state version: 1 group: 0 node id: 1 nodes version: 0 PostgreSQL Version: 1201 PostgreSQL CatVersion: 201909212 PostgreSQL System Id: 6940955496243696337 pg_autoctl show file --pgdata node1 --config --contents --json | jq .pg_autoctl { "role": "keeper", "monitor": "postgres://autoctl_node@localhost:5500/pg_auto_failover?sslmode=prefer", "formation": "default", "group": 0, "name": "node1", "hostname": "localhost", "nodekind": "standalone" } pg_auto_failover-1.6.3/docs/ref/pg_autoctl_show_settings.rst000066400000000000000000000100251414244367200244320ustar00rootroot00000000000000.. 
_pg_autoctl_show_settings: pg_autoctl show settings ======================== pg_autoctl show settings - Print replication settings for a formation from the monitor Synopsis -------- This command allows to review all the replication settings of a given formation (defaults to `'default'` as usual):: usage: pg_autoctl show settings [ --pgdata ] [ --json ] [ --formation ] --pgdata path to data directory --monitor pg_auto_failover Monitor Postgres URL --json output data in the JSON format --formation pg_auto_failover formation Description ----------- See also :ref:`pg_autoctl_get_formation_settings` which is a synonym. The output contains setting and values that apply at different contexts, as shown here with a formation of four nodes, where ``node_4`` is not participating in the replication quorum and also not a candidate for failover:: $ pg_autoctl show settings Context | Name | Setting | Value ----------+---------+---------------------------+------------------------------------------------------------- formation | default | number_sync_standbys | 1 primary | node_1 | synchronous_standby_names | 'ANY 1 (pgautofailover_standby_3, pgautofailover_standby_2)' node | node_1 | replication quorum | true node | node_2 | replication quorum | true node | node_3 | replication quorum | true node | node_4 | replication quorum | false node | node_1 | candidate priority | 50 node | node_2 | candidate priority | 50 node | node_3 | candidate priority | 50 node | node_4 | candidate priority | 0 Three replication settings context are listed: 1. The `"formation"` context contains a single entry, the value of ``number_sync_standbys`` for the target formation. 2. The `"primary"` context contains one entry per group of Postgres nodes in the formation, and shows the current value of the ``synchronous_standby_names`` Postgres setting as computed by the monitor. 
It should match what's currently set on the primary node unless while applying a change, as shown by the primary being in the APPLY_SETTING state. 3. The `"node"` context contains two entry per nodes, one line shows the replication quorum setting of nodes, and another line shows the candidate priority of nodes. This command gives an overview of all the settings that apply to the current formation. Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --monitor Postgres URI used to connect to the monitor. Must use the ``autoctl_node`` username and target the ``pg_auto_failover`` database name. It is possible to show the Postgres URI from the monitor node using the command :ref:`pg_autoctl_show_uri`. Defaults to the value of the environment variable ``PG_AUTOCTL_MONITOR``. --formation Show the current replication settings for the given formation. Defaults to the ``default`` formation. --json Output a JSON formated data instead of a table formatted list. Examples -------- :: $ pg_autoctl show settings Context | Name | Setting | Value ----------+---------+---------------------------+------------------------------------------------------------- formation | default | number_sync_standbys | 1 primary | node1 | synchronous_standby_names | 'ANY 1 (pgautofailover_standby_2, pgautofailover_standby_3)' node | node1 | candidate priority | 50 node | node2 | candidate priority | 50 node | node3 | candidate priority | 50 node | node1 | replication quorum | true node | node2 | replication quorum | true node | node3 | replication quorum | true pg_auto_failover-1.6.3/docs/ref/pg_autoctl_show_standby_names.rst000066400000000000000000000037731414244367200254350ustar00rootroot00000000000000.. 
_pg_autoctl_show_standby_names: pg_autoctl show standby-names ============================= pg_autoctl show standby-names - Prints synchronous_standby_names for a given group Synopsis -------- This command prints the current value for synchronous_standby_names for the primary Postgres server of the target group (default ``0``) in the target formation (default ``default``), as computed by the monitor:: usage: pg_autoctl show standby-names [ --pgdata ] --formation --group --pgdata path to data directory --monitor pg_auto_failover Monitor Postgres URL --formation formation to query, defaults to 'default' --group group to query formation, defaults to all --json output data in the JSON format Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --monitor Postgres URI used to connect to the monitor. Must use the ``autoctl_node`` username and target the ``pg_auto_failover`` database name. It is possible to show the Postgres URI from the monitor node using the command :ref:`pg_autoctl_show_uri`. Defaults to the value of the environment variable ``PG_AUTOCTL_MONITOR``. --formation Show the current ``synchronous_standby_names`` value for the given formation. Defaults to the ``default`` formation. --group Show the current ``synchronous_standby_names`` value for the given group in the given formation. Defaults to group ``0``. --json Output a JSON formated data instead of a table formatted list. 
Examples -------- :: $ pg_autoctl show standby-names 'ANY 1 (pgautofailover_standby_2, pgautofailover_standby_3)' $ pg_autoctl show standby-names --json { "formation": "default", "group": 0, "synchronous_standby_names": "ANY 1 (pgautofailover_standby_2, pgautofailover_standby_3)" } pg_auto_failover-1.6.3/docs/ref/pg_autoctl_show_state.rst000066400000000000000000000143111414244367200237140ustar00rootroot00000000000000.. _pg_autoctl_show_state: pg_autoctl show state ===================== pg_autoctl show state - Prints monitor's state of nodes in a given formation and group Synopsis -------- This command outputs the current state of the formation and groups registered to the pg_auto_failover monitor:: usage: pg_autoctl show state [ --pgdata --formation --group ] --pgdata path to data directory --monitor pg_auto_failover Monitor Postgres URL --formation formation to query, defaults to 'default' --group group to query formation, defaults to all --local show local data, do not connect to the monitor --watch display an auto-updating dashboard --json output data in the JSON format Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --monitor Postgres URI used to connect to the monitor. Must use the ``autoctl_node`` username and target the ``pg_auto_failover`` database name. It is possible to show the Postgres URI from the monitor node using the command :ref:`pg_autoctl_show_uri`. --formation List the events recorded for nodes in the given formation. Defaults to ``default``. --group Limit output to a single group in the formation. Default to including all groups registered in the target formation. --local Print the local state information without connecting to the monitor. 
--watch Take control of the terminal and display the current state of the system and the last events from the monitor. The display is updated automatically every 500 milliseconds (half a second) and reacts properly to window size change. Depending on the terminal window size, a different set of columns is visible in the state part of the output. See :ref:`pg_autoctl_watch`. --json Output a JSON formated data instead of a table formatted list. Description ----------- The ``pg_autoctl show state`` output includes the following columns: - Name Name of the node. - Node Node information. When the formation has a single group (group zero), then this column only contains the nodeId. Only Citus formations allow several groups. When using a Citus formation the Node column contains the groupId and the nodeId, separated by a colon, such as ``0:1`` for the first coordinator node. - Host:Port Hostname and port number used to connect to the node. - TLI: LSN Timeline identifier (TLI) and Postgres Log Sequence Number (LSN). The LSN is the current position in the Postgres WAL stream. This is a hexadecimal number. See `pg_lsn`__ for more information. __ https://www.postgresql.org/docs/current/datatype-pg-lsn.html The current `timeline`__ is incremented each time a failover happens, or when doing Point In Time Recovery. A node can only reach the secondary state when it is on the same timeline as its primary node. __ https://www.postgresql.org/docs/current/continuous-archiving.html#BACKUP-TIMELINES - Connection This output field contains two bits of information. First, the Postgres connection type that the node provides, either ``read-write`` or ``read-only``. Then the mark ``!`` is added when the monitor has failed to connect to this node, and ``?`` when the monitor didn't connect to the node yet. - Reported State The latest reported FSM state, as reported to the monitor by the pg_autoctl process running on the Postgres node. - Assigned State The assigned FSM state on the monitor. 
When the assigned state is not the same as the reported start, then the pg_autoctl process running on the Postgres node might have not retrieved the assigned state yet, or might still be implementing the FSM transition from the current state to the assigned state. Examples -------- :: $ pg_autoctl show state Name | Node | Host:Port | TLI: LSN | Connection | Reported State | Assigned State ------+-------+----------------+----------------+--------------+---------------------+-------------------- node1 | 1 | localhost:5501 | 1: 0/4000678 | read-write | primary | primary node2 | 2 | localhost:5502 | 1: 0/4000678 | read-only | secondary | secondary node3 | 3 | localhost:5503 | 1: 0/4000678 | read-only | secondary | secondary $ pg_autoctl show state --local Name | Node | Host:Port | TLI: LSN | Connection | Reported State | Assigned State ------+-------+----------------+----------------+--------------+---------------------+-------------------- node1 | 1 | localhost:5501 | 1: 0/4000678 | read-write ? 
| primary | primary $ pg_autoctl show state --json [ { "health": 1, "node_id": 1, "group_id": 0, "nodehost": "localhost", "nodename": "node1", "nodeport": 5501, "reported_lsn": "0/4000678", "reported_tli": 1, "formation_kind": "pgsql", "candidate_priority": 50, "replication_quorum": true, "current_group_state": "primary", "assigned_group_state": "primary" }, { "health": 1, "node_id": 2, "group_id": 0, "nodehost": "localhost", "nodename": "node2", "nodeport": 5502, "reported_lsn": "0/4000678", "reported_tli": 1, "formation_kind": "pgsql", "candidate_priority": 50, "replication_quorum": true, "current_group_state": "secondary", "assigned_group_state": "secondary" }, { "health": 1, "node_id": 3, "group_id": 0, "nodehost": "localhost", "nodename": "node3", "nodeport": 5503, "reported_lsn": "0/4000678", "reported_tli": 1, "formation_kind": "pgsql", "candidate_priority": 50, "replication_quorum": true, "current_group_state": "secondary", "assigned_group_state": "secondary" } ] pg_auto_failover-1.6.3/docs/ref/pg_autoctl_show_systemd.rst000066400000000000000000000031311414244367200242620ustar00rootroot00000000000000.. _pg_autoctl_show_systemd: pg_autoctl show systemd ======================= pg_autoctl show systemd - Print systemd service file for this node Synopsis -------- This command outputs a configuration unit that is suitable for registering ``pg_autoctl`` as a systemd service. 
Examples -------- :: $ pg_autoctl show systemd --pgdata node1 17:38:29 99778 INFO HINT: to complete a systemd integration, run the following commands: 17:38:29 99778 INFO pg_autoctl -q show systemd --pgdata "node1" | sudo tee /etc/systemd/system/pgautofailover.service 17:38:29 99778 INFO sudo systemctl daemon-reload 17:38:29 99778 INFO sudo systemctl enable pgautofailover 17:38:29 99778 INFO sudo systemctl start pgautofailover [Unit] Description = pg_auto_failover [Service] WorkingDirectory = /Users/dim Environment = 'PGDATA=node1' User = dim ExecStart = /Applications/Postgres.app/Contents/Versions/12/bin/pg_autoctl run Restart = always StartLimitBurst = 0 ExecReload = /Applications/Postgres.app/Contents/Versions/12/bin/pg_autoctl reload [Install] WantedBy = multi-user.target To avoid the logs output, use the ``-q`` option: :: $ pg_autoctl show systemd --pgdata node1 -q [Unit] Description = pg_auto_failover [Service] WorkingDirectory = /Users/dim Environment = 'PGDATA=node1' User = dim ExecStart = /Applications/Postgres.app/Contents/Versions/12/bin/pg_autoctl run Restart = always StartLimitBurst = 0 ExecReload = /Applications/Postgres.app/Contents/Versions/12/bin/pg_autoctl reload [Install] WantedBy = multi-user.target pg_auto_failover-1.6.3/docs/ref/pg_autoctl_show_uri.rst000066400000000000000000000053701414244367200234000ustar00rootroot00000000000000.. _pg_autoctl_show_uri: pg_autoctl show uri =================== pg_autoctl show uri - Show the postgres uri to use to connect to pg_auto_failover nodes Synopsis -------- This command outputs the monitor or the coordinator Postgres URI to use from an application to connect to Postgres:: usage: pg_autoctl show uri [ --pgdata --monitor --formation --json ] --pgdata path to data directory --monitor monitor uri --formation show the coordinator uri of given formation --json output data in the JSON format Options ------- --pgdata Location of the Postgres node being managed locally. 
Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --monitor Postgres URI used to connect to the monitor. Must use the ``autoctl_node`` username and target the ``pg_auto_failover`` database name. It is possible to show the Postgres URI from the monitor node using the command :ref:`pg_autoctl_show_uri`. Defaults to the value of the environment variable ``PG_AUTOCTL_MONITOR``. --formation When ``--formation`` is used, lists the Postgres URIs of all known formations on the monitor. --json Output a JSON formated data instead of a table formatted list. Examples -------- :: $ pg_autoctl show uri Type | Name | Connection String -------------+---------+------------------------------- monitor | monitor | postgres://autoctl_node@localhost:5500/pg_auto_failover formation | default | postgres://localhost:5502,localhost:5503,localhost:5501/demo?target_session_attrs=read-write&sslmode=prefer $ pg_autoctl show uri --formation monitor postgres://autoctl_node@localhost:5500/pg_auto_failover $ pg_autoctl show uri --formation default postgres://localhost:5503,localhost:5502,localhost:5501/demo?target_session_attrs=read-write&sslmode=prefer $ pg_autoctl show uri --json [ { "uri": "postgres://autoctl_node@localhost:5500/pg_auto_failover", "name": "monitor", "type": "monitor" }, { "uri": "postgres://localhost:5503,localhost:5502,localhost:5501/demo?target_session_attrs=read-write&sslmode=prefer", "name": "default", "type": "formation" } ] Multi-hosts Postgres connection strings --------------------------------------- PostgreSQL since version 10 includes support for multiple hosts in its connection driver ``libpq``, with the special ``target_session_attrs`` connection property. This multi-hosts connection string facility allows applications to keep using the same stable connection string over server-side failovers. 
That's why ``pg_autoctl show uri`` uses that format. pg_auto_failover-1.6.3/docs/ref/pg_autoctl_status.rst000066400000000000000000000050311414244367200230560ustar00rootroot00000000000000.. _pg_autoctl_status: pg_autoctl status ================= pg_autoctl status - Display the current status of the pg_autoctl service Synopsis -------- This commands outputs the current process status for the ``pg_autoctl`` service running for the given ``--pgdata`` location. :: usage: pg_autoctl status [ --pgdata ] [ --json ] --pgdata path to data directory --json output data in the JSON format Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --json Output a JSON formated data instead of a table formatted list. Example ------- :: $ pg_autoctl status --pgdata node1 11:26:30 27248 INFO pg_autoctl is running with pid 26618 11:26:30 27248 INFO Postgres is serving PGDATA "/Users/dim/dev/MS/pg_auto_failover/tmux/node1" on port 5501 with pid 26725 $ pg_autoctl status --pgdata node1 --json 11:26:37 27385 INFO pg_autoctl is running with pid 26618 11:26:37 27385 INFO Postgres is serving PGDATA "/Users/dim/dev/MS/pg_auto_failover/tmux/node1" on port 5501 with pid 26725 { "postgres": { "pgdata": "\/Users\/dim\/dev\/MS\/pg_auto_failover\/tmux\/node1", "pg_ctl": "\/Applications\/Postgres.app\/Contents\/Versions\/12\/bin\/pg_ctl", "version": "12.3", "host": "\/tmp", "port": 5501, "proxyport": 0, "pid": 26725, "in_recovery": false, "control": { "version": 0, "catalog_version": 0, "system_identifier": "0" }, "postmaster": { "status": "ready" } }, "pg_autoctl": { "pid": 26618, "status": "running", "pgdata": "\/Users\/dim\/dev\/MS\/pg_auto_failover\/tmux\/node1", "version": "1.5.0", "semId": 196609, "services": [ { "name": "postgres", "pid": 26625, "status": "running", "version": 
"1.5.0", "pgautofailover": "1.5.0.1" }, { "name": "node-active", "pid": 26626, "status": "running", "version": "1.5.0", "pgautofailover": "1.5.0.1" } ] } } pg_auto_failover-1.6.3/docs/ref/pg_autoctl_stop.rst000066400000000000000000000031621414244367200225230ustar00rootroot00000000000000.. _pg_autoctl_stop: pg_autoctl stop =============== pg_autoctl stop - signal the pg_autoctl service for it to stop Synopsis -------- This commands stops the processes needed to run a monitor node or a keeper node, depending on the configuration file that belongs to the ``--pgdata`` option or ``PGDATA`` environment variable. :: usage: pg_autoctl stop [ --pgdata --fast --immediate ] --pgdata path to data directory --fast fast shutdown mode for the keeper --immediate immediate shutdown mode for the keeper Description ----------- The ``pg_autoctl stop`` commands finds the PID of the running service for the given ``--pgdata``, and if the process is still running, sends a ``SIGTERM`` signal to the process. When ``pg_autoclt`` receives a shutdown signal a shutdown sequence is triggered. Depending on the signal received, an operation that has been started (such as a state transition) is either run to completion, stopped as the next opportunity, or stopped immediately even when in the middle of the transition. Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --fast Fast Shutdown mode for ``pg_autoctl``. Sends the ``SIGINT`` signal to the running service, which is the same as using ``C-c`` on an interactive process running as a foreground shell job. --immediate Immediate Shutdown mode for ``pg_autoctl``. Sends the ``SIGQUIT`` signal to the running service. 
pg_auto_failover-1.6.3/docs/ref/pg_autoctl_watch.rst000066400000000000000000000111761414244367200226500ustar00rootroot00000000000000.. _pg_autoctl_watch: pg_autoctl watch ====================== pg_autoctl watch - Display an auto-updating dashboard Synopsis -------- This command outputs the events that the pg_auto_failover events records about state changes of the pg_auto_failover nodes managed by the monitor:: usage: pg_autoctl watch [ --pgdata --formation --group ] --pgdata path to data directory --monitor show the monitor uri --formation formation to query, defaults to 'default' --group group to query formation, defaults to all --json output data in the JSON format Options ------- --pgdata Location of the Postgres node being managed locally. Defaults to the environment variable ``PGDATA``. Use ``--monitor`` to connect to a monitor from anywhere, rather than the monitor URI used by a local Postgres node managed with ``pg_autoctl``. --monitor Postgres URI used to connect to the monitor. Must use the ``autoctl_node`` username and target the ``pg_auto_failover`` database name. It is possible to show the Postgres URI from the monitor node using the command :ref:`pg_autoctl_show_uri`. --formation List the events recorded for nodes in the given formation. Defaults to ``default``. --group Limit output to a single group in the formation. Default to including all groups registered in the target formation. Description ----------- The ``pg_autoctl watch`` output is divided in 3 sections. The first section is a single header line which includes the name of the currently selected formation, the formation replication setting :ref:`number_sync_standbys`, and then in the right most position the current time. The second section displays one line per node, and each line contains a list of columns that describe the current state for the node. This list can includes the following columns, and which columns are part of the output depends on the terminal window size. 
This choice is dynamic and changes if your terminal window size changes: - Name Name of the node. - Node, or Id Node information. When the formation has a single group (group zero), then this column only contains the nodeId. Only Citus formations allow several groups. When using a Citus formation the Node column contains the groupId and the nodeId, separated by a colon, such as ``0:1`` for the first coordinator node. - Last Report, or Report Time interval between now and the last known time when a node has reported to the monitor, using the ``node_active`` protocol. This value is expected to stay under 2s or abouts, and is known to increment when either the ``pg_autoctl run`` service is not running, or when there is a network split. - Last Check, or Check Time inverval between now and the last known time when the monitor could connect to a node's Postgres instance, via its health check mechanism. This value is known to increment when either the Postgres service is not running on the target node, when there is a network split, or when the internal machinery (the health check worker background process) implements jitter. - Host:Port Hostname and port number used to connect to the node. - TLI: LSN Timeline identifier (TLI) and Postgres Log Sequence Number (LSN). The LSN is the current position in the Postgres WAL stream. This is a hexadecimal number. See `pg_lsn`__ for more information. __ https://www.postgresql.org/docs/current/datatype-pg-lsn.html The current `timeline`__ is incremented each time a failover happens, or when doing Point In Time Recovery. A node can only reach the secondary state when it is on the same timeline as its primary node. __ https://www.postgresql.org/docs/current/continuous-archiving.html#BACKUP-TIMELINES - Connection This output field contains two bits of information. First, the Postgres connection type that the node provides, either ``read-write`` or ``read-only``. 
Then the mark ``!`` is added when the monitor has failed to connect to this node, and ``?`` when the monitor didn't connect to the node yet. - Reported State The current FSM state as reported to the monitor by the pg_autoctl process running on the Postgres node. - Assigned State The assigned FSM state on the monitor. When the assigned state is not the same as the reported start, then the pg_autoctl process running on the Postgres node might have not retrieved the assigned state yet, or might still be implementing the FSM transition from the current state to the assigned state. The third and last section lists the most recent events that the monitor has registered, the more recent event is found at the bottom of the screen. To quit the command hit either the ``F1`` key or the ``q`` key. pg_auto_failover-1.6.3/docs/requirements.txt000066400000000000000000000001351414244367200212700ustar00rootroot00000000000000Sphinx==4.0.2 sphinx_rtd_theme_citus==0.5.16 docutils==0.16 readthedocs-sphinx-search==0.1.0 pg_auto_failover-1.6.3/docs/security.rst000066400000000000000000000353411414244367200204140ustar00rootroot00000000000000.. _security: Security settings for pg_auto_failover ====================================== In order to be able to orchestrate fully automated failovers, pg_auto_failover needs to be able to establish the following Postgres connections: - from the monitor node to each Postgres node to check the node's “health” - from each Postgres node to the monitor to implement our `node_active` protocol and fetch the current assigned state for this node - from the secondary node to the primary node for Postgres streaming replication. Postgres Client authentication is controlled by a configuration file: ``pg_hba.conf``. This file contains a list of rules where each rule may allow or reject a connection attempt. For pg_auto_failover to work as intended, some HBA rules need to be added to each node configuration. 
You can choose to provision the ``pg_hba.conf`` file yourself thanks to ``pg_autoctl`` options' ``--skip-pg-hba``, or you can use the following options to control which kind of rules are going to be added for you. Postgres HBA rules ------------------ For your application to be able to connect to the current Postgres primary servers, some application specific HBA rules have to be added to ``pg_hba.conf``. There is no provision for doing that in pg_auto_failover. In other words, it is expected that you have to edit ``pg_hba.conf`` to open connections for your application needs. The trust security model ------------------------ As its name suggests the trust security model is not enabling any kind of security validation. This setting is popular for testing deployments though, as it makes it very easy to verify that everything works as intended before putting security restrictions in place. To enable a “trust” security model with pg_auto_failover, use the ``pg_autoctl`` option ``--auth trust`` when creating nodes:: $ pg_autoctl create monitor --auth trust ... $ pg_autoctl create postgres --auth trust ... $ pg_autoctl create postgres --auth trust ... When using ``--auth trust`` pg_autoctl adds new HBA rules in the monitor and the Postgres nodes to enable connections as seen above. Authentication with passwords ----------------------------- To setup pg_auto_failover with password for connections, you can use one of the password based authentication methods supported by Postgres, such as ``password`` or ``scram-sha-256``. We recommend the latter, as in the following example:: $ pg_autoctl create monitor --auth scram-sha-256 ... The ``pg_autoctl`` does not set the password for you. 
The first step is to set the database user password in the monitor database thanks to the following command:: $ psql postgres://monitor.host/pg_auto_failover > alter user autoctl_node password 'h4ckm3'; Now that the monitor is ready with our password set for the ``autoctl_node`` user, we can use the password in the monitor connection string used when creating Postgres nodes. On the primary node, we can create the Postgres setup as usual, and then set our replication password, that we will use if we are demoted and then re-join as a standby:: $ pg_autoctl create postgres \ --auth scram-sha-256 \ ... \ --monitor postgres://autoctl_node:h4ckm3@monitor.host/pg_auto_failover $ pg_autoctl config set replication.password h4ckm3m0r3 The second Postgres node is going to be initialized as a secondary and ``pg_autoctl`` then calls ``pg_basebackup`` at create time. We need to have the replication password already set at this time, and we can achieve that the following way:: $ export PGPASSWORD=h4ckm3m0r3 $ pg_autoctl create postgres \ --auth scram-sha-256 \ ... \ --monitor postgres://autoctl_node:h4ckm3@monitor.host/pg_auto_failover $ pg_autoctl config set replication.password h4ckm3m0r3 Note that you can use `The Password File`__ mechanism as discussed in the Postgres documentation in order to maintain your passwords in a separate file, not in your main pg_auto_failover configuration file. This also avoids using passwords in the environment and in command lines. __ https://www.postgresql.org/docs/current/libpq-pgpass.html Encryption of network communications ------------------------------------ Postgres knows how to use SSL to enable network encryption of all communications, including authentication with passwords and the whole data set when streaming replication is used. To enable SSL on the server an SSL certificate is needed. 
It could be as simple as a self-signed certificate, and ``pg_autoctl`` creates such a certificate for you when using ``--ssl-self-signed`` command line option:: $ pg_autoctl create monitor --ssl-self-signed ... \ --auth scram-sha-256 ... \ --ssl-mode require \ ... $ pg_autoctl create postgres --ssl-self-signed ... \ --auth scram-sha-256 ... \ ... $ pg_autoctl create postgres --ssl-self-signed ... \ --auth scram-sha-256 ... \ ... In that example we setup SSL connections to encrypt the network traffic, and we still have to setup an authentication mechanism exactly as in the previous sections of this document. Here ``scram-sha-256`` has been selected, and the password will be sent over an encrypted channel. When using the ``--ssl-self-signed`` option, ``pg_autoctl`` creates a self-signed certificate, as per the Postgres documentation at the `Creating Certificates`__ page. __ https://www.postgresql.org/docs/current/ssl-tcp.html#SSL-CERTIFICATE-CREATION The certificate subject CN defaults to the ``--hostname`` parameter, which can be given explicitely or computed by ``pg_autoctl`` as either your hostname when you have proper DNS resolution, or your current IP address. Self-signed certificates provide protection against eavesdropping; this setup does NOT protect against Man-In-The-Middle attacks nor Impersonation attacks. See PostgreSQL documentation page `SSL Support`__ for details. __ https://www.postgresql.org/docs/current/libpq-ssl.html Using your own SSL certificates ------------------------------- In many cases you will want to install certificates provided by your local security department and signed by a trusted Certificate Authority. In that case one solution is to use ``--skip-pg-hba`` and do the whole setup yourself. 
It is still possible to give the certificates to pg_auto_failover and have it handle the Postgres setup for you:: $ pg_autoctl create monitor --ssl-ca-file root.crt \ --ssl-crl-file root.crl \ --server-cert server.crt \ --server-key server.key \ --ssl-mode verify-full \ ... $ pg_autoctl create postgres --ssl-ca-file root.crt \ --server-cert server.crt \ --server-key server.key \ --ssl-mode verify-full \ ... $ pg_autoctl create postgres --ssl-ca-file root.crt \ --server-cert server.crt \ --server-key server.key \ --ssl-mode verify-full \ ... The option ``--ssl-mode`` can be used to force connection strings used by ``pg_autoctl`` to contain your prefered ssl mode. It defaults to ``require`` when using ``--ssl-self-signed`` and to ``allow`` when ``--no-ssl`` is used. Here, we set ``--ssl-mode`` to ``verify-full`` which requires SSL Certificates Authentication, covered next. The default ``--ssl-mode`` when providing your own certificates (signed by your trusted CA) is then ``verify-full``. This setup applies to the client connection where the server identity is going to be checked against the root certificate provided with ``--ssl-ca-file`` and the revocation list optionally provided with the ``--ssl-crl-file``. Both those files are used as the respective parameters ``sslrootcert`` and ``sslcrl`` in pg_autoctl connection strings to both the monitor and the streaming replication primary server. SSL Certificates Authentication ------------------------------- Given those files, it is then possible to use certificate based authentication of client connections. 
For that, it is necessary to prepare client certificates signed by your root certificate private key and using the target user name as its CN, as per Postgres documentation for `Certificate Authentication`__: The cn (Common Name) attribute of the certificate will be compared to the requested database user name, and if they match the login will be allowed __ https://www.postgresql.org/docs/current/auth-cert.html For enabling the `cert` authentication method with pg_auto_failover, you need to prepare a `Client Certificate`__ for the user ``postgres`` and used by pg_autoctl when connecting to the monitor, to place in ``~/.postgresql/postgresql.crt`` along with its key ``~/.postgresql/postgresql.key``, in the home directory of the user that runs the pg_autoctl service (which defaults to ``postgres``). __ https://www.postgresql.org/docs/current/libpq-ssl.html#LIBPQ-SSL-CLIENTCERT Then you need to create a user name map as documented in Postgres page `User Name Maps`__ so that your certificate can be used to authenticate pg_autoctl users. __ https://www.postgresql.org/docs/current/auth-username-maps.html The ident map in ``pg_ident.conf`` on the pg_auto_failover monitor should then have the following entry, to allow ``postgres`` to connect as the ``autoctl_node`` user for ``pg_autoctl`` operations:: # MAPNAME SYSTEM-USERNAME PG-USERNAME # pg_autoctl runs as postgres and connects to the monitor autoctl_node user pgautofailover postgres autoctl_node To enable streaming replication, the ``pg_ident.conf`` file on each Postgres node should now allow the ``postgres`` user in the client certificate to connect as the ``pgautofailover_replicator`` database user:: # MAPNAME SYSTEM-USERNAME PG-USERNAME # pg_autoctl runs as postgres and connects to the monitor autoctl_node user pgautofailover postgres pgautofailover_replicator Given that user name map, you can then use the ``cert`` authentication method. 
As with the ``pg_ident.conf`` provisioning, it is best to now provision the HBA rules yourself, using the ``--skip-pg-hba`` option:: $ pg_autoctl create postgres --skip-pg-hba --ssl-ca-file ... The HBA rule will use the authentication method ``cert`` with a map option, and might then look like the following on the monitor:: # allow certificate based authentication to the monitor hostssl pg_auto_failover autoctl_node 10.0.0.0/8 cert map=pgautofailover Then your pg_auto_failover nodes on the 10.0.0.0 network are allowed to connect to the monitor with the user ``autoctl_node`` used by ``pg_autoctl``, assuming they have a valid and trusted client certificate. The HBA rule to use on the Postgres nodes to allow for Postgres streaming replication connections looks like the following:: # allow streaming replication for pg_auto_failover nodes hostssl replication pgautofailover_replicator 10.0.0.0/8 cert map=pgautofailover Because the Postgres server runs as the ``postgres`` system user, the connection to the primary node can be made with SSL enabled and will then use the client certificates installed in the ``postgres`` home directory in ``~/.postgresql/postgresql.{key,cert}`` locations. Postgres HBA provisioning ------------------------- While pg_auto_failover knows how to manage the Postgres HBA rules that are necessary for your stream replication needs and for its monitor protocol, it will not manage the Postgres HBA rules that are needed for your applications. If you have your own HBA provisioning solution, you can include the rules needed for pg_auto_failover and then use the ``--skip-pg-hba`` option to the ``pg_autoctl create`` commands. 
Enable SSL connections on an existing setup ------------------------------------------- Whether you upgrade pg_auto_failover from a previous version that did not have support for the SSL features, or when you started with ``--no-ssl`` and later change your mind, it is possible with pg_auto_failover to add SSL settings on system that has already been setup without explicit SSL support. In this section we detail how to upgrade to SSL settings. Installing Self-Signed certificates on-top of an already existing pg_auto_failover setup is done with one of the following pg_autoctl command variants, depending if you want self-signed certificates or fully verified ssl certificates:: $ pg_autoctl enable ssl --ssl-self-signed --ssl-mode required $ pg_autoctl enable ssl --ssl-ca-file root.crt \ --ssl-crl-file root.crl \ --server-cert server.crt \ --server-key server.key \ --ssl-mode verify-full The ``pg_autoctl enable ssl`` command edits the ``postgresql-auto-failover.conf`` Postgres configuration file to match the command line arguments given and enable SSL as instructed, and then updates the pg_autoctl configuration. The connection string to connect to the monitor is also automatically updated by the ``pg_autoctl enable ssl`` command. You can verify your new configuration with:: $ pg_autoctl config get pg_autoctl.monitor Note that an already running pg_autoctl deamon will try to reload its configuration after ``pg_autoctl enable ssl`` has finished. In some cases this is not possible to do without a restart. So be sure to check the logs from a running daemon to confirm that the reload succeeded. If it did not you may need to restart the daemon to ensure the new connection string is used. The HBA settings are not edited, irrespective of the ``--skip-pg-hba`` that has been used at creation time. That's because the ``host`` records match either SSL or non-SSL connection attempts in Postgres HBA file, so the pre-existing setup will continue to work. 
To enhance the SSL setup, you can manually edit the HBA files and change the existing lines from ``host`` to ``hostssl`` to dissallow unencrypted connections at the server side. In summary, to upgrade an existing pg_auto_failover setup to enable SSL: 1. run the ``pg_autoctl enable ssl`` command on your monitor and then all the Postgres nodes, 2. on the Postgres nodes, review your pg_autoctl logs to make sure that the reload operation has been effective, and review your Postgres settings to verify that you have the expected result, 3. review your HBA rules setup to change the pg_auto_failover rules from ``host`` to ``hostssl`` to disallow insecure connections. pg_auto_failover-1.6.3/docs/tikz/000077500000000000000000000000001414244367200167665ustar00rootroot00000000000000pg_auto_failover-1.6.3/docs/tikz/Makefile000066400000000000000000000007221414244367200204270ustar00rootroot00000000000000SRC = $(wildcard arch*.tex fsm.tex) PDF = $(SRC:.tex=.pdf) SVG = $(SRC:.tex=.svg) PNG = $(SRC:.tex=.png) all: pdf svg png ; pdf: $(SRC) $(PDF) ; svg: $(SRC) $(SVG) ; png: $(SRC) $(PNG) ; clean: latexmk -C rm -rf $(PDF) rm -rf $(SVG) rm -rf $(PNG) %.pdf: %.tex common.tex latexmk -lualatex --interaction=nonstopmode -shell-escape $< $@ latexmk -c %.png: %.pdf pdftocairo -singlefile -r 300 -transp -png $< %.svg: %.pdf pdftocairo -svg $< .PHONY: clean pg_auto_failover-1.6.3/docs/tikz/arch-multi-standby.svg000066400000000000000000003036171414244367200232300ustar00rootroot00000000000000 pg_auto_failover-1.6.3/docs/tikz/arch-multi-standby.tex000066400000000000000000000040641414244367200232230ustar00rootroot00000000000000% Fix for: https://tex.stackexchange.com/a/315027/43228 \RequirePackage{luatex85} \documentclass[border=10pt,17pt]{standalone} \usepackage{cfr-lm} \usepackage{pgf} \usepackage{tikz} \usetikzlibrary{arrows,shapes,snakes} \usetikzlibrary{shapes.multipart} \begin{document} %% sans-serif fonts, large by default, and bold too \sffamily \sbweight \bfseries \large 
\begin{tikzpicture}[>=stealth',auto,rounded corners] \input{common.tex} %% \draw [help lines] (-10,0) grid (10,20); \node (flegend) at (0,18) {\textt{number\_sync\_standby = 1}} ; \node (a) at (0,15) [primary] {\textbf{\normalsize Node A} \nodepart{second} \textbf{\Large Primary} \nodepart[text=stxt]{third} \texttt{replication quorum = true} \\ \texttt{candidate priority = 50} }; \node (b) at (-6,8) [standby] {\textbf{\normalsize Node B} \nodepart{second} \textbf{\Large Secondary} \nodepart{third} \texttt{replication quorum = true} \\ \texttt{candidate priority = 50} }; \node (c) at (6,8) [standby] {\textbf{\normalsize Node C} \nodepart{second} \textbf{\Large Secondary} \nodepart{third} \texttt{replication quorum = true} \\ \texttt{candidate priority = 50} }; \node (app) at (-8,18) [app] {\textbf{\Large Application}}; \node (m) at (8,18) [monitor] {\textbf{\Large Monitor}}; \path (app) edge [sql] node[near start] {SQL} (a) edge [sqlf] node[left,near start] {SQL (fallback)} (b) edge [sqlf,out=-90,in=145] (c) (a) edge [sr] node[left] {Streaming} (b.north east) edge [sr] node[right] {Replication} (c.north west) (m) edge [hc] (a) edge [hc,out=-90,in=35] (b) edge [hc] node[right,near start] {Health checks} (c); \end{tikzpicture} \end{document} pg_auto_failover-1.6.3/docs/tikz/arch-single-standby.svg000066400000000000000000001231671414244367200233570ustar00rootroot00000000000000 pg_auto_failover-1.6.3/docs/tikz/arch-single-standby.tex000066400000000000000000000025331414244367200233510ustar00rootroot00000000000000% Fix for: https://tex.stackexchange.com/a/315027/43228 \RequirePackage{luatex85} \documentclass[border=10pt,17pt]{standalone} \usepackage{cfr-lm} \usepackage{pgf} \usepackage{tikz} \usetikzlibrary{arrows,shapes,snakes,automata,backgrounds,petri} \begin{document} %% sans-serif fonts, large by default, and bold too \sffamily \sbweight \bfseries \large \begin{tikzpicture}[>=stealth',bend angle=45,auto,rounded corners] \input{common.tex} %% \draw [help lines] 
(-10,0) grid (10,20); \tikzstyle{primary}=[node,text=ptxt,fill=pbox,draw=white] \tikzstyle{standby}=[node,text=stxt,fill=sbox,draw=white] \node (p) at (0,18) [primary] {\textbf{Primary}}; \node (s) at (0,12) [standby] {\textbf{Secondary}}; \node (app) at (-6,15) [app] {\textbf{Application}}; \node (m) at (6,15) [monitor] {\textbf{Monitor}}; \path (app.north east) edge [sql,out=90,in=180] node {SQL} (p) (app.south east) edge [sqlf,out=-90,in=180] node[below] {SQL (fallback)} (s) (p) edge [sr] node[left] {Streaming} node [right] {Replication} (s) (m) edge [hc,out=90,in=0] node[above] {Health checks} (p) edge [hc,out=-90,in=0] node {Health checks} (s); \end{tikzpicture} \end{document} pg_auto_failover-1.6.3/docs/tikz/arch-three-standby-one-async.svg000066400000000000000000003321301414244367200250670ustar00rootroot00000000000000 pg_auto_failover-1.6.3/docs/tikz/arch-three-standby-one-async.tex000066400000000000000000000047501414244367200250740ustar00rootroot00000000000000% Fix for: https://tex.stackexchange.com/a/315027/43228 \RequirePackage{luatex85} \documentclass[border=10pt,17pt]{standalone} \usepackage{cfr-lm} \usepackage{pgf} \usepackage{tikz} \usetikzlibrary{arrows,shapes,snakes} \usetikzlibrary{shapes.multipart} \begin{document} %% sans-serif fonts, large by default, and bold too \sffamily \sbweight \bfseries \large \begin{tikzpicture}[>=stealth',auto,rounded corners] \input{common.tex} %% \draw [help lines] (-10,0) grid (10,20); \node (flegend) at (0,18) {\textbf{\textt{number\_sync\_standby = 1}}} ; \node (a) at (0,15) [primary] {\textbf{\normalsize Node A} \nodepart{second} \textbf{\Large Primary} \nodepart[text=stxt]{third} \texttt{replication quorum = true} \\ \texttt{candidate priority = 50} }; \node (b) at (-7,2) [standby] {\textbf{\normalsize Node B} \nodepart{second} \textbf{\Large Secondary} \nodepart[align=left]{third} \texttt{replication quorum = true} \\ \texttt{candidate priority = 50} }; \node (c) at (0,8) [standby,rectangle split part 
fill={sbox,sbox,async}] {\textbf{\normalsize Node C} \nodepart{second} \textbf{\Large Secondary} \nodepart[align=left]{third} \textbf{\texttt{replication quorum = false}} \\ \textbf{\texttt{candidate priority = 0}} }; \node (d) at (7,2) [standby] {\textbf{\normalsize Node D} \nodepart{second} \textbf{\Large Secondary} \nodepart[align=left]{third} \texttt{replication quorum = true} \\ \texttt{candidate priority = 50} }; \node (app) at (-8,18) [app] {\Large Application}; \node (m) at (8,18) [monitor] {\Large Monitor}; \path (app) edge [sql,out=0,in=180] node[below,near start] {SQL} (a) edge [sqlf,out=-90,in=135] node[left,near start] {SQL (fallback)} (b.north west) edge [sqlf,out=-90,in=180] (d.west) (m) edge [hc,out=-90,in=0] (a) edge [hc,out=-90,in=0] (b.east) edge [hc,out=-90,in=45] (c.north east) edge [hc,out=-90,in=45] node[right,near start] {Health checks} (d.north east) (a.south west) edge [sr] (b) (a) edge [sr] node[left,near start] {Streaming} node[right,near start] {Replication} (c) (a.south east) edge [sr] (d); \end{tikzpicture} \end{document} pg_auto_failover-1.6.3/docs/tikz/arch-three-standby.svg000066400000000000000000003277311414244367200232100ustar00rootroot00000000000000 pg_auto_failover-1.6.3/docs/tikz/arch-three-standby.tex000066400000000000000000000047601414244367200232030ustar00rootroot00000000000000% Fix for: https://tex.stackexchange.com/a/315027/43228 \RequirePackage{luatex85} \documentclass[border=10pt,17pt]{standalone} \usepackage{cfr-lm} \usepackage{pgf} \usepackage{tikz} \usetikzlibrary{arrows,shapes,snakes} \usetikzlibrary{shapes.multipart} \begin{document} %% sans-serif fonts, large by default, and bold too \sffamily \sbweight \bfseries \large \begin{tikzpicture}[>=stealth',auto,rounded corners] \input{common.tex} %% \draw [help lines] (-10,0) grid (10,20); \node (flegend) at (0,18) {\textbf{\textt{number\_sync\_standby = 2}}} ; \node (a) at (0,15) [primary] {\textbf{\normalsize Node A} \nodepart{second} \textbf{\Large Primary} 
\nodepart[text=stxt]{third} \texttt{replication quorum = true} \\ \texttt{candidate priority = 50} }; \node (b) at (-7,2) [standby] {\textbf{\normalsize Node B} \nodepart{second} \textbf{\Large Secondary} \nodepart[align=left]{third} \texttt{replication quorum = true} \\ \texttt{candidate priority = 50} }; \node (c) at (0,8) [standby] {\textbf{\normalsize Node C} \nodepart{second} \textbf{\Large Secondary} \nodepart[align=left]{third} \texttt{replication quorum = true} \\ \texttt{candidate priority = 50} }; \node (d) at (7,2) [standby] {\textbf{\normalsize Node D} \nodepart{second} \textbf{\Large Secondary} \nodepart[align=left]{third} \texttt{replication quorum = true} \\ \texttt{candidate priority = 50} }; \node (app) at (-8,18) [app] {\Large Application}; \node (m) at (8,18) [monitor] {\Large Monitor}; \path (app) edge [sql,out=0,in=180] node[below,near start] {SQL} (a) edge [sqlf,out=-90,in=135] node[left,near start] {SQL (fallback)} (b.north west) edge [sqlf,out=-90,in=135] (c.north west) edge [sqlf,out=-90,in=180] (d.west) (m) edge [hc,out=-90,in=0] (a) edge [hc,out=-90,in=0] (b.east) edge [hc,out=-90,in=45] (c.north east) edge [hc,out=-90,in=45] node[right,near start] {Health checks} (d.north east) (a.south west) edge [sr] (b) (a) edge [sr] node[left,near start] {Streaming} node[right,near start] {Replication} (c) (a.south east) edge [sr] (d); \end{tikzpicture} \end{document} pg_auto_failover-1.6.3/docs/tikz/common.tex000066400000000000000000000024121414244367200207770ustar00rootroot00000000000000\definecolor{pbox}{HTML}{0078D4} % MS blue \definecolor{ptxt}{HTML}{FFFFFF} % white \definecolor{sbox}{HTML}{50E6FF} % MS cyan \definecolor{stxt}{HTML}{2F2F2F} % off-black \definecolor{mbox}{HTML}{9BF00B} % MS light green \definecolor{mtxt}{HTML}{2F2F2F} % off-black \definecolor{apbox}{HTML}{0078D4} % MS blue \definecolor{aptxt}{HTML}{2F2F2F} % off-black \definecolor{async}{HTML}{EBEFF5} % very light grey \tikzstyle{app}=[circle,thick, 
text=aptxt,draw=apbox,fill=white, line width=0.25em,minimum size=4cm] \tikzstyle{node}=[rectangle,minimum height=2.5cm,minimum width=4cm] \tikzstyle{mpnode}=[rectangle split,rectangle split parts=3, align=center, rectangle split part align={center, center, left}, minimum height=2.5cm,minimum width=4cm,inner sep=0.5cm] \tikzstyle{primary}=[mpnode,text=ptxt,draw=white, rectangle split part fill={pbox,pbox,white}] \tikzstyle{standby}=[mpnode,text=stxt,draw=white, rectangle split part fill={sbox,sbox,white}] \tikzstyle{monitor}=[node,text=mtxt,draw=mbox,fill=mbox] \tikzstyle{sql}=[->,color=pbox,text=stxt,line width=0.15em] \tikzstyle{sqlf}=[->,color=sbox,text=stxt,line width=0.15em,loosely dashed] \tikzstyle{sr}=[>->,color=stxt,text=stxt,line width=0.15em] \tikzstyle{hc}=[<->,color=mbox,text=mtxt,line width=0.15em,dotted] pg_auto_failover-1.6.3/docs/tikz/fsm.svg000066400000000000000000002631351414244367200203060ustar00rootroot00000000000000 pg_auto_failover-1.6.3/docs/tikz/fsm.tex000066400000000000000000000070331414244367200203000ustar00rootroot00000000000000% Fix for: https://tex.stackexchange.com/a/315027/43228 \RequirePackage{luatex85} \documentclass[border=10pt,12pt]{standalone} \usepackage{cfr-lm} \usepackage{pgf} \usepackage{tikz} \usetikzlibrary {graphs,graphdrawing} \usegdlibrary {force,circular,layered,trees,routing} \begin{document} %% sans-serif fonts, large by default, and bold too \sffamily \sbweight \large \begin{tikzpicture}[rounded corners] %%\graph [layered layout, sibling distance=8mm, level distance=8mm] %% \graph [simple necklace layout, node distance=4mm, nodes={circle,draw}] %% \graph [simple necklace layout, node distance=4mm] %%\draw [help lines] (-10,0) grid (10,20); \graph [ tree layout, grow=right, sibling distance=1.5cm, level sep=1em, %% spring electrical layout, %% node distance=2.5cm, %% convergence tolerance=0.01, %% downsize ratio=0.25, %% spring constant=0.1, %% cooling factor=0.9, %% coarsen=true, %% iterations=2000, nodes={text 
height=.7em, text depth=.2em, draw=black!20, thick, fill=blue!20, font=\footnotesize}] { init [circle,draw,x=-10,y=10,color=white,fill=black!70] -> single [font=\normalsize, draw=black, circle]; init -> "wait standby" [fill=violet!30]; single -> "wait primary" [font=\normalsize, draw=black]; "wait primary" -> primary [font=\normalsize, draw=black, circle]; "wait standby" -> "catching up" [fill=violet!30]; "catching up" -> secondary [circle,draw=black,circle,fill=violet!30]; primary -> "wait primary" ; primary -> draining [fill=red!30]; draining -> demoted [fill=red!30]; primary -> demoted ; primary -> "demote timeout" [fill=red!30]; draining -> "demote timeout" ; "demote timeout" -> demoted ; primary -> "apply settings" [font=\normalsize, draw=black]; "apply settings" -> primary ; "apply settings" -> draining ; "apply settings" -> demoted ; "apply settings" -> "demote timeout" ; "apply settings" -> primary ; primary -> "prepare maintenance" [font=\normalsize, draw=black]; "prepare maintenance" -> maintenance [fill=black!20]; secondary -> "wait maintenance" [fill=violet!20]; "catching up" -> "wait maintenance" ; "wait maintenance" -> maintenance ; maintenance -> "catching up" ; secondary -> "catching up" ; secondary -> "prepare promotion" [fill=violet!30]; "catching up" -> "prepare promotion" ; "prepare promotion" -> "stop replication" [fill=violet!30]; "stop replication" -> "wait primary" ; "prepare promotion" -> "wait primary" ; "demote timeout" -> primary ; demoted -> "catching up" ; secondary -> "report LSN" [fill=violet!30]; "catching up" -> "report LSN" ; "report LSN" -> "prepare promotion" ; "report LSN" -> "fast forward" [fill=violet!30]; "fast forward" -> "prepare promotion" ; "report LSN" -> "join secondary" [fill=violet!30]; "join secondary" -> secondary ; %% primary -> single ; %% "wait primary" -> single ; %% demoted -> single ; %% "demote timeout" -> single ; %% draining -> single ; %% secondary -> single ; %% "catching up" -> single ; %% "prepare 
promotion" -> single ; %% "stop replication" -> single ; }; \end{tikzpicture} \end{document} pg_auto_failover-1.6.3/docs/tutorial.rst000066400000000000000000000431051414244367200204050ustar00rootroot00000000000000.. _tutorial: pg_auto_failover Tutorial ========================= In this guide we’ll create a primary and secondary Postgres node and set up pg_auto_failover to replicate data between them. We’ll simulate failure in the primary node and see how the system smoothly switches (fails over) to the secondary. For illustration, we'll run our databases on virtual machines in the Azure platform, but the techniques here are relevant to any cloud provider or on-premise network. We'll use four virtual machines: a primary database, a secondary database, a monitor, and an "application." The monitor watches the other nodes’ health, manages global state, and assigns nodes their roles. Create virtual network ---------------------- Our database machines need to talk to each other and to the monitor node, so let's create a virtual network. .. code-block:: bash az group create \ --name ha-demo \ --location eastus az network vnet create \ --resource-group ha-demo \ --name ha-demo-net \ --address-prefix 10.0.0.0/16 We need to open ports 5432 (Postgres) and 22 (SSH) between the machines, and also give ourselves access from our remote IP. We'll do this with a network security group and a subnet. .. 
code-block:: bash az network nsg create \ --resource-group ha-demo \ --name ha-demo-nsg az network nsg rule create \ --resource-group ha-demo \ --nsg-name ha-demo-nsg \ --name ha-demo-ssh-and-pg \ --access allow \ --protocol Tcp \ --direction Inbound \ --priority 100 \ --source-address-prefixes `curl ifconfig.me` 10.0.1.0/24 \ --source-port-range "*" \ --destination-address-prefix "*" \ --destination-port-ranges 22 5432 az network vnet subnet create \ --resource-group ha-demo \ --vnet-name ha-demo-net \ --name ha-demo-subnet \ --address-prefixes 10.0.1.0/24 \ --network-security-group ha-demo-nsg Finally add four virtual machines (ha-demo-a, ha-demo-b, ha-demo-monitor, and ha-demo-app). For speed we background the ``az vm create`` processes and run them in parallel: .. code-block:: bash # create VMs in parallel for node in monitor a b app do az vm create \ --resource-group ha-demo \ --name ha-demo-${node} \ --vnet-name ha-demo-net \ --subnet ha-demo-subnet \ --nsg ha-demo-nsg \ --public-ip-address ha-demo-${node}-ip \ --image debian \ --admin-username ha-admin \ --generate-ssh-keys & done wait To make it easier to SSH into these VMs in future steps, let's make a shell function to retrieve their IP addresses: .. code-block:: bash # run this in your local shell as well vm_ip () { az vm list-ip-addresses -g ha-demo -n ha-demo-$1 -o tsv \ --query '[] [] .virtualMachine.network.publicIpAddresses[0].ipAddress' } # for convenience with ssh for node in monitor a b app do ssh-keyscan -H `vm_ip $node` >> ~/.ssh/known_hosts done Let's review what we created so far. .. 
code-block:: bash az resource list --output table --query \ "[?resourceGroup=='ha-demo'].{ name: name, flavor: kind, resourceType: type, region: location }" This shows the following resources: :: Name ResourceType Region ------------------------------- ----------------------------------------------------- -------- ha-demo-a Microsoft.Compute/virtualMachines eastus ha-demo-app Microsoft.Compute/virtualMachines eastus ha-demo-b Microsoft.Compute/virtualMachines eastus ha-demo-monitor Microsoft.Compute/virtualMachines eastus ha-demo-appVMNic Microsoft.Network/networkInterfaces eastus ha-demo-aVMNic Microsoft.Network/networkInterfaces eastus ha-demo-bVMNic Microsoft.Network/networkInterfaces eastus ha-demo-monitorVMNic Microsoft.Network/networkInterfaces eastus ha-demo-nsg Microsoft.Network/networkSecurityGroups eastus ha-demo-a-ip Microsoft.Network/publicIPAddresses eastus ha-demo-app-ip Microsoft.Network/publicIPAddresses eastus ha-demo-b-ip Microsoft.Network/publicIPAddresses eastus ha-demo-monitor-ip Microsoft.Network/publicIPAddresses eastus ha-demo-net Microsoft.Network/virtualNetworks eastus Install the "pg_autoctl" executable ----------------------------------- This guide uses Debian Linux, but similar steps will work on other distributions. All that differs are the packages and paths. See :ref:`install`. The pg_auto_failover system is distributed as a single ``pg_autoctl`` binary with subcommands to initialize and manage a replicated PostgreSQL service. We’ll install the binary with the operating system package manager on all nodes. It will help us run and observe PostgreSQL. .. 
code-block:: bash for node in monitor a b app do az vm run-command invoke \ --resource-group ha-demo \ --name ha-demo-${node} \ --command-id RunShellScript \ --scripts \ "sudo touch /home/ha-admin/.hushlogin" \ "curl https://install.citusdata.com/community/deb.sh | sudo bash" \ "sudo DEBIAN_FRONTEND=noninteractive apt-get install -q -y postgresql-common" \ "echo 'create_main_cluster = false' | sudo tee -a /etc/postgresql-common/createcluster.conf" \ "sudo DEBIAN_FRONTEND=noninteractive apt-get install -q -y postgresql-11-auto-failover-1.4" \ "sudo usermod -a -G postgres ha-admin" & done wait Run a monitor ------------- The pg_auto_failover monitor is the first component to run. It periodically attempts to contact the other nodes and watches their health. It also maintains global state that “keepers” on each node consult to determine their own roles in the system. .. code-block:: bash # on the monitor virtual machine ssh -l ha-admin `vm_ip monitor` -- \ pg_autoctl create monitor \ --auth trust \ --ssl-self-signed \ --pgdata monitor \ --pgctl /usr/lib/postgresql/11/bin/pg_ctl This command initializes a PostgreSQL cluster at the location pointed by the ``--pgdata`` option. When ``--pgdata`` is omitted, ``pg_autoctl`` attempts to use the ``PGDATA`` environment variable. If a PostgreSQL instance had already existing in the destination directory, this command would have configured it to serve as a monitor. ``pg_auto_failover``, installs the ``pgautofailover`` Postgres extension, and grants access to a new ``autoctl_node`` user. In the Quick Start we use ``--auth trust`` to avoid complex security settings. The Postgres `trust authentication method`__ is not considered a reasonable choice for production environments. Consider either using the ``--skip-pg-hba`` option or ``--auth scram-sha-256`` and then setting up passwords yourself. __ https://www.postgresql.org/docs/current/auth-trust.html_ At this point the monitor is created. 
Now we'll install it as a service with systemd so that it will resume if the VM restarts. .. code-block:: bash ssh -T -l ha-admin `vm_ip monitor` << CMD pg_autoctl -q show systemd --pgdata ~ha-admin/monitor > pgautofailover.service sudo mv pgautofailover.service /etc/systemd/system sudo systemctl daemon-reload sudo systemctl enable pgautofailover sudo systemctl start pgautofailover CMD Bring up the nodes ------------------ We’ll create the primary database using the ``pg_autoctl create`` subcommand. .. code-block:: bash ssh -l ha-admin `vm_ip a` -- \ pg_autoctl create postgres \ --pgdata ha \ --auth trust \ --ssl-self-signed \ --username ha-admin \ --dbname appdb \ --hostname ha-demo-a.internal.cloudapp.net \ --pgctl /usr/lib/postgresql/11/bin/pg_ctl \ --monitor 'postgres://autoctl_node@ha-demo-monitor.internal.cloudapp.net/pg_auto_failover?sslmode=require' Notice the user and database name in the monitor connection string -- these are what monitor init created. We also give it the path to pg_ctl so that the keeper will use the correct version of pg_ctl in future even if other versions of postgres are installed on the system. In the example above, the keeper creates a primary database. It chooses to set up node A as primary because the monitor reports there are no other nodes in the system yet. This is one example of how the keeper is state-based: it makes observations and then adjusts its state, in this case from "init" to "single." Also add a setting to trust connections from our "application" VM: .. code-block:: bash ssh -T -l ha-admin `vm_ip a` << CMD echo 'hostssl "appdb" "ha-admin" ha-demo-app.internal.cloudapp.net trust' \ >> ~ha-admin/ha/pg_hba.conf CMD At this point the monitor and primary node are created and running. Next we need to run the keeper. It’s an independent process so that it can continue operating even if the PostgreSQL process goes terminates on the node. We'll install it as a service with systemd so that it will resume if the VM restarts. 
.. code-block:: bash ssh -T -l ha-admin `vm_ip a` << CMD pg_autoctl -q show systemd --pgdata ~ha-admin/ha > pgautofailover.service sudo mv pgautofailover.service /etc/systemd/system sudo systemctl daemon-reload sudo systemctl enable pgautofailover sudo systemctl start pgautofailover CMD Next connect to node B and do the same process. We'll do both steps at once: .. code-block:: bash ssh -l ha-admin `vm_ip b` -- \ pg_autoctl create postgres \ --pgdata ha \ --auth trust \ --ssl-self-signed \ --username ha-admin \ --dbname appdb \ --hostname ha-demo-b.internal.cloudapp.net \ --pgctl /usr/lib/postgresql/11/bin/pg_ctl \ --monitor 'postgres://autoctl_node@ha-demo-monitor.internal.cloudapp.net/pg_auto_failover?sslmode=require' ssh -T -l ha-admin `vm_ip b` << CMD pg_autoctl -q show systemd --pgdata ~ha-admin/ha > pgautofailover.service sudo mv pgautofailover.service /etc/systemd/system sudo systemctl daemon-reload sudo systemctl enable pgautofailover sudo systemctl start pgautofailover CMD It discovers from the monitor that a primary exists, and then switches its own state to be a hot standby and begins streaming WAL contents from the primary. Node communication ------------------ For convenience, pg_autoctl modifies each node's ``pg_hba.conf`` file to allow the nodes to connect to one another. For instance, pg_autoctl added the following lines to node A: .. code-block:: ini # automatically added to node A hostssl "appdb" "ha-admin" ha-demo-a.internal.cloudapp.net trust hostssl replication "pgautofailover_replicator" ha-demo-b.internal.cloudapp.net trust hostssl "appdb" "pgautofailover_replicator" ha-demo-b.internal.cloudapp.net trust For ``pg_hba.conf`` on the monitor node pg_autoctl inspects the local network and makes its best guess about the subnet to allow. In our case it guessed correctly: .. 
code-block:: ini # automatically added to the monitor hostssl "pg_auto_failover" "autoctl_node" 10.0.1.0/24 trust If worker nodes have more ad-hoc addresses and are not in the same subnet, it's better to disable pg_autoctl's automatic modification of pg_hba using the ``--skip-pg-hba`` command line option during creation. You will then need to edit the hba file by hand. Another reason for manual edits would be to use special authentication methods. Watch the replication --------------------- First let’s verify that the monitor knows about our nodes, and see what states it has assigned them: .. code-block:: bash ssh -l ha-admin `vm_ip monitor` pg_autoctl show state --pgdata monitor   Name | Node | Host:Port | LSN | Reachable | Current State | Assigned State -------+-------+--------------------------------------+-----------+-----------+---------------------+-------------------- node_1 | 1 | ha-demo-a.internal.cloudapp.net:5432 | 0/3000060 | yes | primary | primary node_2 | 2 | ha-demo-b.internal.cloudapp.net:5432 | 0/3000060 | yes | secondary | secondary This looks good. We can add data to the primary, and later see it appear in the secondary. We'll connect to the database from inside our "app" virtual machine, using a connection string obtained from the monitor. .. code-block:: bash ssh -l ha-admin `vm_ip monitor` pg_autoctl show uri --pgdata monitor Type | Name | Connection String -----------+---------+------------------------------- monitor | monitor | postgres://autoctl_node@ha-demo-monitor.internal.cloudapp.net:5432/pg_auto_failover?sslmode=require formation | default | postgres://ha-demo-b.internal.cloudapp.net:5432,ha-demo-a.internal.cloudapp.net:5432/appdb?target_session_attrs=read-write&sslmode=require Now we'll get the connection string and store it in a local environment variable: .. 
code-block:: bash APP_DB_URI=$( \ ssh -l ha-admin `vm_ip monitor` \ pg_autoctl show uri --formation default --pgdata monitor \ ) The connection string contains both our nodes, comma separated, and includes the url parameter ``?target_session_attrs=read-write`` telling psql that we want to connect to whichever of these servers supports reads *and* writes. That will be the primary server. .. code-block:: bash # connect to database via psql on the app vm and # create a table with a million rows ssh -l ha-admin -t `vm_ip app` -- \ psql "'$APP_DB_URI'" \ -c "'CREATE TABLE foo AS SELECT generate_series(1,1000000) bar;'" Cause a failover ---------------- Now that we've added data to node A, let's switch which is considered the primary and which the secondary. After the switch we'll connect again and query the data, this time from node B. .. code-block:: bash # initiate failover to node B ssh -l ha-admin -t `vm_ip monitor` \ pg_autoctl perform switchover --pgdata monitor Once node B is marked "primary" (or "wait_primary") we can connect and verify that the data is still present: .. code-block:: bash # connect to database via psql on the app vm ssh -l ha-admin -t `vm_ip app` -- \ psql "'$APP_DB_URI'" \ -c "'SELECT count(*) FROM foo;'" It shows .. code-block:: bash   count --------- 1000000 Cause a node failure -------------------- This plot is too boring, time to introduce a problem. We’ll turn off VM for node B (currently the primary after our previous failover) and watch node A get promoted. In one terminal let’s keep an eye on events: .. code-block:: bash ssh -t -l ha-admin `vm_ip monitor` -- \ watch -n 1 -d pg_autoctl show state --pgdata monitor In another terminal we’ll turn off the virtual server. .. code-block:: bash az vm stop \ --resource-group ha-demo \ --name ha-demo-b After a number of failed attempts to talk to node B, the monitor determines the node is unhealthy and puts it into the "demoted" state. The monitor promotes node A to be the new primary. .. 
code-block:: bash   Name | Node | Host:Port | LSN | Reachable | Current State | Assigned State -------+-------+--------------------------------------+-----------+-----------+---------------------+-------------------- node_1 | 1 | ha-demo-a.internal.cloudapp.net:5432 | 0/6D4E068 | yes | wait_primary | wait_primary node_2 | 2 | ha-demo-b.internal.cloudapp.net:5432 | 0/6D4E000 | yes | demoted | catchingup Node A cannot be considered in full "primary" state since there is no secondary present, but it can still serve client requests. It is marked as "wait_primary" until a secondary appears, to indicate that it's running without a backup. Let's add some data while B is offline. .. code-block:: bash # notice how $APP_DB_URI continues to work no matter which node # is serving as primary ssh -l ha-admin -t `vm_ip app` -- \ psql "'$APP_DB_URI'" \ -c "'INSERT INTO foo SELECT generate_series(1000001, 2000000);'" Resurrect node B ---------------- Run this command to bring node B back online: .. code-block:: bash az vm start \ --resource-group ha-demo \ --name ha-demo-b Now the next time the keeper retries its health check, it brings the node back. Node B goes through the state "catchingup" while it updates its data to match A. Once that's done, B becomes a secondary, and A is now a full primary again. .. code-block:: bash   Name | Node | Host:Port | LSN | Reachable | Current State | Assigned State -------+-------+--------------------------------------+------------+-----------+---------------------+-------------------- node_1 | 1 | ha-demo-a.internal.cloudapp.net:5432 | 0/12000738 | yes | primary | primary node_2 | 2 | ha-demo-b.internal.cloudapp.net:5432 | 0/12000738 | yes | secondary | secondary What's more, if we connect directly to the database again, all two million rows are still present. .. code-block:: bash ssh -l ha-admin -t `vm_ip app` -- \ psql "'$APP_DB_URI'" \ -c "'SELECT count(*) FROM foo;'" It shows .. 
code-block:: bash   count --------- 2000000 pg_auto_failover-1.6.3/pyproject.toml000066400000000000000000000000701414244367200177660ustar00rootroot00000000000000[tool.black] line-length = 80 target-version = ['py36'] pg_auto_failover-1.6.3/src/000077500000000000000000000000001414244367200156445ustar00rootroot00000000000000pg_auto_failover-1.6.3/src/Makefile000066400000000000000000000007761414244367200173160ustar00rootroot00000000000000# Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the PostgreSQL License. all: monitor bin ; clean: clean-monitor clean-bin ; install: install-monitor install-bin ; monitor: $(MAKE) -C monitor all install-monitor: $(MAKE) -C monitor install clean-monitor: $(MAKE) -C monitor clean bin: $(MAKE) -C bin all install-bin: $(MAKE) -C bin install clean-bin: $(MAKE) -C bin clean .PHONY: all clean bin test monitor .PHONY: clean-monitor clean-bin install-monitor install-bin pg_auto_failover-1.6.3/src/bin/000077500000000000000000000000001414244367200164145ustar00rootroot00000000000000pg_auto_failover-1.6.3/src/bin/Makefile000066400000000000000000000004511414244367200200540ustar00rootroot00000000000000# Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the PostgreSQL License. all: pg_autoctl ; pg_autoctl: $(MAKE) -C pg_autoctl pg_autoctl clean: $(MAKE) -C pg_autoctl clean install: $(pg_autoctl) $(MAKE) -C pg_autoctl install .PHONY: all pg_autoctl install clean pg_auto_failover-1.6.3/src/bin/lib/000077500000000000000000000000001414244367200171625ustar00rootroot00000000000000pg_auto_failover-1.6.3/src/bin/lib/README.md000066400000000000000000000015601414244367200204430ustar00rootroot00000000000000# Vendored-in librairies ## log.c A very simple lib for handling logs in C is available at https://github.com/rxi/log.c It says that log.c and log.h should be dropped into an existing project and compiled along with it. So this directory contains a _vendored-in_ copy of the log.c repository. 
## SubCommands.c The single-header library is used to implement parsing "modern" command lines. ## Configuration file parsing We utilize the "ini.h" ini-file reader from https://github.com/mattiasgustavsson/libs ## JSON The parson librairy at https://github.com/kgabis/parson is a single C file and MIT licenced. It allows parsing from and serializing to JSON. ## pg We vendor-in some code from the Postgres project at https://git.postgresql.org/gitweb/?p=postgresql.git;a=summary. This code is licenced under The PostgreSQL Licence, a derivative of the BSD licence. pg_auto_failover-1.6.3/src/bin/lib/libs/000077500000000000000000000000001414244367200201135ustar00rootroot00000000000000pg_auto_failover-1.6.3/src/bin/lib/libs/docs/000077500000000000000000000000001414244367200210435ustar00rootroot00000000000000pg_auto_failover-1.6.3/src/bin/lib/libs/docs/ini.md000066400000000000000000000344211414244367200221500ustar00rootroot00000000000000ini.h ===== Library: [ini.h](../ini.h) Examples ======== Loading an ini file and retrieving values ----------------------------------------- ```cpp #define INI_IMPLEMENTATION #include "ini.h" #include #include int main() { FILE* fp = fopen( "test.ini", "r" ); fseek( fp, 0, SEEK_END ); int size = ftell( fp ); fseek( fp, 0, SEEK_SET ); char* data = (char*) malloc( size + 1 ); fread( data, 1, size, fp ); data[ size ] = '\0'; fclose( fp ); ini_t* ini = ini_load( data ); free( data ); int second_index = ini_find_property( ini, INI_GLOBAL_SECTION, "SecondSetting" ); char const* second = ini_property_value( ini, INI_GLOBAL_SECTION, second_index ); printf( "%s=%s\n", "SecondSetting", second ); int section = ini_find_section( ini, "MySection" ); int third_index = ini_find_property( ini, section, "ThirdSetting" ); char const* third = ini_property_value( ini, section, third_index ); printf( "%s=%s\n", "ThirdSetting", third ); ini_destroy( ini ); return 0; } ``` Creating a new ini file ----------------------- ```cpp #define INI_IMPLEMENTATION #include 
"ini.h" #include #include int main() { ini_t* ini = ini_create(); ini_property_add( ini, INI_GLOBAL_SECTION, "FirstSetting", "Test" ); ini_property_add( ini, INI_GLOBAL_SECTION, "SecondSetting", "2" ); int section = ini_section_add( ini, "MySection" ); ini_property_add( ini, section, "ThirdSetting", "Three" ); int size = ini_save( ini, NULL, 0 ); // Find the size needed char* data = (char*) malloc( size ); size = ini_save( ini, data, size ); // Actually save the file ini_destroy( ini ); FILE* fp = fopen( "test.ini", "w" ); fwrite( data, 1, size, fp ); fclose( fp ); free( data ); return 0; } ``` API Documentation ================= ini.h is a small library for reading classic .ini files. It is a single-header library, and does not need any .lib files or other binaries, or any build scripts. To use it, you just include ini.h to get the API declarations. To get the definitions, you must include ini.h from *one* single C or C++ file, and #define the symbol `INI_IMPLEMENTATION` before you do. Customization ------------- There are a few different things in ini.h which are configurable by #defines. The customizations only affect the implementation, so will only need to be defined in the file where you have the #define INI_IMPLEMENTATION. Note that if all customizations are utilized, ini.h will include no external files whatsoever, which might be useful if you need full control over what code is being built. ### Custom memory allocators To store the internal data structures, ini.h needs to do dynamic allocation by calling `malloc`. Programs might want to keep track of allocations done, or use custom defined pools to allocate memory from. ini.h allows for specifying custom memory allocation functions for `malloc` and `free`. 
This is done with the following code: #define INI_IMPLEMENTATION #define INI_MALLOC( ctx, size ) ( my_custom_malloc( ctx, size ) ) #define INI_FREE( ctx, ptr ) ( my_custom_free( ctx, ptr ) ) #include "ini.h" where `my_custom_malloc` and `my_custom_free` are your own memory allocation/deallocation functions. The `ctx` parameter is an optional parameter of type `void*`. When `ini_create` or `ini_load` is called, you can pass in a `memctx` parameter, which can be a pointer to anything you like, and which will be passed through as the `ctx` parameter to every `INI_MALLOC`/`INI_FREE` call. For example, if you are doing memory tracking, you can pass a pointer to your tracking data as `memctx`, and in your custom allocation/deallocation function, you can cast the `ctx` param back to the right type, and access the tracking data. If no custom allocator is defined, ini.h will default to `malloc` and `free` from the C runtime library. ### Custom C runtime function The library makes use of three additional functions from the C runtime library, and for full flexibility, it allows you to substitute them for your own. Here's an example: #define INI_IMPLEMENTATION #define INI_MEMCPY( dst, src, cnt ) ( my_memcpy_func( dst, src, cnt ) ) #define INI_STRLEN( s ) ( my_strlen_func( s ) ) #define INI_STRICMP( s1, s2 ) ( my_stricmp_func( s1, s2 ) ) #include "ini.h" If no custom function is defined, ini.h will default to the C runtime library equivalent. ini_create ---------- ini_t* ini_create( void* memctx ) Instantiates a new, empty ini structure, which can be manipulated with other API calls, to fill it with data. To save it out to an ini-file string, use `ini_save`. When no longer needed, it can be destroyed by calling `ini_destroy`. `memctx` is a pointer to user defined data which will be passed through to the custom INI_MALLOC/INI_FREE calls. It can be NULL if no user defined data is needed. 
ini_load -------- ini_t* ini_load( char const* data, void* memctx ) Parse the zero-terminated string `data` containing an ini-file, and create a new ini_t instance containing the data. The instance can be manipulated with other API calls to enumerate sections/properties and retrieve values. When no longer needed, it can be destroyed by calling `ini_destroy`. `memctx` is a pointer to user defined data which will be passed through to the custom INI_MALLOC/INI_FREE calls. It can be NULL if no user defined data is needed. ini_save -------- int ini_save( ini_t const* ini, char* data, int size ) Saves an ini structure as a zero-terminated ini-file string, into the specified buffer. Returns the number of bytes written, including the zero terminator. If `data` is NULL, nothing is written, but `ini_save` still returns the number of bytes it would have written. If the size of `data`, as specified in the `size` parameter, is smaller than that required, only part of the ini-file string will be written. `ini_save` still returns the number of bytes it would have written had the buffer been large enough. ini_destroy ----------- void ini_destroy( ini_t* ini ) Destroy an `ini_t` instance created by calling `ini_load` or `ini_create`, releasing the memory allocated by it. No further API calls are valid on an `ini_t` instance after calling `ini_destroy` on it. ini_section_count ----------------- int ini_section_count( ini_t const* ini ) Returns the number of sections in an ini file. There's at least one section in an ini file (the global section), but there can be many more, each specified in the file by the section name wrapped in square brackets [ ]. ini_section_name ---------------- char const* ini_section_name( ini_t const* ini, int section ) Returns the name of the section with the specified index. `section` must be non-negative and less than the value returned by `ini_section_count`, or `ini_section_name` will return NULL. 
The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. ini_property_count ------------------ int ini_property_count( ini_t const* ini, int section ) Returns the number of properties belonging to the section with the specified index. `section` must be non-negative and less than the value returned by `ini_section_count`, or `ini_section_name` will return 0. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. Properties are declared in the ini-file on he format `name=value`. ini_property_name ----------------- char const* ini_property_name( ini_t const* ini, int section, int property ) Returns the name of the property with the specified index `property` in the section with the specified index `section`. `section` must be non-negative and less than the value returned by `ini_section_count`, and `property` must be non-negative and less than the value returned by `ini_property_count`, or `ini_property_name` will return NULL. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. ini_property_value ------------------ char const* ini_property_value( ini_t const* ini, int section, int property ) Returns the value of the property with the specified index `property` in the section with the specified index `section`. `section` must be non-negative and less than the value returned by `ini_section_count`, and `property` must be non-negative and less than the value returned by `ini_property_count`, or `ini_property_value` will return NULL. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. ini_find_section ---------------- int ini_find_section( ini_t const* ini, char const* name, int name_length ) Finds the section with the specified name, and returns its index. `name_length` specifies the number of characters in `name`, which does not have to be zero-terminated. 
If `name_length` is zero, the length is determined automatically, but in this case `name` has to be zero-terminated. If no section with the specified name could be found, the value `INI_NOT_FOUND` is returned. ini_find_property ----------------- int ini_find_property( ini_t const* ini, int section, char const* name, int name_length ) Finds the property with the specified name, within the section with the specified index, and returns the index of the property. `name_length` specifies the number of characters in `name`, which does not have to be zero-terminated. If `name_length` is zero, the length is determined automatically, but in this case `name` has to be zero-terminated. If no property with the specified name could be found within the specified section, the value `INI_NOT_FOUND` is returned. `section` must be non-negative and less than the value returned by `ini_section_count`, or `ini_find_property` will return `INI_NOT_FOUND`. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. ini_section_add --------------- int ini_section_add( ini_t* ini, char const* name, int length ) Adds a section with the specified name, and returns the index it was added at. There is no check done to see if a section with the specified name already exists - multiple sections of the same name are allowed. `length` specifies the number of characters in `name`, which does not have to be zero-terminated. If `length` is zero, the length is determined automatically, but in this case `name` has to be zero-terminated. ini_property_add ---------------- void ini_property_add( ini_t* ini, int section, char const* name, int name_length, char const* value, int value_length ) Adds a property with the specified name and value to the specified section, and returns the index it was added at. There is no check done to see if a property with the specified name already exists - multiple properties of the same name are allowed. 
`name_length` and `value_length` specifies the number of characters in `name` and `value`, which does not have to be zero-terminated. If `name_length` or `value_length` is zero, the length is determined automatically, but in this case `name`/`value` has to be zero-terminated. `section` must be non-negative and less than the value returned by `ini_section_count`, or the property will not be added. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. ini_section_remove ------------------ void ini_section_remove( ini_t* ini, int section ) Removes the section with the specified index, and all properties within it. `section` must be non-negative and less than the value returned by `ini_section_count`. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. Note that removing a section will shuffle section indices, so that section indices you may have stored will no longer indicate the same section as it did before the remove. Use the find functions to update your indices. ini_property_remove ------------------- void ini_property_remove( ini_t* ini, int section, int property ) Removes the property with the specified index from the specified section. `section` must be non-negative and less than the value returned by `ini_section_count`, and `property` must be non-negative and less than the value returned by `ini_property_count`. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. Note that removing a property will shuffle property indices within the specified section, so that property indices you may have stored will no longer indicate the same property as it did before the remove. Use the find functions to update your indices. ini_section_name_set -------------------- void ini_section_name_set( ini_t* ini, int section, char const* name, int length ) Change the name of the section with the specified index. 
`section` must be non-negative and less than the value returned by `ini_section_count`. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. `length` specifies the number of characters in `name`, which does not have to be zero-terminated. If `length` is zero, the length is determined automatically, but in this case `name` has to be zero-terminated. ini_property_name_set --------------------- void ini_property_name_set( ini_t* ini, int section, int property, char const* name, int length ) Change the name of the property with the specified index in the specified section. `section` must be non-negative and less than the value returned by `ini_section_count`, and `property` must be non-negative and less than the value returned by `ini_property_count`. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. `length` specifies the number of characters in `name`, which does not have to be zero-terminated. If `length` is zero, the length is determined automatically, but in this case `name` has to be zero-terminated. ini_property_value_set ---------------------- void ini_property_value_set( ini_t* ini, int section, int property, char const* value, int length ) Change the value of the property with the specified index in the specified section. `section` must be non-negative and less than the value returned by `ini_section_count`, and `property` must be non-negative and less than the value returned by `ini_property_count`. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. `length` specifies the number of characters in `value`, which does not have to be zero-terminated. If `length` is zero, the length is determined automatically, but in this case `value` has to be zero-terminated. 
pg_auto_failover-1.6.3/src/bin/lib/libs/ini.h000066400000000000000000001140551414244367200210510ustar00rootroot00000000000000/* ------------------------------------------------------------------------------ Licensing information can be found at the end of the file. ------------------------------------------------------------------------------ ini.h - v1.2 - Simple ini-file reader for C/C++. Do this: #define INI_IMPLEMENTATION before you include this file in *one* C/C++ file to create the implementation. */ #ifndef ini_h #define ini_h #define INI_GLOBAL_SECTION ( 0 ) #define INI_NOT_FOUND ( -1 ) typedef struct ini_t ini_t; ini_t* ini_create( void* memctx ); ini_t* ini_load( char const* data, void* memctx ); int ini_save( ini_t const* ini, char* data, int size ); void ini_destroy( ini_t* ini ); int ini_section_count( ini_t const* ini ); char const* ini_section_name( ini_t const* ini, int section ); int ini_property_count( ini_t const* ini, int section ); char const* ini_property_name( ini_t const* ini, int section, int property ); char const* ini_property_value( ini_t const* ini, int section, int property ); int ini_find_section( ini_t const* ini, char const* name, int name_length ); int ini_find_property( ini_t const* ini, int section, char const* name, int name_length ); int ini_section_add( ini_t* ini, char const* name, int length ); void ini_property_add( ini_t* ini, int section, char const* name, int name_length, char const* value, int value_length ); void ini_section_remove( ini_t* ini, int section ); void ini_property_remove( ini_t* ini, int section, int property ); void ini_section_name_set( ini_t* ini, int section, char const* name, int length ); void ini_property_name_set( ini_t* ini, int section, int property, char const* name, int length ); void ini_property_value_set( ini_t* ini, int section, int property, char const* value, int length ); #endif /* ini_h */ /** Examples ======== Loading an ini file and retrieving values 
-----------------------------------------

    #define INI_IMPLEMENTATION
    #include "ini.h"

    #include <stdio.h>
    #include <stdlib.h>

    int main()
        {
        FILE* fp = fopen( "test.ini", "r" );
        fseek( fp, 0, SEEK_END );
        int size = ftell( fp );
        fseek( fp, 0, SEEK_SET );
        char* data = (char*) malloc( size + 1 );
        fread( data, 1, size, fp );
        data[ size ] = '\0';
        fclose( fp );

        ini_t* ini = ini_load( data, NULL );
        free( data );
        int second_index = ini_find_property( ini, INI_GLOBAL_SECTION, "SecondSetting", 0 );
        char const* second = ini_property_value( ini, INI_GLOBAL_SECTION, second_index );
        printf( "%s=%s\n", "SecondSetting", second );
        int section = ini_find_section( ini, "MySection", 0 );
        int third_index = ini_find_property( ini, section, "ThirdSetting", 0 );
        char const* third = ini_property_value( ini, section, third_index );
        printf( "%s=%s\n", "ThirdSetting", third );
        ini_destroy( ini );

        return 0;
        }

Creating a new ini file
-----------------------

    #define INI_IMPLEMENTATION
    #include "ini.h"

    #include <stdio.h>
    #include <stdlib.h>

    int main()
        {
        ini_t* ini = ini_create( NULL );
        ini_property_add( ini, INI_GLOBAL_SECTION, "FirstSetting", 0, "Test", 0 );
        ini_property_add( ini, INI_GLOBAL_SECTION, "SecondSetting", 0, "2", 0 );
        int section = ini_section_add( ini, "MySection", 0 );
        ini_property_add( ini, section, "ThirdSetting", 0, "Three", 0 );

        int size = ini_save( ini, NULL, 0 ); // Find the size needed
        char* data = (char*) malloc( size );
        size = ini_save( ini, data, size ); // Actually save the file
        ini_destroy( ini );

        FILE* fp = fopen( "test.ini", "w" );
        fwrite( data, 1, size, fp );
        fclose( fp );
        free( data );

        return 0;
        }

API Documentation
=================

ini.h is a small library for reading classic .ini files. It is a single-header
library, and does not need any .lib files or other binaries, or any build
scripts. To use it, you just include ini.h to get the API declarations. To get
the definitions, you must include ini.h from *one* single C or C++ file, and
#define the symbol `INI_IMPLEMENTATION` before you do.
Customization ------------- There are a few different things in ini.h which are configurable by #defines. The customizations only affect the implementation, so will only need to be defined in the file where you have the #define INI_IMPLEMENTATION. Note that if all customizations are utilized, ini.h will include no external files whatsoever, which might be useful if you need full control over what code is being built. ### Custom memory allocators To store the internal data structures, ini.h needs to do dynamic allocation by calling `malloc`. Programs might want to keep track of allocations done, or use custom defined pools to allocate memory from. ini.h allows for specifying custom memory allocation functions for `malloc` and `free`. This is done with the following code: #define INI_IMPLEMENTATION #define INI_MALLOC( ctx, size ) ( my_custom_malloc( ctx, size ) ) #define INI_FREE( ctx, ptr ) ( my_custom_free( ctx, ptr ) ) #include "ini.h" where `my_custom_malloc` and `my_custom_free` are your own memory allocation/deallocation functions. The `ctx` parameter is an optional parameter of type `void*`. When `ini_create` or `ini_load` is called, you can pass in a `memctx` parameter, which can be a pointer to anything you like, and which will be passed through as the `ctx` parameter to every `INI_MALLOC`/`INI_FREE` call. For example, if you are doing memory tracking, you can pass a pointer to your tracking data as `memctx`, and in your custom allocation/deallocation function, you can cast the `ctx` param back to the right type, and access the tracking data. If no custom allocator is defined, ini.h will default to `malloc` and `free` from the C runtime library. ### Custom C runtime function The library makes use of three additional functions from the C runtime library, and for full flexibility, it allows you to substitute them for your own. 
Here's an example: #define INI_IMPLEMENTATION #define INI_MEMCPY( dst, src, cnt ) ( my_memcpy_func( dst, src, cnt ) ) #define INI_STRLEN( s ) ( my_strlen_func( s ) ) #define INI_STRNICMP( s1, s2, cnt ) ( my_strnicmp_func( s1, s2, cnt ) ) #include "ini.h" If no custom function is defined, ini.h will default to the C runtime library equivalent. ini_create ---------- ini_t* ini_create( void* memctx ) Instantiates a new, empty ini structure, which can be manipulated with other API calls, to fill it with data. To save it out to an ini-file string, use `ini_save`. When no longer needed, it can be destroyed by calling `ini_destroy`. `memctx` is a pointer to user defined data which will be passed through to the custom INI_MALLOC/INI_FREE calls. It can be NULL if no user defined data is needed. ini_load -------- ini_t* ini_load( char const* data, void* memctx ) Parse the zero-terminated string `data` containing an ini-file, and create a new ini_t instance containing the data. The instance can be manipulated with other API calls to enumerate sections/properties and retrieve values. When no longer needed, it can be destroyed by calling `ini_destroy`. `memctx` is a pointer to user defined data which will be passed through to the custom INI_MALLOC/INI_FREE calls. It can be NULL if no user defined data is needed. ini_save -------- int ini_save( ini_t const* ini, char* data, int size ) Saves an ini structure as a zero-terminated ini-file string, into the specified buffer. Returns the number of bytes written, including the zero terminator. If `data` is NULL, nothing is written, but `ini_save` still returns the number of bytes it would have written. If the size of `data`, as specified in the `size` parameter, is smaller than that required, only part of the ini-file string will be written. `ini_save` still returns the number of bytes it would have written had the buffer been large enough. 
ini_destroy
-----------

    void ini_destroy( ini_t* ini )

Destroy an `ini_t` instance created by calling `ini_load` or `ini_create`,
releasing the memory allocated by it. No further API calls are valid on an
`ini_t` instance after calling `ini_destroy` on it.


ini_section_count
-----------------

    int ini_section_count( ini_t const* ini )

Returns the number of sections in an ini file. There's at least one section in
an ini file (the global section), but there can be many more, each specified
in the file by the section name wrapped in square brackets [ ].


ini_section_name
----------------

    char const* ini_section_name( ini_t const* ini, int section )

Returns the name of the section with the specified index. `section` must be
non-negative and less than the value returned by `ini_section_count`, or
`ini_section_name` will return NULL. The defined constant
`INI_GLOBAL_SECTION` can be used to indicate the global section.


ini_property_count
------------------

    int ini_property_count( ini_t const* ini, int section )

Returns the number of properties belonging to the section with the specified
index. `section` must be non-negative and less than the value returned by
`ini_section_count`, or `ini_property_count` will return 0. The defined
constant `INI_GLOBAL_SECTION` can be used to indicate the global section.
Properties are declared in the ini-file in the format `name=value`.


ini_property_name
-----------------

    char const* ini_property_name( ini_t const* ini, int section, int property )

Returns the name of the property with the specified index `property` in the
section with the specified index `section`. `section` must be non-negative and
less than the value returned by `ini_section_count`, and `property` must be
non-negative and less than the value returned by `ini_property_count`, or
`ini_property_name` will return NULL. The defined constant
`INI_GLOBAL_SECTION` can be used to indicate the global section.
ini_property_value ------------------ char const* ini_property_value( ini_t const* ini, int section, int property ) Returns the value of the property with the specified index `property` in the section with the specified index `section`. `section` must be non-negative and less than the value returned by `ini_section_count`, and `property` must be non-negative and less than the value returned by `ini_property_count`, or `ini_property_value` will return NULL. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. ini_find_section ---------------- int ini_find_section( ini_t const* ini, char const* name, int name_length ) Finds the section with the specified name, and returns its index. `name_length` specifies the number of characters in `name`, which does not have to be zero-terminated. If `name_length` is zero, the length is determined automatically, but in this case `name` has to be zero-terminated. If no section with the specified name could be found, the value `INI_NOT_FOUND` is returned. ini_find_property ----------------- int ini_find_property( ini_t const* ini, int section, char const* name, int name_length ) Finds the property with the specified name, within the section with the specified index, and returns the index of the property. `name_length` specifies the number of characters in `name`, which does not have to be zero-terminated. If `name_length` is zero, the length is determined automatically, but in this case `name` has to be zero-terminated. If no property with the specified name could be found within the specified section, the value `INI_NOT_FOUND` is returned. `section` must be non-negative and less than the value returned by `ini_section_count`, or `ini_find_property` will return `INI_NOT_FOUND`. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. 
ini_section_add --------------- int ini_section_add( ini_t* ini, char const* name, int length ) Adds a section with the specified name, and returns the index it was added at. There is no check done to see if a section with the specified name already exists - multiple sections of the same name are allowed. `length` specifies the number of characters in `name`, which does not have to be zero-terminated. If `length` is zero, the length is determined automatically, but in this case `name` has to be zero-terminated. ini_property_add ---------------- void ini_property_add( ini_t* ini, int section, char const* name, int name_length, char const* value, int value_length ) Adds a property with the specified name and value to the specified section, and returns the index it was added at. There is no check done to see if a property with the specified name already exists - multiple properties of the same name are allowed. `name_length` and `value_length` specifies the number of characters in `name` and `value`, which does not have to be zero-terminated. If `name_length` or `value_length` is zero, the length is determined automatically, but in this case `name`/`value` has to be zero-terminated. `section` must be non-negative and less than the value returned by `ini_section_count`, or the property will not be added. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. ini_section_remove ------------------ void ini_section_remove( ini_t* ini, int section ) Removes the section with the specified index, and all properties within it. `section` must be non-negative and less than the value returned by `ini_section_count`. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. Note that removing a section will shuffle section indices, so that section indices you may have stored will no longer indicate the same section as it did before the remove. Use the find functions to update your indices. 
ini_property_remove ------------------- void ini_property_remove( ini_t* ini, int section, int property ) Removes the property with the specified index from the specified section. `section` must be non-negative and less than the value returned by `ini_section_count`, and `property` must be non-negative and less than the value returned by `ini_property_count`. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. Note that removing a property will shuffle property indices within the specified section, so that property indices you may have stored will no longer indicate the same property as it did before the remove. Use the find functions to update your indices. ini_section_name_set -------------------- void ini_section_name_set( ini_t* ini, int section, char const* name, int length ) Change the name of the section with the specified index. `section` must be non-negative and less than the value returned by `ini_section_count`. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. `length` specifies the number of characters in `name`, which does not have to be zero-terminated. If `length` is zero, the length is determined automatically, but in this case `name` has to be zero-terminated. ini_property_name_set --------------------- void ini_property_name_set( ini_t* ini, int section, int property, char const* name, int length ) Change the name of the property with the specified index in the specified section. `section` must be non-negative and less than the value returned by `ini_section_count`, and `property` must be non-negative and less than the value returned by `ini_property_count`. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. `length` specifies the number of characters in `name`, which does not have to be zero-terminated. If `length` is zero, the length is determined automatically, but in this case `name` has to be zero-terminated. 
ini_property_value_set ---------------------- void ini_property_value_set( ini_t* ini, int section, int property, char const* value, int length ) Change the value of the property with the specified index in the specified section. `section` must be non-negative and less than the value returned by `ini_section_count`, and `property` must be non-negative and less than the value returned by `ini_property_count`. The defined constant `INI_GLOBAL_SECTION` can be used to indicate the global section. `length` specifies the number of characters in `value`, which does not have to be zero-terminated. If `length` is zero, the length is determined automatically, but in this case `value` has to be zero-terminated. **/ /* ---------------------- IMPLEMENTATION ---------------------- */ #ifdef INI_IMPLEMENTATION #undef INI_IMPLEMENTATION #define INITIAL_CAPACITY ( 256 ) #undef _CRT_NONSTDC_NO_DEPRECATE #define _CRT_NONSTDC_NO_DEPRECATE #undef _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_WARNINGS #include #ifndef INI_MALLOC #include #define INI_MALLOC( ctx, size ) ( malloc( size ) ) #define INI_FREE( ctx, ptr ) ( free( ptr ) ) #endif #ifndef INI_MEMCPY #include #define INI_MEMCPY( dst, src, cnt ) ( memcpy( dst, src, cnt ) ) #endif #ifndef INI_STRLEN #include #define INI_STRLEN( s ) ( strlen( s ) ) #endif #ifndef INI_STRNICMP #ifdef _WIN32 #include #define INI_STRNICMP( s1, s2, cnt ) ( strnicmp( s1, s2, cnt ) ) #else #include #define INI_STRNICMP( s1, s2, cnt ) ( strncasecmp( s1, s2, cnt ) ) #endif #endif struct ini_internal_section_t { char name[ 32 ]; char* name_large; }; struct ini_internal_property_t { int section; char name[ 32 ]; char* name_large; char value[ 64 ]; char* value_large; }; struct ini_t { struct ini_internal_section_t* sections; int section_capacity; int section_count; struct ini_internal_property_t* properties; int property_capacity; int property_count; void* memctx; }; static int ini_internal_property_index( ini_t const* ini, int section, int property ) { int 
i; int p; if( ini && section >= 0 && section < ini->section_count ) { p = 0; for( i = 0; i < ini->property_count; ++i ) { if( ini->properties[ i ].section == section ) { if( p == property ) return i; ++p; } } } return INI_NOT_FOUND; } ini_t* ini_create( void* memctx ) { ini_t* ini; ini = (ini_t*) INI_MALLOC( memctx, sizeof( ini_t ) ); ini->memctx = memctx; ini->sections = (struct ini_internal_section_t*) INI_MALLOC( ini->memctx, INITIAL_CAPACITY * sizeof( ini->sections[ 0 ] ) ); ini->section_capacity = INITIAL_CAPACITY; ini->section_count = 1; /* global section */ ini->sections[ 0 ].name[ 0 ] = '\0'; ini->sections[ 0 ].name_large = 0; ini->properties = (struct ini_internal_property_t*) INI_MALLOC( ini->memctx, INITIAL_CAPACITY * sizeof( ini->properties[ 0 ] ) ); memset(ini->properties, 0, INITIAL_CAPACITY * sizeof(ini->properties[0])); ini->property_capacity = INITIAL_CAPACITY; ini->property_count = 0; return ini; } ini_t* ini_load( char const* data, void* memctx ) { ini_t* ini; char const* ptr; int s; char const* start; char const* start2; int l; ini = ini_create( memctx ); ptr = data; if( ptr ) { s = 0; while( *ptr ) { /* trim leading whitespace */ while( *ptr && *ptr <=' ' ) ++ptr; /* done? 
*/ if( !*ptr ) break; /* comment */ else if( *ptr == ';' ) { while( *ptr && *ptr !='\n' ) ++ptr; } /* section */ else if( *ptr == '[' ) { ++ptr; start = ptr; while( *ptr && *ptr !=']' && *ptr != '\n' ) ++ptr; if( *ptr == ']' ) { s = ini_section_add( ini, start, (int)( ptr - start) ); ++ptr; } } /* property */ else { start = ptr; while( *ptr && *ptr !='=' && *ptr != '\n' ) ++ptr; if( *ptr == '=' ) { l = (int)( ptr - start); ++ptr; while( *ptr && *ptr <= ' ' && *ptr != '\n' ) ptr++; start2 = ptr; while( *ptr && *ptr != '\n' ) ++ptr; while( *(--ptr) <= ' ' ) (void)ptr; ptr++; ini_property_add( ini, s, start, l, start2, (int)( ptr - start2) ); } } } } return ini; } int ini_save( ini_t const* ini, char* data, int size ) { int s; int p; int i; int l; char* n; int pos; if( ini ) { pos = 0; for( s = 0; s < ini->section_count; ++s ) { n = ini->sections[ s ].name_large ? ini->sections[ s ].name_large : ini->sections[ s ].name; l = (int) INI_STRLEN( n ); if( l > 0 ) { if( data && pos < size ) data[ pos ] = '['; ++pos; for( i = 0; i < l; ++i ) { if( data && pos < size ) data[ pos ] = n[ i ]; ++pos; } if( data && pos < size ) data[ pos ] = ']'; ++pos; if( data && pos < size ) data[ pos ] = '\n'; ++pos; } for( p = 0; p < ini->property_count; ++p ) { if( ini->properties[ p ].section == s ) { n = ini->properties[ p ].name_large ? ini->properties[ p ].name_large : ini->properties[ p ].name; l = (int) INI_STRLEN( n ); for( i = 0; i < l; ++i ) { if( data && pos < size ) data[ pos ] = n[ i ]; ++pos; } if( data && pos < size ) data[ pos ] = '='; ++pos; n = ini->properties[ p ].value_large ? 
ini->properties[ p ].value_large : ini->properties[ p ].value; l = (int) INI_STRLEN( n ); for( i = 0; i < l; ++i ) { if( data && pos < size ) data[ pos ] = n[ i ]; ++pos; } if( data && pos < size ) data[ pos ] = '\n'; ++pos; } } if( pos > 0 ) { if( data && pos < size ) data[ pos ] = '\n'; ++pos; } } if( data && pos < size ) data[ pos ] = '\0'; ++pos; return pos; } return 0; } void ini_destroy( ini_t* ini ) { int i; if( ini ) { for( i = 0; i < ini->property_count; ++i ) { if( ini->properties[ i ].value_large ) INI_FREE( ini->memctx, ini->properties[ i ].value_large ); if( ini->properties[ i ].name_large ) INI_FREE( ini->memctx, ini->properties[ i ].name_large ); } for( i = 0; i < ini->section_count; ++i ) if( ini->sections[ i ].name_large ) INI_FREE( ini->memctx, ini->sections[ i ].name_large ); INI_FREE( ini->memctx, ini->properties ); INI_FREE( ini->memctx, ini->sections ); INI_FREE( ini->memctx, ini ); } } int ini_section_count( ini_t const* ini ) { if( ini ) return ini->section_count; return 0; } char const* ini_section_name( ini_t const* ini, int section ) { if( ini && section >= 0 && section < ini->section_count ) return ini->sections[ section ].name_large ? ini->sections[ section ].name_large : ini->sections[ section ].name; return NULL; } int ini_property_count( ini_t const* ini, int section ) { int i; int count; if( ini ) { count = 0; for( i = 0; i < ini->property_count; ++i ) { if( ini->properties[ i ].section == section ) ++count; } return count; } return 0; } char const* ini_property_name( ini_t const* ini, int section, int property ) { int p; if( ini && section >= 0 && section < ini->section_count ) { p = ini_internal_property_index( ini, section, property ); if( p != INI_NOT_FOUND ) return ini->properties[ p ].name_large ? 
ini->properties[ p ].name_large : ini->properties[ p ].name; } return NULL; } char const* ini_property_value( ini_t const* ini, int section, int property ) { int p; if( ini && section >= 0 && section < ini->section_count ) { p = ini_internal_property_index( ini, section, property ); if( p != INI_NOT_FOUND ) return ini->properties[ p ].value_large ? ini->properties[ p ].value_large : ini->properties[ p ].value; } return NULL; } int ini_find_section( ini_t const* ini, char const* name, int name_length ) { int i; if( ini && name ) { if( name_length <= 0 ) name_length = (int) INI_STRLEN( name ); for( i = 0; i < ini->section_count; ++i ) { char const* const other = ini->sections[ i ].name_large ? ini->sections[ i ].name_large : ini->sections[ i ].name; if( INI_STRNICMP( name, other, name_length ) == 0 ) return i; } } return INI_NOT_FOUND; } int ini_find_property( ini_t const* ini, int section, char const* name, int name_length ) { int i; int c; if( ini && name && section >= 0 && section < ini->section_count) { if( name_length <= 0 ) name_length = (int) INI_STRLEN( name ); c = 0; for( i = 0; i < ini->property_capacity; ++i ) { if( ini->properties[ i ].section == section ) { char const* const other = ini->properties[ i ].name_large ? 
ini->properties[ i ].name_large : ini->properties[ i ].name; if( INI_STRNICMP( name, other, name_length ) == 0 ) return c; ++c; } } } return INI_NOT_FOUND; } int ini_section_add( ini_t* ini, char const* name, int length ) { struct ini_internal_section_t* new_sections; if( ini && name ) { if( length <= 0 ) length = (int) INI_STRLEN( name ); if( ini->section_count >= ini->section_capacity ) { ini->section_capacity *= 2; new_sections = (struct ini_internal_section_t*) INI_MALLOC( ini->memctx, ini->section_capacity * sizeof( ini->sections[ 0 ] ) ); INI_MEMCPY( new_sections, ini->sections, ini->section_count * sizeof( ini->sections[ 0 ] ) ); INI_FREE( ini->memctx, ini->sections ); ini->sections = new_sections; } ini->sections[ ini->section_count ].name_large = 0; if( length + 1 >= sizeof( ini->sections[ 0 ].name ) ) { ini->sections[ ini->section_count ].name_large = (char*) INI_MALLOC( ini->memctx, (size_t) length + 1 ); INI_MEMCPY( ini->sections[ ini->section_count ].name_large, name, (size_t) length ); ini->sections[ ini->section_count ].name_large[ length ] = '\0'; } else { INI_MEMCPY( ini->sections[ ini->section_count ].name, name, (size_t) length ); ini->sections[ ini->section_count ].name[ length ] = '\0'; } return ini->section_count++; } return INI_NOT_FOUND; } void ini_property_add( ini_t* ini, int section, char const* name, int name_length, char const* value, int value_length ) { struct ini_internal_property_t* new_properties; if( ini && name && section >= 0 && section < ini->section_count ) { if( name_length <= 0 ) name_length = (int) INI_STRLEN( name ); if( value_length <= 0 ) value_length = (int) INI_STRLEN( value ); if( ini->property_count >= ini->property_capacity ) { ini->property_capacity *= 2; new_properties = (struct ini_internal_property_t*) INI_MALLOC( ini->memctx, ini->property_capacity * sizeof( ini->properties[ 0 ] ) ); INI_MEMCPY( new_properties, ini->properties, ini->property_count * sizeof( ini->properties[ 0 ] ) ); INI_FREE( ini->memctx, 
ini->properties ); ini->properties = new_properties; } ini->properties[ ini->property_count ].section = section; ini->properties[ ini->property_count ].name_large = 0; ini->properties[ ini->property_count ].value_large = 0; if( name_length + 1 >= sizeof( ini->properties[ 0 ].name ) ) { ini->properties[ ini->property_count ].name_large = (char*) INI_MALLOC( ini->memctx, (size_t) name_length + 1 ); INI_MEMCPY( ini->properties[ ini->property_count ].name_large, name, (size_t) name_length ); ini->properties[ ini->property_count ].name_large[ name_length ] = '\0'; } else { INI_MEMCPY( ini->properties[ ini->property_count ].name, name, (size_t) name_length ); ini->properties[ ini->property_count ].name[ name_length ] = '\0'; } if( value_length + 1 >= sizeof( ini->properties[ 0 ].value ) ) { ini->properties[ ini->property_count ].value_large = (char*) INI_MALLOC( ini->memctx, (size_t) value_length + 1 ); INI_MEMCPY( ini->properties[ ini->property_count ].value_large, value, (size_t) value_length ); ini->properties[ ini->property_count ].value_large[ value_length ] = '\0'; } else { INI_MEMCPY( ini->properties[ ini->property_count ].value, value, (size_t) value_length ); ini->properties[ ini->property_count ].value[ value_length ] = '\0'; } ++ini->property_count; } } void ini_section_remove( ini_t* ini, int section ) { int p; if( ini && section >= 0 && section < ini->section_count ) { if( ini->sections[ section ].name_large ) INI_FREE( ini->memctx, ini->sections[ section ].name_large ); for( p = ini->property_count - 1; p >= 0; --p ) { if( ini->properties[ p ].section == section ) { if( ini->properties[ p ].value_large ) INI_FREE( ini->memctx, ini->properties[ p ].value_large ); if( ini->properties[ p ].name_large ) INI_FREE( ini->memctx, ini->properties[ p ].name_large ); ini->properties[ p ] = ini->properties[ --ini->property_count ]; } } ini->sections[ section ] = ini->sections[ --ini->section_count ]; for( p = 0; p < ini->property_count; ++p ) { if( ini->properties[ p 
].section == ini->section_count ) ini->properties[ p ].section = section; } } } void ini_property_remove( ini_t* ini, int section, int property ) { int p; if( ini && section >= 0 && section < ini->section_count ) { p = ini_internal_property_index( ini, section, property ); if( p != INI_NOT_FOUND ) { if( ini->properties[ p ].value_large ) INI_FREE( ini->memctx, ini->properties[ p ].value_large ); if( ini->properties[ p ].name_large ) INI_FREE( ini->memctx, ini->properties[ p ].name_large ); ini->properties[ p ] = ini->properties[ --ini->property_count ]; return; } } } void ini_section_name_set( ini_t* ini, int section, char const* name, int length ) { if( ini && name && section >= 0 && section < ini->section_count ) { if( length <= 0 ) length = (int) INI_STRLEN( name ); if( ini->sections[ section ].name_large ) INI_FREE( ini->memctx, ini->sections[ section ].name_large ); ini->sections[ section ].name_large = 0; if( length + 1 >= sizeof( ini->sections[ 0 ].name ) ) { ini->sections[ section ].name_large = (char*) INI_MALLOC( ini->memctx, (size_t) length + 1 ); INI_MEMCPY( ini->sections[ section ].name_large, name, (size_t) length ); ini->sections[ section ].name_large[ length ] = '\0'; } else { INI_MEMCPY( ini->sections[ section ].name, name, (size_t) length ); ini->sections[ section ].name[ length ] = '\0'; } } } void ini_property_name_set( ini_t* ini, int section, int property, char const* name, int length ) { int p; if( ini && name && section >= 0 && section < ini->section_count ) { if( length <= 0 ) length = (int) INI_STRLEN( name ); p = ini_internal_property_index( ini, section, property ); if( p != INI_NOT_FOUND ) { if( ini->properties[ p ].name_large ) INI_FREE( ini->memctx, ini->properties[ p ].name_large ); ini->properties[ ini->property_count ].name_large = 0; if( length + 1 >= sizeof( ini->properties[ 0 ].name ) ) { ini->properties[ p ].name_large = (char*) INI_MALLOC( ini->memctx, (size_t) length + 1 ); INI_MEMCPY( ini->properties[ p ].name_large, name, 
(size_t) length ); ini->properties[ p ].name_large[ length ] = '\0'; } else { INI_MEMCPY( ini->properties[ p ].name, name, (size_t) length ); ini->properties[ p ].name[ length ] = '\0'; } } } } void ini_property_value_set( ini_t* ini, int section, int property, char const* value, int length ) { int p; if( ini && value && section >= 0 && section < ini->section_count ) { if( length <= 0 ) length = (int) INI_STRLEN( value ); p = ini_internal_property_index( ini, section, property ); if( p != INI_NOT_FOUND ) { if( ini->properties[ p ].value_large ) INI_FREE( ini->memctx, ini->properties[ p ].value_large ); ini->properties[ ini->property_count ].value_large = 0; if( length + 1 >= sizeof( ini->properties[ 0 ].value ) ) { ini->properties[ p ].value_large = (char*) INI_MALLOC( ini->memctx, (size_t) length + 1 ); INI_MEMCPY( ini->properties[ p ].value_large, value, (size_t) length ); ini->properties[ p ].value_large[ length ] = '\0'; } else { INI_MEMCPY( ini->properties[ p ].value, value, (size_t) length ); ini->properties[ p ].value[ length ] = '\0'; } } } } #endif /* INI_IMPLEMENTATION */ /* contributors: Randy Gaul (copy-paste bug in ini_property_value_set) Branimir Karadzic (INI_STRNICMP bugfix) revision history: 1.2 using strnicmp for correct length compares, fixed copy-paste bug in ini_property_value_set 1.1 customization, added documentation, cleanup 1.0 first publicly released version */ /* ------------------------------------------------------------------------------ This software is available under 2 licenses - you may choose the one you like. 
------------------------------------------------------------------------------ ALTERNATIVE A - MIT License Copyright (c) 2015 Mattias Gustavsson Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ------------------------------------------------------------------------------ ALTERNATIVE B - Public Domain (www.unlicense.org) This is free and unencumbered software released into the public domain. Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means. In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and successors. We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ------------------------------------------------------------------------------ */ pg_auto_failover-1.6.3/src/bin/lib/log/000077500000000000000000000000001414244367200177435ustar00rootroot00000000000000pg_auto_failover-1.6.3/src/bin/lib/log/LICENSE000066400000000000000000000020271414244367200207510ustar00rootroot00000000000000Copyright (c) 2017 rxi Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
pg_auto_failover-1.6.3/src/bin/lib/log/README.md000066400000000000000000000040561414244367200212270ustar00rootroot00000000000000# log.c A simple logging library implemented in C99 ![screenshot](https://cloud.githubusercontent.com/assets/3920290/23831970/a2415e96-0723-11e7-9886-f8f5d2de60fe.png) ## Usage **[log.c](src/log.c?raw=1)** and **[log.h](src/log.h?raw=1)** should be dropped into an existing project and compiled along with it. The library provides 6 function-like macros for logging: ```c log_trace(const char *fmt, ...); log_debug(const char *fmt, ...); log_info(const char *fmt, ...); log_warn(const char *fmt, ...); log_error(const char *fmt, ...); log_fatal(const char *fmt, ...); ``` Each function takes a printf format string followed by additional arguments: ```c log_trace("Hello %s", "world") ``` Resulting in a line with the given format printed to stderr: ``` 20:18:26 TRACE src/main.c:11: Hello world ``` #### log_set_quiet(int enable) Quiet-mode can be enabled by passing `1` to the `log_set_quiet()` function. While this mode is enabled the library will not output anything to stderr, but will continue to write to the file if one is set. #### log_set_level(int level) The current logging level can be set by using the `log_set_level()` function. All logs below the given level will be ignored. By default the level is `LOG_TRACE`, such that nothing is ignored. #### log_set_fp(FILE *fp) A file pointer where the log should be written can be provided to the library by using the `log_set_fp()` function. The data written to the file output is of the following format: ``` 2047-03-11 20:18:26 TRACE src/main.c:11: Hello world ``` #### log_set_lock(log_LockFn fn) If the log will be written to from multiple threads a lock function can be set. The function is passed a `udata` value (set by `log_set_udata()`) and the integer `1` if the lock should be acquired or `0` if the lock should be released. 
#### log_use_colors(int enable) Colors in the log output can be enabled by passing `1` to the `log_use_colors()` function. ## License This library is free software; you can redistribute it and/or modify it under the terms of the MIT license. See [LICENSE](LICENSE) for details. pg_auto_failover-1.6.3/src/bin/lib/log/src/000077500000000000000000000000001414244367200205325ustar00rootroot00000000000000pg_auto_failover-1.6.3/src/bin/lib/log/src/log.c000066400000000000000000000066421414244367200214670ustar00rootroot00000000000000/* * Copyright (c) 2017 rxi * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. 
*/ #include #include #include #include #include #include #include "snprintf.h" #include "log.h" static struct { void *udata; log_LockFn lock; FILE *fp; int level; int quiet; int useColors; } L; static const char *level_names[] = { "TRACE", "DEBUG", "INFO", "WARN", "ERROR", "FATAL" }; static const char *level_colors[] = { "\x1b[94m", "\x1b[36m", "\x1b[32m", "\x1b[33m", "\x1b[31m", "\x1b[35m" }; static void lock(void) { if (L.lock) { L.lock(L.udata, 1); } } static void unlock(void) { if (L.lock) { L.lock(L.udata, 0); } } void log_set_udata(void *udata) { L.udata = udata; } void log_set_lock(log_LockFn fn) { L.lock = fn; } void log_set_fp(FILE *fp) { L.fp = fp; } void log_set_level(int level) { L.level = level; } int log_get_level(void) { return L.level; } void log_set_quiet(int enable) { L.quiet = enable ? 1 : 0; } void log_use_colors(int enable) { L.useColors = enable ? 1 : 0; } void log_log(int level, const char *file, int line, const char *fmt, ...) { time_t t; struct tm *lt; if (level < L.level) { return; } if (fmt == NULL) { return; } /* Acquire lock */ lock(); /* Get current time */ t = time(NULL); lt = localtime(&t); /* Log to stderr */ if (!L.quiet) { va_list args; char buf[16]; int showLineNumber = L.level <= 1; buf[strftime(buf, sizeof(buf), "%H:%M:%S", lt)] = '\0'; if (L.useColors) { pg_fprintf(stderr, "%s %d %s%-5s\x1b[0m ", buf, getpid(), level_colors[level], level_names[level]); if (showLineNumber) { pg_fprintf(stderr, "\x1b[90m%s:%d:\x1b[0m ", file, line); } } else { pg_fprintf(stderr, "%s %d %-5s ", buf, getpid(), level_names[level]); if (showLineNumber) { pg_fprintf(stderr, "%s:%d ", file, line); } } va_start(args, fmt); pg_vfprintf(stderr, fmt, args); va_end(args); pg_fprintf(stderr, "\n"); } /* Log to file */ if (L.fp) { va_list args; char buf[32]; buf[strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", lt)] = '\0'; pg_fprintf(L.fp, "%s %d %-5s %s:%d: ", buf, getpid(), level_names[level], file, line); va_start(args, fmt); pg_vfprintf(L.fp, fmt, args); 
va_end(args); pg_fprintf(L.fp, "\n"); } /* Release lock */ unlock(); } pg_auto_failover-1.6.3/src/bin/lib/log/src/log.h000066400000000000000000000023661414244367200214730ustar00rootroot00000000000000/** * Copyright (c) 2017 rxi * * This library is free software; you can redistribute it and/or modify it * under the terms of the MIT license. See `log.c` for details. */ #ifndef LOG_H #define LOG_H #include #include #define LOG_VERSION "0.1.0" typedef void (*log_LockFn)(void *udata, int lock); enum { LOG_TRACE, LOG_DEBUG, LOG_INFO, LOG_WARN, LOG_ERROR, LOG_FATAL }; #define log_trace(...) log_log(LOG_TRACE, __FILE__, __LINE__, __VA_ARGS__) #define log_debug(...) log_log(LOG_DEBUG, __FILE__, __LINE__, __VA_ARGS__) #define log_info(...) log_log(LOG_INFO, __FILE__, __LINE__, __VA_ARGS__) #define log_warn(...) log_log(LOG_WARN, __FILE__, __LINE__, __VA_ARGS__) #define log_error(...) log_log(LOG_ERROR, __FILE__, __LINE__, __VA_ARGS__) #define log_fatal(...) log_log(LOG_FATAL, __FILE__, __LINE__, __VA_ARGS__) #define log_level(level, ...) log_log(level, __FILE__, __LINE__, __VA_ARGS__) void log_set_udata(void *udata); void log_set_lock(log_LockFn fn); void log_set_fp(FILE *fp); void log_set_level(int level); int log_get_level(void); void log_set_quiet(int enable); void log_use_colors(int enable); void log_log(int level, const char *file, int line, const char *fmt, ...) 
__attribute__((format(printf, 4, 5))); #endif pg_auto_failover-1.6.3/src/bin/lib/parson/000077500000000000000000000000001414244367200204645ustar00rootroot00000000000000pg_auto_failover-1.6.3/src/bin/lib/parson/.gitignore000066400000000000000000000000341414244367200224510ustar00rootroot00000000000000testcpp testcpp.* test.dSYM pg_auto_failover-1.6.3/src/bin/lib/parson/LICENSE000066400000000000000000000020671414244367200214760ustar00rootroot00000000000000MIT License Copyright (c) 2012 - 2020 Krzysztof Gabis Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
pg_auto_failover-1.6.3/src/bin/lib/parson/Makefile000066400000000000000000000007731414244367200221330ustar00rootroot00000000000000CC = gcc CFLAGS = -O0 -g -Wall -Wextra -std=c89 -pedantic-errors -DTESTS_MAIN CPPC = g++ CPPFLAGS = -O0 -g -Wall -Wextra -DTESTS_MAIN all: test testcpp test_hash_collisions .PHONY: test testcpp test_hash_collisions test: tests.c parson.c $(CC) $(CFLAGS) -o $@ tests.c parson.c ./$@ testcpp: tests.c parson.c $(CPPC) $(CPPFLAGS) -o $@ tests.c parson.c ./$@ test_hash_collisions: tests.c parson.c $(CC) $(CFLAGS) -DPARSON_FORCE_HASH_COLLISIONS -o $@ tests.c parson.c ./$@ clean: rm -f test *.o pg_auto_failover-1.6.3/src/bin/lib/parson/parson.c000066400000000000000000002367731414244367200221540ustar00rootroot00000000000000/* SPDX-License-Identifier: MIT Parson 1.2.1 ( http://kgabis.github.com/parson/ ) Copyright (c) 2012 - 2021 Krzysztof Gabis Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #ifdef _MSC_VER #ifndef _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_WARNINGS #endif /* _CRT_SECURE_NO_WARNINGS */ #endif /* _MSC_VER */ #include "parson.h" #define PARSON_IMPL_VERSION_MAJOR 1 #define PARSON_IMPL_VERSION_MINOR 2 #define PARSON_IMPL_VERSION_PATCH 1 #if (PARSON_VERSION_MAJOR != PARSON_IMPL_VERSION_MAJOR)\ || (PARSON_VERSION_MINOR != PARSON_IMPL_VERSION_MINOR)\ || (PARSON_VERSION_PATCH != PARSON_IMPL_VERSION_PATCH) #error "parson version mismatch between parson.c and parson.h" #endif #include #include #include #include #include #include /* Apparently sscanf is not implemented in some "standard" libraries, so don't use it, if you * don't have to. */ #ifdef sscanf #undef sscanf #define sscanf THINK_TWICE_ABOUT_USING_SSCANF #endif /* strcpy is unsafe */ #ifdef strcpy #undef strcpy #endif #define strcpy USE_MEMCPY_INSTEAD_OF_STRCPY #define STARTING_CAPACITY 16 #define MAX_NESTING 2048 #define FLOAT_FORMAT "%1.17g" /* do not increase precision without incresing NUM_BUF_SIZE */ #define NUM_BUF_SIZE 64 /* double printed with "%1.17g" shouldn't be longer than 25 bytes so let's be paranoid and use 64 */ #define SIZEOF_TOKEN(a) (sizeof(a) - 1) #define SKIP_CHAR(str) ((*str)++) #define SKIP_WHITESPACES(str) while (isspace((unsigned char)(**str))) { SKIP_CHAR(str); } #define MAX(a, b) ((a) > (b) ? 
(a) : (b)) #undef malloc #undef free #if defined(isnan) && defined(isinf) #define IS_NUMBER_INVALID(x) (isnan((x)) || isinf((x))) #else #define IS_NUMBER_INVALID(x) (((x) * 0.0) != 0.0) #endif #define OBJECT_INVALID_IX ((size_t)-1) static JSON_Malloc_Function parson_malloc = malloc; static JSON_Free_Function parson_free = free; static int parson_escape_slashes = 1; #define IS_CONT(b) (((unsigned char)(b) & 0xC0) == 0x80) /* is utf-8 continuation byte */ typedef int parson_bool_t; #define PARSON_TRUE 1 #define PARSON_FALSE 0 typedef struct json_string { char *chars; size_t length; } JSON_String; /* Type definitions */ typedef union json_value_value { JSON_String string; double number; JSON_Object *object; JSON_Array *array; int boolean; int null; } JSON_Value_Value; struct json_value_t { JSON_Value *parent; JSON_Value_Type type; JSON_Value_Value value; }; struct json_object_t { JSON_Value *wrapping_value; size_t *cells; unsigned long *hashes; char **names; JSON_Value **values; size_t *cell_ixs; size_t count; size_t item_capacity; size_t cell_capacity; }; struct json_array_t { JSON_Value *wrapping_value; JSON_Value **items; size_t count; size_t capacity; }; /* Various */ static char * read_file(const char *filename); static void remove_comments(char *string, const char *start_token, const char *end_token); static char * parson_strndup(const char *string, size_t n); static char * parson_strdup(const char *string); static int hex_char_to_int(char c); static JSON_Status parse_utf16_hex(const char *string, unsigned int *result); static int num_bytes_in_utf8_sequence(unsigned char c); static JSON_Status verify_utf8_sequence(const unsigned char *string, int *len); static parson_bool_t is_valid_utf8(const char *string, size_t string_len); static parson_bool_t is_decimal(const char *string, size_t length); static unsigned long hash_string(const char *string, size_t n); /* JSON Object */ static JSON_Object * json_object_make(JSON_Value *wrapping_value); static JSON_Status 
json_object_init(JSON_Object *object, size_t capacity); static void json_object_deinit(JSON_Object *object, parson_bool_t free_keys, parson_bool_t free_values); static JSON_Status json_object_grow_and_rehash(JSON_Object *object); static size_t json_object_get_cell_ix(const JSON_Object *object, const char *key, size_t key_len, unsigned long hash, parson_bool_t *out_found); static JSON_Status json_object_add(JSON_Object *object, char *name, JSON_Value *value); static JSON_Value * json_object_getn_value(const JSON_Object *object, const char *name, size_t name_len); static JSON_Status json_object_remove_internal(JSON_Object *object, const char *name, parson_bool_t free_value); static JSON_Status json_object_dotremove_internal(JSON_Object *object, const char *name, parson_bool_t free_value); static void json_object_free(JSON_Object *object); /* JSON Array */ static JSON_Array * json_array_make(JSON_Value *wrapping_value); static JSON_Status json_array_add(JSON_Array *array, JSON_Value *value); static JSON_Status json_array_resize(JSON_Array *array, size_t new_capacity); static void json_array_free(JSON_Array *array); /* JSON Value */ static JSON_Value * json_value_init_string_no_copy(char *string, size_t length); static const JSON_String * json_value_get_string_desc(const JSON_Value *value); /* Parser */ static JSON_Status skip_quotes(const char **string); static JSON_Status parse_utf16(const char **unprocessed, char **processed); static char * process_string(const char *input, size_t input_len, size_t *output_len); static char * get_quoted_string(const char **string, size_t *output_string_len); static JSON_Value * parse_object_value(const char **string, size_t nesting); static JSON_Value * parse_array_value(const char **string, size_t nesting); static JSON_Value * parse_string_value(const char **string); static JSON_Value * parse_boolean_value(const char **string); static JSON_Value * parse_number_value(const char **string); static JSON_Value * parse_null_value(const 
char **string); static JSON_Value * parse_value(const char **string, size_t nesting); /* Serialization */ static int json_serialize_to_buffer_r(const JSON_Value *value, char *buf, int level, parson_bool_t is_pretty, char *num_buf); static int json_serialize_string(const char *string, size_t len, char *buf); static int append_indent(char *buf, int level); static int append_string(char *buf, const char *string); /* Various */ static char * read_file(const char * filename) { FILE *fp = fopen(filename, "r"); size_t size_to_read = 0; size_t size_read = 0; long pos; char *file_contents; if (!fp) { return NULL; } fseek(fp, 0L, SEEK_END); pos = ftell(fp); if (pos < 0) { fclose(fp); return NULL; } size_to_read = pos; rewind(fp); file_contents = (char*)parson_malloc(sizeof(char) * (size_to_read + 1)); if (!file_contents) { fclose(fp); return NULL; } size_read = fread(file_contents, 1, size_to_read, fp); if (size_read == 0 || ferror(fp)) { fclose(fp); parson_free(file_contents); return NULL; } fclose(fp); file_contents[size_read] = '\0'; return file_contents; } static void remove_comments(char *string, const char *start_token, const char *end_token) { parson_bool_t in_string = PARSON_FALSE, escaped = PARSON_FALSE; size_t i; char *ptr = NULL, current_char; size_t start_token_len = strlen(start_token); size_t end_token_len = strlen(end_token); if (start_token_len == 0 || end_token_len == 0) { return; } while ((current_char = *string) != '\0') { if (current_char == '\\' && !escaped) { escaped = PARSON_TRUE; string++; continue; } else if (current_char == '\"' && !escaped) { in_string = !in_string; } else if (!in_string && strncmp(string, start_token, start_token_len) == 0) { for(i = 0; i < start_token_len; i++) { string[i] = ' '; } string = string + start_token_len; ptr = strstr(string, end_token); if (!ptr) { return; } for (i = 0; i < (ptr - string) + end_token_len; i++) { string[i] = ' '; } string = ptr + end_token_len - 1; } escaped = PARSON_FALSE; string++; } } static char * 
parson_strndup(const char *string, size_t n) { /* We expect the caller has validated that 'n' fits within the input buffer. */ char *output_string = (char*)parson_malloc(n + 1); if (!output_string) { return NULL; } output_string[n] = '\0'; memcpy(output_string, string, n); return output_string; } static char * parson_strdup(const char *string) { return parson_strndup(string, strlen(string)); } static int hex_char_to_int(char c) { if (c >= '0' && c <= '9') { return c - '0'; } else if (c >= 'a' && c <= 'f') { return c - 'a' + 10; } else if (c >= 'A' && c <= 'F') { return c - 'A' + 10; } return -1; } static JSON_Status parse_utf16_hex(const char *s, unsigned int *result) { int x1, x2, x3, x4; if (s[0] == '\0' || s[1] == '\0' || s[2] == '\0' || s[3] == '\0') { return JSONFailure; } x1 = hex_char_to_int(s[0]); x2 = hex_char_to_int(s[1]); x3 = hex_char_to_int(s[2]); x4 = hex_char_to_int(s[3]); if (x1 == -1 || x2 == -1 || x3 == -1 || x4 == -1) { return JSONFailure; } *result = (unsigned int)((x1 << 12) | (x2 << 8) | (x3 << 4) | x4); return JSONSuccess; } static int num_bytes_in_utf8_sequence(unsigned char c) { if (c == 0xC0 || c == 0xC1 || c > 0xF4 || IS_CONT(c)) { return 0; } else if ((c & 0x80) == 0) { /* 0xxxxxxx */ return 1; } else if ((c & 0xE0) == 0xC0) { /* 110xxxxx */ return 2; } else if ((c & 0xF0) == 0xE0) { /* 1110xxxx */ return 3; } else if ((c & 0xF8) == 0xF0) { /* 11110xxx */ return 4; } return 0; /* won't happen */ } static JSON_Status verify_utf8_sequence(const unsigned char *string, int *len) { unsigned int cp = 0; *len = num_bytes_in_utf8_sequence(string[0]); if (*len == 1) { cp = string[0]; } else if (*len == 2 && IS_CONT(string[1])) { cp = string[0] & 0x1F; cp = (cp << 6) | (string[1] & 0x3F); } else if (*len == 3 && IS_CONT(string[1]) && IS_CONT(string[2])) { cp = ((unsigned char)string[0]) & 0xF; cp = (cp << 6) | (string[1] & 0x3F); cp = (cp << 6) | (string[2] & 0x3F); } else if (*len == 4 && IS_CONT(string[1]) && IS_CONT(string[2]) && 
IS_CONT(string[3])) { cp = string[0] & 0x7; cp = (cp << 6) | (string[1] & 0x3F); cp = (cp << 6) | (string[2] & 0x3F); cp = (cp << 6) | (string[3] & 0x3F); } else { return JSONFailure; } /* overlong encodings */ if ((cp < 0x80 && *len > 1) || (cp < 0x800 && *len > 2) || (cp < 0x10000 && *len > 3)) { return JSONFailure; } /* invalid unicode */ if (cp > 0x10FFFF) { return JSONFailure; } /* surrogate halves */ if (cp >= 0xD800 && cp <= 0xDFFF) { return JSONFailure; } return JSONSuccess; } static int is_valid_utf8(const char *string, size_t string_len) { int len = 0; const char *string_end = string + string_len; while (string < string_end) { if (verify_utf8_sequence((const unsigned char*)string, &len) != JSONSuccess) { return PARSON_FALSE; } string += len; } return PARSON_TRUE; } static parson_bool_t is_decimal(const char *string, size_t length) { if (length > 1 && string[0] == '0' && string[1] != '.') { return PARSON_FALSE; } if (length > 2 && !strncmp(string, "-0", 2) && string[2] != '.') { return PARSON_FALSE; } while (length--) { if (strchr("xX", string[length])) { return PARSON_FALSE; } } return PARSON_TRUE; } static unsigned long hash_string(const char *string, size_t n) { #ifdef PARSON_FORCE_HASH_COLLISIONS (void)string; (void)n; return 0; #else unsigned long hash = 5381; unsigned char c; size_t i = 0; for (i = 0; i < n; i++) { c = string[i]; if (c == '\0') { break; } hash = ((hash << 5) + hash) + c; /* hash * 33 + c */ } return hash; #endif } /* JSON Object */ static JSON_Object * json_object_make(JSON_Value *wrapping_value) { JSON_Status res = JSONFailure; JSON_Object *new_obj = (JSON_Object*)parson_malloc(sizeof(JSON_Object)); if (new_obj == NULL) { return NULL; } new_obj->wrapping_value = wrapping_value; res = json_object_init(new_obj, 0); if (res != JSONSuccess) { parson_free(new_obj); return NULL; } return new_obj; } static JSON_Status json_object_init(JSON_Object *object, size_t capacity) { unsigned int i = 0; object->cells = NULL; object->names = NULL; 
object->values = NULL; object->cell_ixs = NULL; object->hashes = NULL; object->count = 0; object->cell_capacity = capacity; object->item_capacity = (unsigned int)(capacity * 0.7f); if (capacity == 0) { return JSONSuccess; } object->cells = (size_t*)parson_malloc(object->cell_capacity * sizeof(*object->cells)); object->names = (char**)parson_malloc(object->item_capacity * sizeof(*object->names)); object->values = (JSON_Value**)parson_malloc(object->item_capacity * sizeof(*object->values)); object->cell_ixs = (size_t*)parson_malloc(object->item_capacity * sizeof(*object->cell_ixs)); object->hashes = (unsigned long*)parson_malloc(object->item_capacity * sizeof(*object->hashes)); if (object->cells == NULL || object->names == NULL || object->values == NULL || object->cell_ixs == NULL || object->hashes == NULL) { goto error; } for (i = 0; i < object->cell_capacity; i++) { object->cells[i] = OBJECT_INVALID_IX; } return JSONSuccess; error: parson_free(object->cells); parson_free(object->names); parson_free(object->values); parson_free(object->cell_ixs); parson_free(object->hashes); return JSONFailure; } static void json_object_deinit(JSON_Object *object, parson_bool_t free_keys, parson_bool_t free_values) { unsigned int i = 0; for (i = 0; i < object->count; i++) { if (free_keys) { parson_free(object->names[i]); } if (free_values) { json_value_free(object->values[i]); } } object->count = 0; object->item_capacity = 0; object->cell_capacity = 0; parson_free(object->cells); parson_free(object->names); parson_free(object->values); parson_free(object->cell_ixs); parson_free(object->hashes); object->cells = NULL; object->names = NULL; object->values = NULL; object->cell_ixs = NULL; object->hashes = NULL; } static JSON_Status json_object_grow_and_rehash(JSON_Object *object) { JSON_Value *wrapping_value = NULL; JSON_Object new_object; char *key = NULL; JSON_Value *value = NULL; unsigned int i = 0; size_t new_capacity = MAX(object->cell_capacity * 2, STARTING_CAPACITY); JSON_Status 
res = json_object_init(&new_object, new_capacity); if (res != JSONSuccess) { return JSONFailure; } wrapping_value = json_object_get_wrapping_value(object); new_object.wrapping_value = wrapping_value; for (i = 0; i < object->count; i++) { key = object->names[i]; value = object->values[i]; res = json_object_add(&new_object, key, value); if (res != JSONSuccess) { json_object_deinit(&new_object, PARSON_FALSE, PARSON_FALSE); return JSONFailure; } value->parent = wrapping_value; } json_object_deinit(object, PARSON_FALSE, PARSON_FALSE); *object = new_object; return JSONSuccess; } static size_t json_object_get_cell_ix(const JSON_Object *object, const char *key, size_t key_len, unsigned long hash, parson_bool_t *out_found) { size_t cell_ix = hash & (object->cell_capacity - 1); size_t cell = 0; size_t ix = 0; unsigned int i = 0; unsigned long hash_to_check = 0; const char *key_to_check = NULL; size_t key_to_check_len = 0; *out_found = PARSON_FALSE; for (i = 0; i < object->cell_capacity; i++) { ix = (cell_ix + i) & (object->cell_capacity - 1); cell = object->cells[ix]; if (cell == OBJECT_INVALID_IX) { return ix; } hash_to_check = object->hashes[cell]; if (hash != hash_to_check) { continue; } key_to_check = object->names[cell]; key_to_check_len = strlen(key_to_check); if (key_to_check_len == key_len && strncmp(key, key_to_check, key_len) == 0) { *out_found = PARSON_TRUE; return ix; } } return OBJECT_INVALID_IX; } static JSON_Status json_object_add(JSON_Object *object, char *name, JSON_Value *value) { unsigned long hash = 0; parson_bool_t found = PARSON_FALSE; size_t cell_ix = 0; JSON_Status res = JSONFailure; if (!object || !name || !value) { return JSONFailure; } hash = hash_string(name, strlen(name)); found = PARSON_FALSE; cell_ix = json_object_get_cell_ix(object, name, strlen(name), hash, &found); if (found) { return JSONFailure; } if (object->count >= object->item_capacity) { res = json_object_grow_and_rehash(object); if (res != JSONSuccess) { return JSONFailure; } cell_ix 
= json_object_get_cell_ix(object, name, strlen(name), hash, &found); } object->names[object->count] = name; object->cells[cell_ix] = object->count; object->values[object->count] = value; object->cell_ixs[object->count] = cell_ix; object->hashes[object->count] = hash; object->count++; value->parent = json_object_get_wrapping_value(object); return JSONSuccess; } static JSON_Value * json_object_getn_value(const JSON_Object *object, const char *name, size_t name_len) { unsigned long hash = 0; parson_bool_t found = PARSON_FALSE; unsigned long cell_ix = 0; size_t item_ix = 0; if (!object || !name) { return NULL; } hash = hash_string(name, name_len); found = PARSON_FALSE; cell_ix = json_object_get_cell_ix(object, name, name_len, hash, &found); if (!found) { return NULL; } item_ix = object->cells[cell_ix]; return object->values[item_ix]; } static JSON_Status json_object_remove_internal(JSON_Object *object, const char *name, parson_bool_t free_value) { unsigned long hash = 0; parson_bool_t found = PARSON_FALSE; size_t cell = 0; size_t item_ix = 0; size_t last_item_ix = 0; size_t i = 0; size_t j = 0; size_t x = 0; size_t k = 0; JSON_Value *val = NULL; if (object == NULL) { return JSONFailure; } hash = hash_string(name, strlen(name)); found = PARSON_FALSE; cell = json_object_get_cell_ix(object, name, strlen(name), hash, &found); if (!found) { return JSONFailure; } item_ix = object->cells[cell]; if (free_value) { val = object->values[item_ix]; json_value_free(val); val = NULL; } parson_free(object->names[item_ix]); last_item_ix = object->count - 1; if (item_ix < last_item_ix) { object->names[item_ix] = object->names[last_item_ix]; object->values[item_ix] = object->values[last_item_ix]; object->cell_ixs[item_ix] = object->cell_ixs[last_item_ix]; object->hashes[item_ix] = object->hashes[last_item_ix]; object->cells[object->cell_ixs[item_ix]] = item_ix; } object->count--; i = cell; j = i; for (x = 0; x < (object->cell_capacity - 1); x++) { j = (j + 1) & (object->cell_capacity - 
1); if (object->cells[j] == OBJECT_INVALID_IX) { break; } k = object->hashes[object->cells[j]] & (object->cell_capacity - 1); if ((j > i && (k <= i || k > j)) || (j < i && (k <= i && k > j))) { object->cell_ixs[object->cells[j]] = i; object->cells[i] = object->cells[j]; i = j; } } object->cells[i] = OBJECT_INVALID_IX; return JSONSuccess; } static JSON_Status json_object_dotremove_internal(JSON_Object *object, const char *name, parson_bool_t free_value) { JSON_Value *temp_value = NULL; JSON_Object *temp_object = NULL; const char *dot_pos = strchr(name, '.'); if (!dot_pos) { return json_object_remove_internal(object, name, free_value); } temp_value = json_object_getn_value(object, name, dot_pos - name); if (json_value_get_type(temp_value) != JSONObject) { return JSONFailure; } temp_object = json_value_get_object(temp_value); return json_object_dotremove_internal(temp_object, dot_pos + 1, free_value); } static void json_object_free(JSON_Object *object) { json_object_deinit(object, PARSON_TRUE, PARSON_TRUE); parson_free(object); } /* JSON Array */ static JSON_Array * json_array_make(JSON_Value *wrapping_value) { JSON_Array *new_array = (JSON_Array*)parson_malloc(sizeof(JSON_Array)); if (new_array == NULL) { return NULL; } new_array->wrapping_value = wrapping_value; new_array->items = (JSON_Value**)NULL; new_array->capacity = 0; new_array->count = 0; return new_array; } static JSON_Status json_array_add(JSON_Array *array, JSON_Value *value) { if (array->count >= array->capacity) { size_t new_capacity = MAX(array->capacity * 2, STARTING_CAPACITY); if (json_array_resize(array, new_capacity) != JSONSuccess) { return JSONFailure; } } value->parent = json_array_get_wrapping_value(array); array->items[array->count] = value; array->count++; return JSONSuccess; } static JSON_Status json_array_resize(JSON_Array *array, size_t new_capacity) { JSON_Value **new_items = NULL; if (new_capacity == 0) { return JSONFailure; } new_items = (JSON_Value**)parson_malloc(new_capacity * 
sizeof(JSON_Value*)); if (new_items == NULL) { return JSONFailure; } if (array->items != NULL && array->count > 0) { memcpy(new_items, array->items, array->count * sizeof(JSON_Value*)); } parson_free(array->items); array->items = new_items; array->capacity = new_capacity; return JSONSuccess; } static void json_array_free(JSON_Array *array) { size_t i; for (i = 0; i < array->count; i++) { json_value_free(array->items[i]); } parson_free(array->items); parson_free(array); } /* JSON Value */ static JSON_Value * json_value_init_string_no_copy(char *string, size_t length) { JSON_Value *new_value = (JSON_Value*)parson_malloc(sizeof(JSON_Value)); if (!new_value) { return NULL; } new_value->parent = NULL; new_value->type = JSONString; new_value->value.string.chars = string; new_value->value.string.length = length; return new_value; } /* Parser */ static JSON_Status skip_quotes(const char **string) { if (**string != '\"') { return JSONFailure; } SKIP_CHAR(string); while (**string != '\"') { if (**string == '\0') { return JSONFailure; } else if (**string == '\\') { SKIP_CHAR(string); if (**string == '\0') { return JSONFailure; } } SKIP_CHAR(string); } SKIP_CHAR(string); return JSONSuccess; } static JSON_Status parse_utf16(const char **unprocessed, char **processed) { unsigned int cp, lead, trail; char *processed_ptr = *processed; const char *unprocessed_ptr = *unprocessed; JSON_Status status = JSONFailure; unprocessed_ptr++; /* skips u */ status = parse_utf16_hex(unprocessed_ptr, &cp); if (status != JSONSuccess) { return JSONFailure; } if (cp < 0x80) { processed_ptr[0] = (char)cp; /* 0xxxxxxx */ } else if (cp < 0x800) { processed_ptr[0] = ((cp >> 6) & 0x1F) | 0xC0; /* 110xxxxx */ processed_ptr[1] = ((cp) & 0x3F) | 0x80; /* 10xxxxxx */ processed_ptr += 1; } else if (cp < 0xD800 || cp > 0xDFFF) { processed_ptr[0] = ((cp >> 12) & 0x0F) | 0xE0; /* 1110xxxx */ processed_ptr[1] = ((cp >> 6) & 0x3F) | 0x80; /* 10xxxxxx */ processed_ptr[2] = ((cp) & 0x3F) | 0x80; /* 10xxxxxx */ 
processed_ptr += 2; } else if (cp >= 0xD800 && cp <= 0xDBFF) { /* lead surrogate (0xD800..0xDBFF) */ lead = cp; unprocessed_ptr += 4; /* should always be within the buffer, otherwise previous sscanf would fail */ if (*unprocessed_ptr++ != '\\' || *unprocessed_ptr++ != 'u') { return JSONFailure; } status = parse_utf16_hex(unprocessed_ptr, &trail); if (status != JSONSuccess || trail < 0xDC00 || trail > 0xDFFF) { /* valid trail surrogate? (0xDC00..0xDFFF) */ return JSONFailure; } cp = ((((lead - 0xD800) & 0x3FF) << 10) | ((trail - 0xDC00) & 0x3FF)) + 0x010000; processed_ptr[0] = (((cp >> 18) & 0x07) | 0xF0); /* 11110xxx */ processed_ptr[1] = (((cp >> 12) & 0x3F) | 0x80); /* 10xxxxxx */ processed_ptr[2] = (((cp >> 6) & 0x3F) | 0x80); /* 10xxxxxx */ processed_ptr[3] = (((cp) & 0x3F) | 0x80); /* 10xxxxxx */ processed_ptr += 3; } else { /* trail surrogate before lead surrogate */ return JSONFailure; } unprocessed_ptr += 3; *processed = processed_ptr; *unprocessed = unprocessed_ptr; return JSONSuccess; } /* Copies and processes passed string up to supplied length. 
Example: "\u006Corem ipsum" -> lorem ipsum */ static char* process_string(const char *input, size_t input_len, size_t *output_len) { const char *input_ptr = input; size_t initial_size = (input_len + 1) * sizeof(char); size_t final_size = 0; char *output = NULL, *output_ptr = NULL, *resized_output = NULL; output = (char*)parson_malloc(initial_size); if (output == NULL) { goto error; } output_ptr = output; while ((*input_ptr != '\0') && (size_t)(input_ptr - input) < input_len) { if (*input_ptr == '\\') { input_ptr++; switch (*input_ptr) { case '\"': *output_ptr = '\"'; break; case '\\': *output_ptr = '\\'; break; case '/': *output_ptr = '/'; break; case 'b': *output_ptr = '\b'; break; case 'f': *output_ptr = '\f'; break; case 'n': *output_ptr = '\n'; break; case 'r': *output_ptr = '\r'; break; case 't': *output_ptr = '\t'; break; case 'u': if (parse_utf16(&input_ptr, &output_ptr) != JSONSuccess) { goto error; } break; default: goto error; } } else if ((unsigned char)*input_ptr < 0x20) { goto error; /* 0x00-0x19 are invalid characters for json string (http://www.ietf.org/rfc/rfc4627.txt) */ } else { *output_ptr = *input_ptr; } output_ptr++; input_ptr++; } *output_ptr = '\0'; /* resize to new length */ final_size = (size_t)(output_ptr-output) + 1; /* todo: don't resize if final_size == initial_size */ resized_output = (char*)parson_malloc(final_size); if (resized_output == NULL) { goto error; } memcpy(resized_output, output, final_size); *output_len = final_size - 1; parson_free(output); return resized_output; error: parson_free(output); return NULL; } /* Return processed contents of a string between quotes and skips passed argument to a matching quote. 
*/
static char * get_quoted_string(const char **string, size_t *output_string_len) {
    const char *string_start = *string;
    size_t input_string_len = 0;
    JSON_Status status = skip_quotes(string);
    if (status != JSONSuccess) {
        return NULL;
    }
    input_string_len = *string - string_start - 2; /* length without quotes */
    return process_string(string_start + 1, input_string_len, output_string_len);
}

/* Dispatches on the first non-whitespace character to the specific value
 * parser; enforces MAX_NESTING to bound recursion depth. */
static JSON_Value * parse_value(const char **string, size_t nesting) {
    if (nesting > MAX_NESTING) {
        return NULL;
    }
    SKIP_WHITESPACES(string);
    switch (**string) {
        case '{':
            return parse_object_value(string, nesting + 1);
        case '[':
            return parse_array_value(string, nesting + 1);
        case '\"':
            return parse_string_value(string);
        case 'f': case 't':
            return parse_boolean_value(string);
        case '-':
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
            return parse_number_value(string);
        case 'n':
            return parse_null_value(string);
        default:
            return NULL;
    }
}

/* Parses a '{...}' object: repeated "key" : value pairs separated by commas.
 * On any failure the partially built value is freed before returning NULL. */
static JSON_Value * parse_object_value(const char **string, size_t nesting) {
    JSON_Status status = JSONFailure;
    JSON_Value *output_value = NULL, *new_value = NULL;
    JSON_Object *output_object = NULL;
    char *new_key = NULL;
    output_value = json_value_init_object();
    if (output_value == NULL) {
        return NULL;
    }
    if (**string != '{') {
        json_value_free(output_value);
        return NULL;
    }
    output_object = json_value_get_object(output_value);
    SKIP_CHAR(string);
    SKIP_WHITESPACES(string);
    if (**string == '}') { /* empty object */
        SKIP_CHAR(string);
        return output_value;
    }
    while (**string != '\0') {
        size_t key_len = 0;
        new_key = get_quoted_string(string, &key_len);
        /* We do not support key names with embedded \0 chars */
        if (!new_key) {
            json_value_free(output_value);
            return NULL;
        }
        if (key_len != strlen(new_key)) {
            parson_free(new_key);
            json_value_free(output_value);
            return NULL;
        }
        SKIP_WHITESPACES(string);
        if (**string != ':') {
            parson_free(new_key);
            json_value_free(output_value);
            return NULL;
        }
        SKIP_CHAR(string);
        new_value = parse_value(string, nesting);
        if (new_value == NULL) {
            parson_free(new_key);
            json_value_free(output_value);
            return NULL;
        }
        status = json_object_add(output_object, new_key, new_value);
        if (status != JSONSuccess) {
            parson_free(new_key);
            json_value_free(new_value);
            json_value_free(output_value);
            return NULL;
        }
        SKIP_WHITESPACES(string);
        if (**string != ',') {
            break;
        }
        SKIP_CHAR(string);
        SKIP_WHITESPACES(string);
    }
    SKIP_WHITESPACES(string);
    if (**string != '}') {
        json_value_free(output_value);
        return NULL;
    }
    SKIP_CHAR(string);
    return output_value;
}

/* Parses a '[...]' array; the backing buffer is trimmed to the final count
 * once parsing succeeds. */
static JSON_Value * parse_array_value(const char **string, size_t nesting) {
    JSON_Value *output_value = NULL, *new_array_value = NULL;
    JSON_Array *output_array = NULL;
    output_value = json_value_init_array();
    if (output_value == NULL) {
        return NULL;
    }
    if (**string != '[') {
        json_value_free(output_value);
        return NULL;
    }
    output_array = json_value_get_array(output_value);
    SKIP_CHAR(string);
    SKIP_WHITESPACES(string);
    if (**string == ']') { /* empty array */
        SKIP_CHAR(string);
        return output_value;
    }
    while (**string != '\0') {
        new_array_value = parse_value(string, nesting);
        if (new_array_value == NULL) {
            json_value_free(output_value);
            return NULL;
        }
        if (json_array_add(output_array, new_array_value) != JSONSuccess) {
            json_value_free(new_array_value);
            json_value_free(output_value);
            return NULL;
        }
        SKIP_WHITESPACES(string);
        if (**string != ',') {
            break;
        }
        SKIP_CHAR(string);
        SKIP_WHITESPACES(string);
    }
    SKIP_WHITESPACES(string);
    if (**string != ']' ||
        /* Trim array after parsing is over */
        json_array_resize(output_array, json_array_get_count(output_array)) != JSONSuccess) {
        json_value_free(output_value);
        return NULL;
    }
    SKIP_CHAR(string);
    return output_value;
}

/* Parses a quoted JSON string into a string value; the unescaped buffer's
 * ownership moves into the value (no extra copy). */
static JSON_Value * parse_string_value(const char **string) {
    JSON_Value *value = NULL;
    size_t new_string_len = 0;
    char *new_string = get_quoted_string(string, &new_string_len);
    if (new_string == NULL) {
        return NULL;
    }
    value = json_value_init_string_no_copy(new_string,
new_string_len); if (value == NULL) { parson_free(new_string); return NULL; } return value; } static JSON_Value * parse_boolean_value(const char **string) { size_t true_token_size = SIZEOF_TOKEN("true"); size_t false_token_size = SIZEOF_TOKEN("false"); if (strncmp("true", *string, true_token_size) == 0) { *string += true_token_size; return json_value_init_boolean(1); } else if (strncmp("false", *string, false_token_size) == 0) { *string += false_token_size; return json_value_init_boolean(0); } return NULL; } static JSON_Value * parse_number_value(const char **string) { char *end; double number = 0; errno = 0; number = strtod(*string, &end); if (errno == ERANGE && (number <= -HUGE_VAL || number >= HUGE_VAL)) { return NULL; } if ((errno && errno != ERANGE) || !is_decimal(*string, end - *string)) { return NULL; } *string = end; return json_value_init_number(number); } static JSON_Value * parse_null_value(const char **string) { size_t token_size = SIZEOF_TOKEN("null"); if (strncmp("null", *string, token_size) == 0) { *string += token_size; return json_value_init_null(); } return NULL; } /* Serialization */ #define APPEND_STRING(str) do { written = append_string(buf, (str));\ if (written < 0) { return -1; }\ if (buf != NULL) { buf += written; }\ written_total += written; } while(0) #define APPEND_INDENT(level) do { written = append_indent(buf, (level));\ if (written < 0) { return -1; }\ if (buf != NULL) { buf += written; }\ written_total += written; } while(0) static int json_serialize_to_buffer_r(const JSON_Value *value, char *buf, int level, parson_bool_t is_pretty, char *num_buf) { const char *key = NULL, *string = NULL; JSON_Value *temp_value = NULL; JSON_Array *array = NULL; JSON_Object *object = NULL; size_t i = 0, count = 0; double num = 0.0; int written = -1, written_total = 0; size_t len = 0; switch (json_value_get_type(value)) { case JSONArray: array = json_value_get_array(value); count = json_array_get_count(array); APPEND_STRING("["); if (count > 0 && 
is_pretty) {
                APPEND_STRING("\n");
            }
            for (i = 0; i < count; i++) {
                if (is_pretty) {
                    APPEND_INDENT(level+1);
                }
                temp_value = json_array_get_value(array, i);
                written = json_serialize_to_buffer_r(temp_value, buf, level+1, is_pretty, num_buf);
                if (written < 0) {
                    return -1;
                }
                if (buf != NULL) {
                    buf += written;
                }
                written_total += written;
                if (i < (count - 1)) {
                    APPEND_STRING(",");
                }
                if (is_pretty) {
                    APPEND_STRING("\n");
                }
            }
            if (count > 0 && is_pretty) {
                APPEND_INDENT(level);
            }
            APPEND_STRING("]");
            return written_total;
        case JSONObject:
            object = json_value_get_object(value);
            count = json_object_get_count(object);
            APPEND_STRING("{");
            if (count > 0 && is_pretty) {
                APPEND_STRING("\n");
            }
            for (i = 0; i < count; i++) {
                key = json_object_get_name(object, i);
                if (key == NULL) {
                    return -1;
                }
                if (is_pretty) {
                    APPEND_INDENT(level+1);
                }
                /* We do not support key names with embedded \0 chars */
                written = json_serialize_string(key, strlen(key), buf);
                if (written < 0) {
                    return -1;
                }
                if (buf != NULL) {
                    buf += written;
                }
                written_total += written;
                APPEND_STRING(":");
                if (is_pretty) {
                    APPEND_STRING(" ");
                }
                temp_value = json_object_get_value_at(object, i);
                written = json_serialize_to_buffer_r(temp_value, buf, level+1, is_pretty, num_buf);
                if (written < 0) {
                    return -1;
                }
                if (buf != NULL) {
                    buf += written;
                }
                written_total += written;
                if (i < (count - 1)) {
                    APPEND_STRING(",");
                }
                if (is_pretty) {
                    APPEND_STRING("\n");
                }
            }
            if (count > 0 && is_pretty) {
                APPEND_INDENT(level);
            }
            APPEND_STRING("}");
            return written_total;
        case JSONString:
            string = json_value_get_string(value);
            if (string == NULL) {
                return -1;
            }
            len = json_value_get_string_len(value);
            written = json_serialize_string(string, len, buf);
            if (written < 0) {
                return -1;
            }
            if (buf != NULL) {
                buf += written;
            }
            written_total += written;
            return written_total;
        case JSONBoolean:
            if (json_value_get_boolean(value)) {
                APPEND_STRING("true");
            } else {
                APPEND_STRING("false");
            }
            return written_total;
        case JSONNumber:
            num = json_value_get_number(value);
            if (buf != NULL) {
                num_buf = buf; /* format the number straight into the output buffer */
            }
            written = sprintf(num_buf, FLOAT_FORMAT, num);
            if (written < 0) {
                return -1;
            }
            if (buf != NULL) {
                buf += written;
            }
            written_total += written;
            return written_total;
        case JSONNull:
            APPEND_STRING("null");
            return written_total;
        case JSONError:
            return -1;
        default:
            return -1;
    }
}

/* Serializes `string` (explicit len, so embedded NULs are preserved) as a
 * quoted, escaped JSON string into buf, or just counts when buf is NULL;
 * returns bytes written or -1. */
static int json_serialize_string(const char *string, size_t len, char *buf) {
    size_t i = 0;
    char c = '\0';
    int written = -1, written_total = 0;
    APPEND_STRING("\"");
    for (i = 0; i < len; i++) {
        c = string[i];
        switch (c) {
            case '\"': APPEND_STRING("\\\""); break;
            case '\\': APPEND_STRING("\\\\"); break;
            case '\b': APPEND_STRING("\\b"); break;
            case '\f': APPEND_STRING("\\f"); break;
            case '\n': APPEND_STRING("\\n"); break;
            case '\r': APPEND_STRING("\\r"); break;
            case '\t': APPEND_STRING("\\t"); break;
            case '\x00': APPEND_STRING("\\u0000"); break;
            case '\x01': APPEND_STRING("\\u0001"); break;
            case '\x02': APPEND_STRING("\\u0002"); break;
            case '\x03': APPEND_STRING("\\u0003"); break;
            case '\x04': APPEND_STRING("\\u0004"); break;
            case '\x05': APPEND_STRING("\\u0005"); break;
            case '\x06': APPEND_STRING("\\u0006"); break;
            case '\x07': APPEND_STRING("\\u0007"); break;
            /* '\x08' duplicate: '\b' */
            /* '\x09' duplicate: '\t' */
            /* '\x0a' duplicate: '\n' */
            case '\x0b': APPEND_STRING("\\u000b"); break;
            /* '\x0c' duplicate: '\f' */
            /* '\x0d' duplicate: '\r' */
            case '\x0e': APPEND_STRING("\\u000e"); break;
            case '\x0f': APPEND_STRING("\\u000f"); break;
            case '\x10': APPEND_STRING("\\u0010"); break;
            case '\x11': APPEND_STRING("\\u0011"); break;
            case '\x12': APPEND_STRING("\\u0012"); break;
            case '\x13': APPEND_STRING("\\u0013"); break;
            case '\x14': APPEND_STRING("\\u0014"); break;
            case '\x15': APPEND_STRING("\\u0015"); break;
            case '\x16': APPEND_STRING("\\u0016"); break;
            case '\x17': APPEND_STRING("\\u0017"); break;
            case '\x18': APPEND_STRING("\\u0018"); break;
            case '\x19': APPEND_STRING("\\u0019"); break;
            case '\x1a': APPEND_STRING("\\u001a"); break;
            case '\x1b':
APPEND_STRING("\\u001b"); break; case '\x1c': APPEND_STRING("\\u001c"); break; case '\x1d': APPEND_STRING("\\u001d"); break; case '\x1e': APPEND_STRING("\\u001e"); break; case '\x1f': APPEND_STRING("\\u001f"); break; case '/': if (parson_escape_slashes) { APPEND_STRING("\\/"); /* to make json embeddable in xml\/html */ } else { APPEND_STRING("/"); } break; default: if (buf != NULL) { buf[0] = c; buf += 1; } written_total += 1; break; } } APPEND_STRING("\""); return written_total; } static int append_indent(char *buf, int level) { int i; int written = -1, written_total = 0; for (i = 0; i < level; i++) { APPEND_STRING(" "); } return written_total; } static int append_string(char *buf, const char *string) { if (buf == NULL) { return (int)strlen(string); } return sprintf(buf, "%s", string); } #undef APPEND_STRING #undef APPEND_INDENT /* Parser API */ JSON_Value * json_parse_file(const char *filename) { char *file_contents = read_file(filename); JSON_Value *output_value = NULL; if (file_contents == NULL) { return NULL; } output_value = json_parse_string(file_contents); parson_free(file_contents); return output_value; } JSON_Value * json_parse_file_with_comments(const char *filename) { char *file_contents = read_file(filename); JSON_Value *output_value = NULL; if (file_contents == NULL) { return NULL; } output_value = json_parse_string_with_comments(file_contents); parson_free(file_contents); return output_value; } JSON_Value * json_parse_string(const char *string) { if (string == NULL) { return NULL; } if (string[0] == '\xEF' && string[1] == '\xBB' && string[2] == '\xBF') { string = string + 3; /* Support for UTF-8 BOM */ } return parse_value((const char**)&string, 0); } JSON_Value * json_parse_string_with_comments(const char *string) { JSON_Value *result = NULL; char *string_mutable_copy = NULL, *string_mutable_copy_ptr = NULL; string_mutable_copy = parson_strdup(string); if (string_mutable_copy == NULL) { return NULL; } remove_comments(string_mutable_copy, "/*", 
"*/");
    remove_comments(string_mutable_copy, "//", "\n");
    string_mutable_copy_ptr = string_mutable_copy;
    result = parse_value((const char**)&string_mutable_copy_ptr, 0);
    parson_free(string_mutable_copy);
    return result;
}

/* JSON Object API */

/* Looks up `name` as a literal key (no dot handling); returns NULL when
 * object/name is NULL or the key is absent. */
JSON_Value * json_object_get_value(const JSON_Object *object, const char *name) {
    if (object == NULL || name == NULL) {
        return NULL;
    }
    return json_object_getn_value(object, name, strlen(name));
}

const char * json_object_get_string(const JSON_Object *object, const char *name) {
    return json_value_get_string(json_object_get_value(object, name));
}

size_t json_object_get_string_len(const JSON_Object *object, const char *name) {
    return json_value_get_string_len(json_object_get_value(object, name));
}

double json_object_get_number(const JSON_Object *object, const char *name) {
    return json_value_get_number(json_object_get_value(object, name));
}

JSON_Object * json_object_get_object(const JSON_Object *object, const char *name) {
    return json_value_get_object(json_object_get_value(object, name));
}

JSON_Array * json_object_get_array(const JSON_Object *object, const char *name) {
    return json_value_get_array(json_object_get_value(object, name));
}

int json_object_get_boolean(const JSON_Object *object, const char *name) {
    return json_value_get_boolean(json_object_get_value(object, name));
}

/* Resolves dotted names ("a.b.c") by recursing through nested objects. */
JSON_Value * json_object_dotget_value(const JSON_Object *object, const char *name) {
    const char *dot_position = strchr(name, '.');
    if (!dot_position) {
        return json_object_get_value(object, name);
    }
    object = json_value_get_object(json_object_getn_value(object, name, dot_position - name));
    return json_object_dotget_value(object, dot_position + 1);
}

const char * json_object_dotget_string(const JSON_Object *object, const char *name) {
    return json_value_get_string(json_object_dotget_value(object, name));
}

size_t json_object_dotget_string_len(const JSON_Object *object, const char *name) {
    return json_value_get_string_len(json_object_dotget_value(object, name));
}

double json_object_dotget_number(const JSON_Object *object, const char *name) {
    return json_value_get_number(json_object_dotget_value(object, name));
}

JSON_Object * json_object_dotget_object(const JSON_Object *object, const char *name) {
    return json_value_get_object(json_object_dotget_value(object, name));
}

JSON_Array * json_object_dotget_array(const JSON_Object *object, const char *name) {
    return json_value_get_array(json_object_dotget_value(object, name));
}

int json_object_dotget_boolean(const JSON_Object *object, const char *name) {
    return json_value_get_boolean(json_object_dotget_value(object, name));
}

size_t json_object_get_count(const JSON_Object *object) {
    return object ? object->count : 0;
}

/* Returns the key at insertion index `index`, or NULL if out of range. */
const char * json_object_get_name(const JSON_Object *object, size_t index) {
    if (object == NULL || index >= json_object_get_count(object)) {
        return NULL;
    }
    return object->names[index];
}

/* Returns the value at insertion index `index`, or NULL if out of range. */
JSON_Value * json_object_get_value_at(const JSON_Object *object, size_t index) {
    if (object == NULL || index >= json_object_get_count(object)) {
        return NULL;
    }
    return object->values[index];
}

JSON_Value *json_object_get_wrapping_value(const JSON_Object *object) {
    if (!object) {
        return NULL;
    }
    return object->wrapping_value;
}

int json_object_has_value (const JSON_Object *object, const char *name) {
    return json_object_get_value(object, name) != NULL;
}

int json_object_has_value_of_type(const JSON_Object *object, const char *name, JSON_Value_Type type) {
    JSON_Value *val = json_object_get_value(object, name);
    return val != NULL && json_value_get_type(val) == type;
}

int json_object_dothas_value (const JSON_Object *object, const char *name) {
    return json_object_dotget_value(object, name) != NULL;
}

int json_object_dothas_value_of_type(const JSON_Object *object, const char *name, JSON_Value_Type type) {
    JSON_Value *val = json_object_dotget_value(object, name);
    return val != NULL && json_value_get_type(val) == type;
}

/* JSON Array API */

JSON_Value * json_array_get_value(const
JSON_Array *array, size_t index) {
    if (array == NULL || index >= json_array_get_count(array)) {
        return NULL;
    }
    return array->items[index];
}

const char * json_array_get_string(const JSON_Array *array, size_t index) {
    return json_value_get_string(json_array_get_value(array, index));
}

size_t json_array_get_string_len(const JSON_Array *array, size_t index) {
    return json_value_get_string_len(json_array_get_value(array, index));
}

double json_array_get_number(const JSON_Array *array, size_t index) {
    return json_value_get_number(json_array_get_value(array, index));
}

JSON_Object * json_array_get_object(const JSON_Array *array, size_t index) {
    return json_value_get_object(json_array_get_value(array, index));
}

JSON_Array * json_array_get_array(const JSON_Array *array, size_t index) {
    return json_value_get_array(json_array_get_value(array, index));
}

int json_array_get_boolean(const JSON_Array *array, size_t index) {
    return json_value_get_boolean(json_array_get_value(array, index));
}

size_t json_array_get_count(const JSON_Array *array) {
    return array ? array->count : 0;
}

JSON_Value * json_array_get_wrapping_value(const JSON_Array *array) {
    if (!array) {
        return NULL;
    }
    return array->wrapping_value;
}

/* JSON Value API */

JSON_Value_Type json_value_get_type(const JSON_Value *value) {
    return value ? value->type : JSONError;
}

JSON_Object * json_value_get_object(const JSON_Value *value) {
    return json_value_get_type(value) == JSONObject ? value->value.object : NULL;
}

JSON_Array * json_value_get_array(const JSON_Value *value) {
    return json_value_get_type(value) == JSONArray ? value->value.array : NULL;
}

/* Internal accessor for the string payload; NULL when value is not a string. */
static const JSON_String * json_value_get_string_desc(const JSON_Value *value) {
    return json_value_get_type(value) == JSONString ? &value->value.string : NULL;
}

const char * json_value_get_string(const JSON_Value *value) {
    const JSON_String *str = json_value_get_string_desc(value);
    return str ? str->chars : NULL;
}

size_t json_value_get_string_len(const JSON_Value *value) {
    const JSON_String *str = json_value_get_string_desc(value);
    return str ? str->length : 0;
}

double json_value_get_number(const JSON_Value *value) {
    return json_value_get_type(value) == JSONNumber ? value->value.number : 0;
}

int json_value_get_boolean(const JSON_Value *value) {
    return json_value_get_type(value) == JSONBoolean ? value->value.boolean : -1;
}

JSON_Value * json_value_get_parent (const JSON_Value *value) {
    return value ? value->parent : NULL;
}

/* Recursively frees a value and everything it owns; tolerates NULL
 * (get_type returns JSONError, so only parson_free(NULL) runs). */
void json_value_free(JSON_Value *value) {
    switch (json_value_get_type(value)) {
        case JSONObject:
            json_object_free(value->value.object);
            break;
        case JSONString:
            parson_free(value->value.string.chars);
            break;
        case JSONArray:
            json_array_free(value->value.array);
            break;
        default:
            break;
    }
    parson_free(value);
}

JSON_Value * json_value_init_object(void) {
    JSON_Value *new_value = (JSON_Value*)parson_malloc(sizeof(JSON_Value));
    if (!new_value) {
        return NULL;
    }
    new_value->parent = NULL;
    new_value->type = JSONObject;
    new_value->value.object = json_object_make(new_value);
    if (!new_value->value.object) {
        parson_free(new_value);
        return NULL;
    }
    return new_value;
}

JSON_Value * json_value_init_array(void) {
    JSON_Value *new_value = (JSON_Value*)parson_malloc(sizeof(JSON_Value));
    if (!new_value) {
        return NULL;
    }
    new_value->parent = NULL;
    new_value->type = JSONArray;
    new_value->value.array = json_array_make(new_value);
    if (!new_value->value.array) {
        parson_free(new_value);
        return NULL;
    }
    return new_value;
}

JSON_Value * json_value_init_string(const char *string) {
    if (string == NULL) {
        return NULL;
    }
    return json_value_init_string_with_len(string, strlen(string));
}

/* Copies `length` bytes of `string` into a new string value;
 * fails if the bytes are not valid UTF-8. */
JSON_Value * json_value_init_string_with_len(const char *string, size_t length) {
    char *copy = NULL;
    JSON_Value *value;
    if (string == NULL) {
        return NULL;
    }
    if (!is_valid_utf8(string, length)) {
        return NULL;
    }
    copy = parson_strndup(string, length);
    if (copy == NULL) {
        return NULL;
    }
    value =
json_value_init_string_no_copy(copy, length);
    if (value == NULL) {
        parson_free(copy); /* ownership did not transfer; avoid a leak */
    }
    return value;
}

JSON_Value * json_value_init_number(double number) {
    JSON_Value *new_value = NULL;
    if (IS_NUMBER_INVALID(number)) {
        return NULL;
    }
    new_value = (JSON_Value*)parson_malloc(sizeof(JSON_Value));
    if (new_value == NULL) {
        return NULL;
    }
    new_value->parent = NULL;
    new_value->type = JSONNumber;
    new_value->value.number = number;
    return new_value;
}

JSON_Value * json_value_init_boolean(int boolean) {
    JSON_Value *new_value = (JSON_Value*)parson_malloc(sizeof(JSON_Value));
    if (!new_value) {
        return NULL;
    }
    new_value->parent = NULL;
    new_value->type = JSONBoolean;
    new_value->value.boolean = boolean ? 1 : 0;
    return new_value;
}

JSON_Value * json_value_init_null(void) {
    JSON_Value *new_value = (JSON_Value*)parson_malloc(sizeof(JSON_Value));
    if (!new_value) {
        return NULL;
    }
    new_value->parent = NULL;
    new_value->type = JSONNull;
    return new_value;
}

/* Recursively deep-copies a value tree; returns a parentless copy, or NULL
 * on allocation failure (all partial work is freed). */
JSON_Value * json_value_deep_copy(const JSON_Value *value) {
    size_t i = 0;
    JSON_Value *return_value = NULL, *temp_value_copy = NULL, *temp_value = NULL;
    const JSON_String *temp_string = NULL;
    const char *temp_key = NULL;
    char *temp_string_copy = NULL;
    JSON_Array *temp_array = NULL, *temp_array_copy = NULL;
    JSON_Object *temp_object = NULL, *temp_object_copy = NULL;
    JSON_Status res = JSONFailure;
    char *key_copy = NULL;
    switch (json_value_get_type(value)) {
        case JSONArray:
            temp_array = json_value_get_array(value);
            return_value = json_value_init_array();
            if (return_value == NULL) {
                return NULL;
            }
            temp_array_copy = json_value_get_array(return_value);
            for (i = 0; i < json_array_get_count(temp_array); i++) {
                temp_value = json_array_get_value(temp_array, i);
                temp_value_copy = json_value_deep_copy(temp_value);
                if (temp_value_copy == NULL) {
                    json_value_free(return_value);
                    return NULL;
                }
                if (json_array_add(temp_array_copy, temp_value_copy) != JSONSuccess) {
                    json_value_free(return_value);
                    json_value_free(temp_value_copy);
                    return NULL;
                }
            }
            return return_value;
        case JSONObject:
            temp_object = json_value_get_object(value);
            return_value = json_value_init_object();
            if (!return_value) {
                return NULL;
            }
            temp_object_copy = json_value_get_object(return_value);
            for (i = 0; i < json_object_get_count(temp_object); i++) {
                temp_key = json_object_get_name(temp_object, i);
                temp_value = json_object_get_value(temp_object, temp_key);
                temp_value_copy = json_value_deep_copy(temp_value);
                if (!temp_value_copy) {
                    json_value_free(return_value);
                    return NULL;
                }
                key_copy = parson_strdup(temp_key);
                if (!key_copy) {
                    json_value_free(temp_value_copy);
                    json_value_free(return_value);
                    return NULL;
                }
                res = json_object_add(temp_object_copy, key_copy, temp_value_copy);
                if (res != JSONSuccess) {
                    parson_free(key_copy);
                    json_value_free(temp_value_copy);
                    json_value_free(return_value);
                    return NULL;
                }
            }
            return return_value;
        case JSONBoolean:
            return json_value_init_boolean(json_value_get_boolean(value));
        case JSONNumber:
            return json_value_init_number(json_value_get_number(value));
        case JSONString:
            temp_string = json_value_get_string_desc(value);
            if (temp_string == NULL) {
                return NULL;
            }
            temp_string_copy = parson_strndup(temp_string->chars, temp_string->length);
            if (temp_string_copy == NULL) {
                return NULL;
            }
            return_value = json_value_init_string_no_copy(temp_string_copy, temp_string->length);
            if (return_value == NULL) {
                parson_free(temp_string_copy);
            }
            return return_value;
        case JSONNull:
            return json_value_init_null();
        case JSONError:
            return NULL;
        default:
            return NULL;
    }
}

/* Bytes needed to serialize `value` compactly, including the trailing NUL;
 * 0 on error. */
size_t json_serialization_size(const JSON_Value *value) {
    char num_buf[NUM_BUF_SIZE]; /* recursively allocating buffer on stack is a bad idea, so let's do it only once */
    int res = json_serialize_to_buffer_r(value, NULL, 0, PARSON_FALSE, num_buf);
    return res < 0 ?
0 : (size_t)(res) + 1;
}

/* Compact serialization into a caller-provided buffer; fails unless the
 * buffer can hold the full output (per json_serialization_size). */
JSON_Status json_serialize_to_buffer(const JSON_Value *value, char *buf, size_t buf_size_in_bytes) {
    int written = -1;
    size_t needed_size_in_bytes = json_serialization_size(value);
    if (needed_size_in_bytes == 0 || buf_size_in_bytes < needed_size_in_bytes) {
        return JSONFailure;
    }
    written = json_serialize_to_buffer_r(value, buf, 0, PARSON_FALSE, NULL);
    if (written < 0) {
        return JSONFailure;
    }
    return JSONSuccess;
}

/* Serializes compactly and writes the text to `filename` (truncating);
 * write and close errors both turn the result into JSONFailure. */
JSON_Status json_serialize_to_file(const JSON_Value *value, const char *filename) {
    JSON_Status return_code = JSONSuccess;
    FILE *fp = NULL;
    char *serialized_string = json_serialize_to_string(value);
    if (serialized_string == NULL) {
        return JSONFailure;
    }
    fp = fopen(filename, "w");
    if (fp == NULL) {
        json_free_serialized_string(serialized_string);
        return JSONFailure;
    }
    if (fputs(serialized_string, fp) == EOF) {
        return_code = JSONFailure;
    }
    if (fclose(fp) == EOF) {
        return_code = JSONFailure;
    }
    json_free_serialized_string(serialized_string);
    return return_code;
}

/* Heap-allocates and returns the compact serialization; the caller frees it
 * with json_free_serialized_string(). */
char * json_serialize_to_string(const JSON_Value *value) {
    JSON_Status serialization_result = JSONFailure;
    size_t buf_size_bytes = json_serialization_size(value);
    char *buf = NULL;
    if (buf_size_bytes == 0) {
        return NULL;
    }
    buf = (char*)parson_malloc(buf_size_bytes);
    if (buf == NULL) {
        return NULL;
    }
    serialization_result = json_serialize_to_buffer(value, buf, buf_size_bytes);
    if (serialization_result != JSONSuccess) {
        json_free_serialized_string(buf);
        return NULL;
    }
    return buf;
}

/* Bytes needed for the pretty-printed serialization, including the NUL. */
size_t json_serialization_size_pretty(const JSON_Value *value) {
    char num_buf[NUM_BUF_SIZE]; /* recursively allocating buffer on stack is a bad idea, so let's do it only once */
    int res = json_serialize_to_buffer_r(value, NULL, 0, PARSON_TRUE, num_buf);
    return res < 0 ? 0 : (size_t)(res) + 1;
}

/* Pretty-printed counterpart of json_serialize_to_buffer. */
JSON_Status json_serialize_to_buffer_pretty(const JSON_Value *value, char *buf, size_t buf_size_in_bytes) {
    int written = -1;
    size_t needed_size_in_bytes = json_serialization_size_pretty(value);
    if (needed_size_in_bytes == 0 || buf_size_in_bytes < needed_size_in_bytes) {
        return JSONFailure;
    }
    written = json_serialize_to_buffer_r(value, buf, 0, PARSON_TRUE, NULL);
    if (written < 0) {
        return JSONFailure;
    }
    return JSONSuccess;
}

/* Pretty-printed counterpart of json_serialize_to_file. */
JSON_Status json_serialize_to_file_pretty(const JSON_Value *value, const char *filename) {
    JSON_Status return_code = JSONSuccess;
    FILE *fp = NULL;
    char *serialized_string = json_serialize_to_string_pretty(value);
    if (serialized_string == NULL) {
        return JSONFailure;
    }
    fp = fopen(filename, "w");
    if (fp == NULL) {
        json_free_serialized_string(serialized_string);
        return JSONFailure;
    }
    if (fputs(serialized_string, fp) == EOF) {
        return_code = JSONFailure;
    }
    if (fclose(fp) == EOF) {
        return_code = JSONFailure;
    }
    json_free_serialized_string(serialized_string);
    return return_code;
}

/* Pretty-printed counterpart of json_serialize_to_string. */
char * json_serialize_to_string_pretty(const JSON_Value *value) {
    JSON_Status serialization_result = JSONFailure;
    size_t buf_size_bytes = json_serialization_size_pretty(value);
    char *buf = NULL;
    if (buf_size_bytes == 0) {
        return NULL;
    }
    buf = (char*)parson_malloc(buf_size_bytes);
    if (buf == NULL) {
        return NULL;
    }
    serialization_result = json_serialize_to_buffer_pretty(value, buf, buf_size_bytes);
    if (serialization_result != JSONSuccess) {
        json_free_serialized_string(buf);
        return NULL;
    }
    return buf;
}

void json_free_serialized_string(char *string) {
    parson_free(string);
}

/* Frees element `ix` and shifts the tail of the array left by one. */
JSON_Status json_array_remove(JSON_Array *array, size_t ix) {
    size_t to_move_bytes = 0;
    if (array == NULL || ix >= json_array_get_count(array)) {
        return JSONFailure;
    }
    json_value_free(json_array_get_value(array, ix));
    to_move_bytes = (json_array_get_count(array) - 1 - ix) * sizeof(JSON_Value*);
    memmove(array->items + ix, array->items + ix + 1, to_move_bytes);
    array->count -= 1;
    return
JSONSuccess;
}

/* Frees the old element at ix and installs `value` (which must be
 * parentless); ownership transfers on success. */
JSON_Status json_array_replace_value(JSON_Array *array, size_t ix, JSON_Value *value) {
    if (array == NULL || value == NULL || value->parent != NULL || ix >= json_array_get_count(array)) {
        return JSONFailure;
    }
    json_value_free(json_array_get_value(array, ix));
    value->parent = json_array_get_wrapping_value(array);
    array->items[ix] = value;
    return JSONSuccess;
}

JSON_Status json_array_replace_string(JSON_Array *array, size_t i, const char* string) {
    JSON_Value *value = json_value_init_string(string);
    if (value == NULL) {
        return JSONFailure;
    }
    if (json_array_replace_value(array, i, value) != JSONSuccess) {
        json_value_free(value);
        return JSONFailure;
    }
    return JSONSuccess;
}

JSON_Status json_array_replace_string_with_len(JSON_Array *array, size_t i, const char *string, size_t len) {
    JSON_Value *value = json_value_init_string_with_len(string, len);
    if (value == NULL) {
        return JSONFailure;
    }
    if (json_array_replace_value(array, i, value) != JSONSuccess) {
        json_value_free(value);
        return JSONFailure;
    }
    return JSONSuccess;
}

JSON_Status json_array_replace_number(JSON_Array *array, size_t i, double number) {
    JSON_Value *value = json_value_init_number(number);
    if (value == NULL) {
        return JSONFailure;
    }
    if (json_array_replace_value(array, i, value) != JSONSuccess) {
        json_value_free(value);
        return JSONFailure;
    }
    return JSONSuccess;
}

JSON_Status json_array_replace_boolean(JSON_Array *array, size_t i, int boolean) {
    JSON_Value *value = json_value_init_boolean(boolean);
    if (value == NULL) {
        return JSONFailure;
    }
    if (json_array_replace_value(array, i, value) != JSONSuccess) {
        json_value_free(value);
        return JSONFailure;
    }
    return JSONSuccess;
}

JSON_Status json_array_replace_null(JSON_Array *array, size_t i) {
    JSON_Value *value = json_value_init_null();
    if (value == NULL) {
        return JSONFailure;
    }
    if (json_array_replace_value(array, i, value) != JSONSuccess) {
        json_value_free(value);
        return JSONFailure;
    }
    return JSONSuccess;
}

/* Frees all elements but keeps the allocated buffer; resets count to 0. */
JSON_Status json_array_clear(JSON_Array *array) {
    size_t i = 0;
    if (array == NULL) {
        return JSONFailure;
    }
    for (i = 0; i < json_array_get_count(array); i++) {
        json_value_free(json_array_get_value(array, i));
    }
    array->count = 0;
    return JSONSuccess;
}

/* Appends `value` (must be parentless); ownership transfers on success. */
JSON_Status json_array_append_value(JSON_Array *array, JSON_Value *value) {
    if (array == NULL || value == NULL || value->parent != NULL) {
        return JSONFailure;
    }
    return json_array_add(array, value);
}

JSON_Status json_array_append_string(JSON_Array *array, const char *string) {
    JSON_Value *value = json_value_init_string(string);
    if (value == NULL) {
        return JSONFailure;
    }
    if (json_array_append_value(array, value) != JSONSuccess) {
        json_value_free(value);
        return JSONFailure;
    }
    return JSONSuccess;
}

JSON_Status json_array_append_string_with_len(JSON_Array *array, const char *string, size_t len) {
    JSON_Value *value = json_value_init_string_with_len(string, len);
    if (value == NULL) {
        return JSONFailure;
    }
    if (json_array_append_value(array, value) != JSONSuccess) {
        json_value_free(value);
        return JSONFailure;
    }
    return JSONSuccess;
}

JSON_Status json_array_append_number(JSON_Array *array, double number) {
    JSON_Value *value = json_value_init_number(number);
    if (value == NULL) {
        return JSONFailure;
    }
    if (json_array_append_value(array, value) != JSONSuccess) {
        json_value_free(value);
        return JSONFailure;
    }
    return JSONSuccess;
}

JSON_Status json_array_append_boolean(JSON_Array *array, int boolean) {
    JSON_Value *value = json_value_init_boolean(boolean);
    if (value == NULL) {
        return JSONFailure;
    }
    if (json_array_append_value(array, value) != JSONSuccess) {
        json_value_free(value);
        return JSONFailure;
    }
    return JSONSuccess;
}

JSON_Status json_array_append_null(JSON_Array *array) {
    JSON_Value *value = json_value_init_null();
    if (value == NULL) {
        return JSONFailure;
    }
    if (json_array_append_value(array, value) != JSONSuccess) {
        json_value_free(value);
        return JSONFailure;
    }
    return JSONSuccess;
}

/* Inserts or replaces `name` -> `value` in the object's hash table;
 * takes ownership of `value` on success. */
JSON_Status json_object_set_value(JSON_Object *object, const char *name, JSON_Value
*value) { unsigned long hash = 0; parson_bool_t found = PARSON_FALSE; size_t cell_ix = 0; size_t item_ix = 0; JSON_Value *old_value = NULL; char *key_copy = NULL; if (!object || !name || !value || value->parent) { return JSONFailure; } hash = hash_string(name, strlen(name)); found = PARSON_FALSE; cell_ix = json_object_get_cell_ix(object, name, strlen(name), hash, &found); if (found) { item_ix = object->cells[cell_ix]; old_value = object->values[item_ix]; json_value_free(old_value); object->values[item_ix] = value; value->parent = json_object_get_wrapping_value(object); return JSONSuccess; } if (object->count >= object->item_capacity) { JSON_Status res = json_object_grow_and_rehash(object); if (res != JSONSuccess) { return JSONFailure; } cell_ix = json_object_get_cell_ix(object, name, strlen(name), hash, &found); } key_copy = parson_strdup(name); if (!key_copy) { return JSONFailure; } object->names[object->count] = key_copy; object->cells[cell_ix] = object->count; object->values[object->count] = value; object->cell_ixs[object->count] = cell_ix; object->hashes[object->count] = hash; object->count++; value->parent = json_object_get_wrapping_value(object); return JSONSuccess; } JSON_Status json_object_set_string(JSON_Object *object, const char *name, const char *string) { JSON_Value *value = json_value_init_string(string); JSON_Status status = json_object_set_value(object, name, value); if (status != JSONSuccess) { json_value_free(value); } return status; } JSON_Status json_object_set_string_with_len(JSON_Object *object, const char *name, const char *string, size_t len) { JSON_Value *value = json_value_init_string_with_len(string, len); JSON_Status status = json_object_set_value(object, name, value); if (status != JSONSuccess) { json_value_free(value); } return status; } JSON_Status json_object_set_number(JSON_Object *object, const char *name, double number) { JSON_Value *value = json_value_init_number(number); JSON_Status status = json_object_set_value(object, name, 
value); if (status != JSONSuccess) { json_value_free(value); } return status; } JSON_Status json_object_set_boolean(JSON_Object *object, const char *name, int boolean) { JSON_Value *value = json_value_init_boolean(boolean); JSON_Status status = json_object_set_value(object, name, value); if (status != JSONSuccess) { json_value_free(value); } return status; } JSON_Status json_object_set_null(JSON_Object *object, const char *name) { JSON_Value *value = json_value_init_null(); JSON_Status status = json_object_set_value(object, name, value); if (status != JSONSuccess) { json_value_free(value); } return status; } JSON_Status json_object_dotset_value(JSON_Object *object, const char *name, JSON_Value *value) { const char *dot_pos = NULL; JSON_Value *temp_value = NULL, *new_value = NULL; JSON_Object *temp_object = NULL, *new_object = NULL; JSON_Status status = JSONFailure; size_t name_len = 0; char *name_copy = NULL; if (object == NULL || name == NULL || value == NULL) { return JSONFailure; } dot_pos = strchr(name, '.'); if (dot_pos == NULL) { return json_object_set_value(object, name, value); } name_len = dot_pos - name; temp_value = json_object_getn_value(object, name, name_len); if (temp_value) { /* Don't overwrite existing non-object (unlike json_object_set_value, but it shouldn't be changed at this point) */ if (json_value_get_type(temp_value) != JSONObject) { return JSONFailure; } temp_object = json_value_get_object(temp_value); return json_object_dotset_value(temp_object, dot_pos + 1, value); } new_value = json_value_init_object(); if (new_value == NULL) { return JSONFailure; } new_object = json_value_get_object(new_value); status = json_object_dotset_value(new_object, dot_pos + 1, value); if (status != JSONSuccess) { json_value_free(new_value); return JSONFailure; } name_copy = parson_strndup(name, name_len); if (!name_copy) { json_object_dotremove_internal(new_object, dot_pos + 1, 0); json_value_free(new_value); return JSONFailure; } status = 
json_object_add(object, name_copy, new_value); if (status != JSONSuccess) { parson_free(name_copy); json_object_dotremove_internal(new_object, dot_pos + 1, 0); json_value_free(new_value); return JSONFailure; } return JSONSuccess; } JSON_Status json_object_dotset_string(JSON_Object *object, const char *name, const char *string) { JSON_Value *value = json_value_init_string(string); if (value == NULL) { return JSONFailure; } if (json_object_dotset_value(object, name, value) != JSONSuccess) { json_value_free(value); return JSONFailure; } return JSONSuccess; } JSON_Status json_object_dotset_string_with_len(JSON_Object *object, const char *name, const char *string, size_t len) { JSON_Value *value = json_value_init_string_with_len(string, len); if (value == NULL) { return JSONFailure; } if (json_object_dotset_value(object, name, value) != JSONSuccess) { json_value_free(value); return JSONFailure; } return JSONSuccess; } JSON_Status json_object_dotset_number(JSON_Object *object, const char *name, double number) { JSON_Value *value = json_value_init_number(number); if (value == NULL) { return JSONFailure; } if (json_object_dotset_value(object, name, value) != JSONSuccess) { json_value_free(value); return JSONFailure; } return JSONSuccess; } JSON_Status json_object_dotset_boolean(JSON_Object *object, const char *name, int boolean) { JSON_Value *value = json_value_init_boolean(boolean); if (value == NULL) { return JSONFailure; } if (json_object_dotset_value(object, name, value) != JSONSuccess) { json_value_free(value); return JSONFailure; } return JSONSuccess; } JSON_Status json_object_dotset_null(JSON_Object *object, const char *name) { JSON_Value *value = json_value_init_null(); if (value == NULL) { return JSONFailure; } if (json_object_dotset_value(object, name, value) != JSONSuccess) { json_value_free(value); return JSONFailure; } return JSONSuccess; } JSON_Status json_object_remove(JSON_Object *object, const char *name) { return json_object_remove_internal(object, name, 
PARSON_TRUE); } JSON_Status json_object_dotremove(JSON_Object *object, const char *name) { return json_object_dotremove_internal(object, name, PARSON_TRUE); } JSON_Status json_object_clear(JSON_Object *object) { size_t i = 0; if (object == NULL) { return JSONFailure; } for (i = 0; i < json_object_get_count(object); i++) { parson_free(object->names[i]); json_value_free(object->values[i]); } object->count = 0; return JSONSuccess; } JSON_Status json_validate(const JSON_Value *schema, const JSON_Value *value) { JSON_Value *temp_schema_value = NULL, *temp_value = NULL; JSON_Array *schema_array = NULL, *value_array = NULL; JSON_Object *schema_object = NULL, *value_object = NULL; JSON_Value_Type schema_type = JSONError, value_type = JSONError; const char *key = NULL; size_t i = 0, count = 0; if (schema == NULL || value == NULL) { return JSONFailure; } schema_type = json_value_get_type(schema); value_type = json_value_get_type(value); if (schema_type != value_type && schema_type != JSONNull) { /* null represents all values */ return JSONFailure; } switch (schema_type) { case JSONArray: schema_array = json_value_get_array(schema); value_array = json_value_get_array(value); count = json_array_get_count(schema_array); if (count == 0) { return JSONSuccess; /* Empty array allows all types */ } /* Get first value from array, rest is ignored */ temp_schema_value = json_array_get_value(schema_array, 0); for (i = 0; i < json_array_get_count(value_array); i++) { temp_value = json_array_get_value(value_array, i); if (json_validate(temp_schema_value, temp_value) != JSONSuccess) { return JSONFailure; } } return JSONSuccess; case JSONObject: schema_object = json_value_get_object(schema); value_object = json_value_get_object(value); count = json_object_get_count(schema_object); if (count == 0) { return JSONSuccess; /* Empty object allows all objects */ } else if (json_object_get_count(value_object) < count) { return JSONFailure; /* Tested object mustn't have less name-value pairs than 
schema */ } for (i = 0; i < count; i++) { key = json_object_get_name(schema_object, i); temp_schema_value = json_object_get_value(schema_object, key); temp_value = json_object_get_value(value_object, key); if (temp_value == NULL) { return JSONFailure; } if (json_validate(temp_schema_value, temp_value) != JSONSuccess) { return JSONFailure; } } return JSONSuccess; case JSONString: case JSONNumber: case JSONBoolean: case JSONNull: return JSONSuccess; /* equality already tested before switch */ case JSONError: default: return JSONFailure; } } int json_value_equals(const JSON_Value *a, const JSON_Value *b) { JSON_Object *a_object = NULL, *b_object = NULL; JSON_Array *a_array = NULL, *b_array = NULL; const JSON_String *a_string = NULL, *b_string = NULL; const char *key = NULL; size_t a_count = 0, b_count = 0, i = 0; JSON_Value_Type a_type, b_type; a_type = json_value_get_type(a); b_type = json_value_get_type(b); if (a_type != b_type) { return PARSON_FALSE; } switch (a_type) { case JSONArray: a_array = json_value_get_array(a); b_array = json_value_get_array(b); a_count = json_array_get_count(a_array); b_count = json_array_get_count(b_array); if (a_count != b_count) { return PARSON_FALSE; } for (i = 0; i < a_count; i++) { if (!json_value_equals(json_array_get_value(a_array, i), json_array_get_value(b_array, i))) { return PARSON_FALSE; } } return PARSON_TRUE; case JSONObject: a_object = json_value_get_object(a); b_object = json_value_get_object(b); a_count = json_object_get_count(a_object); b_count = json_object_get_count(b_object); if (a_count != b_count) { return PARSON_FALSE; } for (i = 0; i < a_count; i++) { key = json_object_get_name(a_object, i); if (!json_value_equals(json_object_get_value(a_object, key), json_object_get_value(b_object, key))) { return PARSON_FALSE; } } return PARSON_TRUE; case JSONString: a_string = json_value_get_string_desc(a); b_string = json_value_get_string_desc(b); if (a_string == NULL || b_string == NULL) { return PARSON_FALSE; /* shouldn't 
happen */ } return a_string->length == b_string->length && memcmp(a_string->chars, b_string->chars, a_string->length) == 0; case JSONBoolean: return json_value_get_boolean(a) == json_value_get_boolean(b); case JSONNumber: return fabs(json_value_get_number(a) - json_value_get_number(b)) < 0.000001; /* EPSILON */ case JSONError: return PARSON_TRUE; case JSONNull: return PARSON_TRUE; default: return PARSON_TRUE; } } JSON_Value_Type json_type(const JSON_Value *value) { return json_value_get_type(value); } JSON_Object * json_object (const JSON_Value *value) { return json_value_get_object(value); } JSON_Array * json_array(const JSON_Value *value) { return json_value_get_array(value); } const char * json_string(const JSON_Value *value) { return json_value_get_string(value); } size_t json_string_len(const JSON_Value *value) { return json_value_get_string_len(value); } double json_number(const JSON_Value *value) { return json_value_get_number(value); } int json_boolean(const JSON_Value *value) { return json_value_get_boolean(value); } void json_set_allocation_functions(JSON_Malloc_Function malloc_fun, JSON_Free_Function free_fun) { parson_malloc = malloc_fun; parson_free = free_fun; } void json_set_escape_slashes(int escape_slashes) { parson_escape_slashes = escape_slashes; } pg_auto_failover-1.6.3/src/bin/lib/parson/parson.h000066400000000000000000000321331414244367200221410ustar00rootroot00000000000000/* SPDX-License-Identifier: MIT Parson 1.2.1 ( http://kgabis.github.com/parson/ ) Copyright (c) 2012 - 2021 Krzysztof Gabis Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright 
notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef parson_parson_h #define parson_parson_h #ifdef __cplusplus extern "C" { #endif #define PARSON_VERSION_MAJOR 1 #define PARSON_VERSION_MINOR 2 #define PARSON_VERSION_PATCH 1 #define PARSON_VERSION_STRING "1.2.1" #include /* size_t */ /* Types and enums */ typedef struct json_object_t JSON_Object; typedef struct json_array_t JSON_Array; typedef struct json_value_t JSON_Value; enum json_value_type { JSONError = -1, JSONNull = 1, JSONString = 2, JSONNumber = 3, JSONObject = 4, JSONArray = 5, JSONBoolean = 6 }; typedef int JSON_Value_Type; enum json_result_t { JSONSuccess = 0, JSONFailure = -1 }; typedef int JSON_Status; typedef void * (*JSON_Malloc_Function)(size_t); typedef void (*JSON_Free_Function)(void *); /* Call only once, before calling any other function from parson API. If not called, malloc and free from stdlib will be used for all allocations */ void json_set_allocation_functions(JSON_Malloc_Function malloc_fun, JSON_Free_Function free_fun); /* Sets if slashes should be escaped or not when serializing JSON. By default slashes are escaped. This function sets a global setting and is not thread safe. 
*/ void json_set_escape_slashes(int escape_slashes); /* Parses first JSON value in a file, returns NULL in case of error */ JSON_Value * json_parse_file(const char *filename); /* Parses first JSON value in a file and ignores comments (/ * * / and //), returns NULL in case of error */ JSON_Value * json_parse_file_with_comments(const char *filename); /* Parses first JSON value in a string, returns NULL in case of error */ JSON_Value * json_parse_string(const char *string); /* Parses first JSON value in a string and ignores comments (/ * * / and //), returns NULL in case of error */ JSON_Value * json_parse_string_with_comments(const char *string); /* Serialization */ size_t json_serialization_size(const JSON_Value *value); /* returns 0 on fail */ JSON_Status json_serialize_to_buffer(const JSON_Value *value, char *buf, size_t buf_size_in_bytes); JSON_Status json_serialize_to_file(const JSON_Value *value, const char *filename); char * json_serialize_to_string(const JSON_Value *value); /* Pretty serialization */ size_t json_serialization_size_pretty(const JSON_Value *value); /* returns 0 on fail */ JSON_Status json_serialize_to_buffer_pretty(const JSON_Value *value, char *buf, size_t buf_size_in_bytes); JSON_Status json_serialize_to_file_pretty(const JSON_Value *value, const char *filename); char * json_serialize_to_string_pretty(const JSON_Value *value); void json_free_serialized_string(char *string); /* frees string from json_serialize_to_string and json_serialize_to_string_pretty */ /* Comparing */ int json_value_equals(const JSON_Value *a, const JSON_Value *b); /* Validation This is *NOT* JSON Schema. It validates json by checking if object have identically named fields with matching types. For example schema {"name":"", "age":0} will validate {"name":"Joe", "age":25} and {"name":"Joe", "age":25, "gender":"m"}, but not {"name":"Joe"} or {"name":"Joe", "age":"Cucumber"}. In case of arrays, only first value in schema is checked against all values in tested array. 
Empty objects ({}) validate all objects, empty arrays ([]) validate all arrays, null validates values of every type. */ JSON_Status json_validate(const JSON_Value *schema, const JSON_Value *value); /* * JSON Object */ JSON_Value * json_object_get_value (const JSON_Object *object, const char *name); const char * json_object_get_string (const JSON_Object *object, const char *name); size_t json_object_get_string_len(const JSON_Object *object, const char *name); /* doesn't account for last null character */ JSON_Object * json_object_get_object (const JSON_Object *object, const char *name); JSON_Array * json_object_get_array (const JSON_Object *object, const char *name); double json_object_get_number (const JSON_Object *object, const char *name); /* returns 0 on fail */ int json_object_get_boolean(const JSON_Object *object, const char *name); /* returns -1 on fail */ /* dotget functions enable addressing values with dot notation in nested objects, just like in structs or c++/java/c# objects (e.g. objectA.objectB.value). Because valid names in JSON can contain dots, some values may be inaccessible this way. 
*/ JSON_Value * json_object_dotget_value (const JSON_Object *object, const char *name); const char * json_object_dotget_string (const JSON_Object *object, const char *name); size_t json_object_dotget_string_len(const JSON_Object *object, const char *name); /* doesn't account for last null character */ JSON_Object * json_object_dotget_object (const JSON_Object *object, const char *name); JSON_Array * json_object_dotget_array (const JSON_Object *object, const char *name); double json_object_dotget_number (const JSON_Object *object, const char *name); /* returns 0 on fail */ int json_object_dotget_boolean(const JSON_Object *object, const char *name); /* returns -1 on fail */ /* Functions to get available names */ size_t json_object_get_count (const JSON_Object *object); const char * json_object_get_name (const JSON_Object *object, size_t index); JSON_Value * json_object_get_value_at(const JSON_Object *object, size_t index); JSON_Value * json_object_get_wrapping_value(const JSON_Object *object); /* Functions to check if object has a value with a specific name. Returned value is 1 if object has * a value and 0 if it doesn't. dothas functions behave exactly like dotget functions. */ int json_object_has_value (const JSON_Object *object, const char *name); int json_object_has_value_of_type(const JSON_Object *object, const char *name, JSON_Value_Type type); int json_object_dothas_value (const JSON_Object *object, const char *name); int json_object_dothas_value_of_type(const JSON_Object *object, const char *name, JSON_Value_Type type); /* Creates new name-value pair or frees and replaces old value with a new one. * json_object_set_value does not copy passed value so it shouldn't be freed afterwards. 
*/ JSON_Status json_object_set_value(JSON_Object *object, const char *name, JSON_Value *value); JSON_Status json_object_set_string(JSON_Object *object, const char *name, const char *string); JSON_Status json_object_set_string_with_len(JSON_Object *object, const char *name, const char *string, size_t len); /* length shouldn't include last null character */ JSON_Status json_object_set_number(JSON_Object *object, const char *name, double number); JSON_Status json_object_set_boolean(JSON_Object *object, const char *name, int boolean); JSON_Status json_object_set_null(JSON_Object *object, const char *name); /* Works like dotget functions, but creates whole hierarchy if necessary. * json_object_dotset_value does not copy passed value so it shouldn't be freed afterwards. */ JSON_Status json_object_dotset_value(JSON_Object *object, const char *name, JSON_Value *value); JSON_Status json_object_dotset_string(JSON_Object *object, const char *name, const char *string); JSON_Status json_object_dotset_string_with_len(JSON_Object *object, const char *name, const char *string, size_t len); /* length shouldn't include last null character */ JSON_Status json_object_dotset_number(JSON_Object *object, const char *name, double number); JSON_Status json_object_dotset_boolean(JSON_Object *object, const char *name, int boolean); JSON_Status json_object_dotset_null(JSON_Object *object, const char *name); /* Frees and removes name-value pair */ JSON_Status json_object_remove(JSON_Object *object, const char *name); /* Works like dotget function, but removes name-value pair only on exact match. 
*/ JSON_Status json_object_dotremove(JSON_Object *object, const char *key); /* Removes all name-value pairs in object */ JSON_Status json_object_clear(JSON_Object *object); /* *JSON Array */ JSON_Value * json_array_get_value (const JSON_Array *array, size_t index); const char * json_array_get_string (const JSON_Array *array, size_t index); size_t json_array_get_string_len(const JSON_Array *array, size_t index); /* doesn't account for last null character */ JSON_Object * json_array_get_object (const JSON_Array *array, size_t index); JSON_Array * json_array_get_array (const JSON_Array *array, size_t index); double json_array_get_number (const JSON_Array *array, size_t index); /* returns 0 on fail */ int json_array_get_boolean(const JSON_Array *array, size_t index); /* returns -1 on fail */ size_t json_array_get_count (const JSON_Array *array); JSON_Value * json_array_get_wrapping_value(const JSON_Array *array); /* Frees and removes value at given index, does nothing and returns JSONFailure if index doesn't exist. * Order of values in array may change during execution. */ JSON_Status json_array_remove(JSON_Array *array, size_t i); /* Frees and removes from array value at given index and replaces it with given one. * Does nothing and returns JSONFailure if index doesn't exist. * json_array_replace_value does not copy passed value so it shouldn't be freed afterwards. 
*/ JSON_Status json_array_replace_value(JSON_Array *array, size_t i, JSON_Value *value); JSON_Status json_array_replace_string(JSON_Array *array, size_t i, const char* string); JSON_Status json_array_replace_string_with_len(JSON_Array *array, size_t i, const char *string, size_t len); /* length shouldn't include last null character */ JSON_Status json_array_replace_number(JSON_Array *array, size_t i, double number); JSON_Status json_array_replace_boolean(JSON_Array *array, size_t i, int boolean); JSON_Status json_array_replace_null(JSON_Array *array, size_t i); /* Frees and removes all values from array */ JSON_Status json_array_clear(JSON_Array *array); /* Appends new value at the end of array. * json_array_append_value does not copy passed value so it shouldn't be freed afterwards. */ JSON_Status json_array_append_value(JSON_Array *array, JSON_Value *value); JSON_Status json_array_append_string(JSON_Array *array, const char *string); JSON_Status json_array_append_string_with_len(JSON_Array *array, const char *string, size_t len); /* length shouldn't include last null character */ JSON_Status json_array_append_number(JSON_Array *array, double number); JSON_Status json_array_append_boolean(JSON_Array *array, int boolean); JSON_Status json_array_append_null(JSON_Array *array); /* *JSON Value */ JSON_Value * json_value_init_object (void); JSON_Value * json_value_init_array (void); JSON_Value * json_value_init_string (const char *string); /* copies passed string */ JSON_Value * json_value_init_string_with_len(const char *string, size_t length); /* copies passed string, length shouldn't include last null character */ JSON_Value * json_value_init_number (double number); JSON_Value * json_value_init_boolean(int boolean); JSON_Value * json_value_init_null (void); JSON_Value * json_value_deep_copy (const JSON_Value *value); void json_value_free (JSON_Value *value); JSON_Value_Type json_value_get_type (const JSON_Value *value); JSON_Object * json_value_get_object (const 
JSON_Value *value); JSON_Array * json_value_get_array (const JSON_Value *value); const char * json_value_get_string (const JSON_Value *value); size_t json_value_get_string_len(const JSON_Value *value); /* doesn't account for last null character */ double json_value_get_number (const JSON_Value *value); int json_value_get_boolean(const JSON_Value *value); JSON_Value * json_value_get_parent (const JSON_Value *value); /* Same as above, but shorter */ JSON_Value_Type json_type (const JSON_Value *value); JSON_Object * json_object (const JSON_Value *value); JSON_Array * json_array (const JSON_Value *value); const char * json_string (const JSON_Value *value); size_t json_string_len(const JSON_Value *value); /* doesn't account for last null character */ double json_number (const JSON_Value *value); int json_boolean(const JSON_Value *value); #ifdef __cplusplus } #endif #endif pg_auto_failover-1.6.3/src/bin/lib/parson/tests.c000066400000000000000000001043121414244367200217730ustar00rootroot00000000000000/* SPDX-License-Identifier: MIT Parson ( http://kgabis.github.com/parson/ ) Copyright (c) 2012 - 2021 Krzysztof Gabis Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifdef _MSC_VER #define _CRT_SECURE_NO_WARNINGS #endif #include "parson.h" #include #include #include #include #include #define TEST(A) do {\ if (A) {\ g_tests_passed++;\ } else {\ printf("%d %-72s - FAILED\n", __LINE__, #A);\ g_tests_failed++;\ }\ } while(0) #define STREQ(A, B) ((A) && (B) ? strcmp((A), (B)) == 0 : 0) #define DBL_EPSILON 2.2204460492503131e-16 #define DBL_EQ(a, b) (fabs((a) - (b)) < DBL_EPSILON) void test_suite_1(void); /* Test 3 files from json.org + serialization*/ void test_suite_2(JSON_Value *value); /* Test correctness of parsed values */ void test_suite_2_no_comments(void); void test_suite_2_with_comments(void); void test_suite_3(void); /* Test parsing valid and invalid strings */ void test_suite_4(void); /* Test deep copy function */ void test_suite_5(void); /* Test building json values from scratch */ void test_suite_6(void); /* Test value comparing verification */ void test_suite_7(void); /* Test schema validation */ void test_suite_8(void); /* Test serialization */ void test_suite_9(void); /* Test serialization (pretty) */ void test_suite_10(void); /* Testing for memory leaks */ void test_suite_11(void); /* Additional things that require testing */ void test_memory_leaks(void); void test_failing_allocations(void); void print_commits_info(const char *username, const char *repo); void persistence_example(void); void serialization_example(void); static const char *g_tests_path = "tests"; static int g_malloc_count = 0; static void *counted_malloc(size_t size); static void counted_free(void *ptr); typedef struct failing_alloc { int allocation_to_fail; int alloc_count; int total_count; int has_failed; int should_fail; } failing_alloc_t; static failing_alloc_t g_failing_alloc; static 
void *failing_malloc(size_t size); static void failing_free(void *ptr); static char * read_file(const char * filename); const char* get_file_path(const char *filename); static int g_tests_passed; static int g_tests_failed; #ifdef TESTS_MAIN int main(int argc, char *argv[]) { #else int tests_main(int argc, char *argv[]); int tests_main(int argc, char *argv[]) { #endif /* Example functions from readme file: */ /* print_commits_info("torvalds", "linux"); */ /* serialization_example(); */ /* persistence_example(); */ puts("################################################################################"); puts("Running parson tests"); if (argc == 2) { g_tests_path = argv[1]; } else { g_tests_path = "tests"; } json_set_allocation_functions(counted_malloc, counted_free); test_suite_1(); test_suite_2_no_comments(); test_suite_2_with_comments(); test_suite_3(); test_suite_4(); test_suite_5(); test_suite_6(); test_suite_7(); test_suite_8(); test_suite_9(); test_suite_10(); test_suite_11(); test_memory_leaks(); test_failing_allocations(); printf("Tests failed: %d\n", g_tests_failed); printf("Tests passed: %d\n", g_tests_passed); puts("################################################################################"); return 0; } void test_suite_1(void) { JSON_Value *val; TEST((val = json_parse_file(get_file_path("test_1_1.txt"))) != NULL); TEST(json_value_equals(json_parse_string(json_serialize_to_string(val)), val)); TEST(json_value_equals(json_parse_string(json_serialize_to_string_pretty(val)), val)); if (val) { json_value_free(val); } TEST((val = json_parse_file(get_file_path("test_1_2.txt"))) == NULL); /* Over 2048 levels of nesting */ if (val) { json_value_free(val); } TEST((val = json_parse_file(get_file_path("test_1_3.txt"))) != NULL); TEST(json_value_equals(json_parse_string(json_serialize_to_string(val)), val)); TEST(json_value_equals(json_parse_string(json_serialize_to_string_pretty(val)), val)); if (val) { json_value_free(val); } TEST((val = 
json_parse_file_with_comments(get_file_path("test_1_1.txt"))) != NULL); TEST(json_value_equals(json_parse_string(json_serialize_to_string(val)), val)); TEST(json_value_equals(json_parse_string(json_serialize_to_string_pretty(val)), val)); if (val) { json_value_free(val); } TEST((val = json_parse_file_with_comments(get_file_path("test_1_2.txt"))) == NULL); /* Over 2048 levels of nesting */ if (val) { json_value_free(val); } TEST((val = json_parse_file_with_comments(get_file_path("test_1_3.txt"))) != NULL); TEST(json_value_equals(json_parse_string(json_serialize_to_string(val)), val)); TEST(json_value_equals(json_parse_string(json_serialize_to_string_pretty(val)), val)); if (val) { json_value_free(val); } } void test_suite_2(JSON_Value *root_value) { JSON_Object *root_object; JSON_Array *array; JSON_Value *array_value; size_t len; size_t i; TEST(root_value); TEST(json_value_get_type(root_value) == JSONObject); root_object = json_value_get_object(root_value); TEST(json_object_has_value(root_object, "string")); TEST(!json_object_has_value(root_object, "_string")); TEST(json_object_has_value_of_type(root_object, "object", JSONObject)); TEST(!json_object_has_value_of_type(root_object, "string array", JSONObject)); TEST(json_object_has_value_of_type(root_object, "string array", JSONArray)); TEST(!json_object_has_value_of_type(root_object, "object", JSONArray)); TEST(json_object_has_value_of_type(root_object, "string", JSONString)); TEST(!json_object_has_value_of_type(root_object, "positive one", JSONString)); TEST(json_object_has_value_of_type(root_object, "positive one", JSONNumber)); TEST(!json_object_has_value_of_type(root_object, "string", JSONNumber)); TEST(json_object_has_value_of_type(root_object, "boolean true", JSONBoolean)); TEST(!json_object_has_value_of_type(root_object, "positive one", JSONBoolean)); TEST(json_object_has_value_of_type(root_object, "null", JSONNull)); TEST(!json_object_has_value_of_type(root_object, "object", JSONNull)); 
TEST(json_object_dothas_value(root_object, "object.nested array")); TEST(!json_object_dothas_value(root_object, "_object.nested array")); TEST(json_object_dothas_value_of_type(root_object, "object.nested object", JSONObject)); TEST(!json_object_dothas_value_of_type(root_object, "object.nested array", JSONObject)); TEST(json_object_dothas_value_of_type(root_object, "object.nested array", JSONArray)); TEST(!json_object_dothas_value_of_type(root_object, "object.nested object", JSONArray)); TEST(json_object_dothas_value_of_type(root_object, "object.nested string", JSONString)); TEST(!json_object_dothas_value_of_type(root_object, "object.nested number", JSONString)); TEST(json_object_dothas_value_of_type(root_object, "object.nested number", JSONNumber)); TEST(!json_object_dothas_value_of_type(root_object, "_object.nested whatever", JSONNumber)); TEST(json_object_dothas_value_of_type(root_object, "object.nested true", JSONBoolean)); TEST(!json_object_dothas_value_of_type(root_object, "object.nested number", JSONBoolean)); TEST(json_object_dothas_value_of_type(root_object, "object.nested null", JSONNull)); TEST(!json_object_dothas_value_of_type(root_object, "object.nested object", JSONNull)); TEST(STREQ(json_object_get_string(root_object, "string"), "lorem ipsum")); TEST(STREQ(json_object_get_string(root_object, "utf string"), "lorem ipsum")); TEST(STREQ(json_object_get_string(root_object, "utf-8 string"), "あいうえお")); TEST(STREQ(json_object_get_string(root_object, "surrogate string"), "lorem𝄞ipsum𝍧lorem")); len = json_object_get_string_len(root_object, "string with null"); TEST(len == 7); TEST(memcmp(json_object_get_string(root_object, "string with null"), "abc\0def", len) == 0); TEST(DBL_EQ(json_object_get_number(root_object, "positive one"), 1.0)); TEST(DBL_EQ(json_object_get_number(root_object, "negative one"), -1.0)); TEST(DBL_EQ(json_object_get_number(root_object, "hard to parse number"), -0.000314)); TEST(json_object_get_boolean(root_object, "boolean true") == 1); 
TEST(json_object_get_boolean(root_object, "boolean false") == 0); TEST(json_value_get_type(json_object_get_value(root_object, "null")) == JSONNull); array = json_object_get_array(root_object, "string array"); if (array != NULL && json_array_get_count(array) > 1) { TEST(STREQ(json_array_get_string(array, 0), "lorem")); TEST(STREQ(json_array_get_string(array, 1), "ipsum")); } else { g_tests_failed++; } array = json_object_get_array(root_object, "x^2 array"); if (array != NULL) { for (i = 0; i < json_array_get_count(array); i++) { TEST(DBL_EQ(json_array_get_number(array, i), (i * i))); } } else { g_tests_failed++; } TEST(json_object_get_array(root_object, "non existent array") == NULL); TEST(STREQ(json_object_dotget_string(root_object, "object.nested string"), "str")); TEST(json_object_dotget_boolean(root_object, "object.nested true") == 1); TEST(json_object_dotget_boolean(root_object, "object.nested false") == 0); TEST(json_object_dotget_value(root_object, "object.nested null") != NULL); TEST(DBL_EQ(json_object_dotget_number(root_object, "object.nested number"), 123)); TEST(json_object_dotget_value(root_object, "should.be.null") == NULL); TEST(json_object_dotget_value(root_object, "should.be.null.") == NULL); TEST(json_object_dotget_value(root_object, ".") == NULL); TEST(json_object_dotget_value(root_object, "") == NULL); array = json_object_dotget_array(root_object, "object.nested array"); TEST(array != NULL); TEST(json_array_get_count(array) > 1); if (array != NULL && json_array_get_count(array) > 1) { TEST(STREQ(json_array_get_string(array, 0), "lorem")); TEST(STREQ(json_array_get_string(array, 1), "ipsum")); } TEST(json_object_dotget_boolean(root_object, "object.nested true") == 1); TEST(STREQ(json_object_get_string(root_object, "/**/"), "comment")); TEST(STREQ(json_object_get_string(root_object, "//"), "comment")); TEST(STREQ(json_object_get_string(root_object, "url"), "https://www.example.com/search?q=12345")); TEST(STREQ(json_object_get_string(root_object, 
"escaped chars"), "\" \\ /")); TEST(json_object_get_object(root_object, "empty object") != NULL); TEST(json_object_get_array(root_object, "empty array") != NULL); TEST(json_object_get_wrapping_value(root_object) == root_value); array = json_object_get_array(root_object, "string array"); array_value = json_object_get_value(root_object, "string array"); TEST(json_array_get_wrapping_value(array) == array_value); TEST(json_value_get_parent(array_value) == root_value); TEST(json_value_get_parent(root_value) == NULL); } void test_suite_2_no_comments(void) { const char *filename = "test_2.txt"; JSON_Value *root_value = NULL; root_value = json_parse_file(get_file_path(filename)); test_suite_2(root_value); TEST(json_value_equals(root_value, json_parse_string(json_serialize_to_string(root_value)))); TEST(json_value_equals(root_value, json_parse_string(json_serialize_to_string_pretty(root_value)))); json_value_free(root_value); } void test_suite_2_with_comments(void) { const char *filename = "test_2_comments.txt"; JSON_Value *root_value = NULL; root_value = json_parse_file_with_comments(get_file_path(filename)); test_suite_2(root_value); TEST(json_value_equals(root_value, json_parse_string(json_serialize_to_string(root_value)))); TEST(json_value_equals(root_value, json_parse_string(json_serialize_to_string_pretty(root_value)))); json_value_free(root_value); } void test_suite_3(void) { /* Testing valid strings */ TEST(json_parse_string("{\"lorem\":\"ipsum\"}") != NULL); TEST(json_parse_string("[\"lorem\"]") != NULL); TEST(json_parse_string("null") != NULL); TEST(json_parse_string("true") != NULL); TEST(json_parse_string("false") != NULL); TEST(json_parse_string("\"string\"") != NULL); TEST(json_parse_string("123") != NULL); /* Test UTF-16 parsing */ TEST(STREQ(json_string(json_parse_string("\"\\u0024x\"")), "$x")); TEST(STREQ(json_string(json_parse_string("\"\\u00A2x\"")), "¢x")); TEST(STREQ(json_string(json_parse_string("\"\\u20ACx\"")), "€x")); 
TEST(STREQ(json_string(json_parse_string("\"\\uD801\\uDC37x\"")), "𐐷x")); /* Testing invalid strings */ g_malloc_count = 0; TEST(json_parse_string(NULL) == NULL); TEST(json_parse_string("") == NULL); /* empty string */ TEST(json_parse_string("[\"lorem\",]") == NULL); TEST(json_parse_string("{\"lorem\":\"ipsum\",}") == NULL); TEST(json_parse_string("{lorem:ipsum}") == NULL); TEST(json_parse_string("[,]") == NULL); TEST(json_parse_string("[,") == NULL); TEST(json_parse_string("[") == NULL); TEST(json_parse_string("]") == NULL); TEST(json_parse_string("{\"a\":0,\"a\":0}") == NULL); /* duplicate keys */ TEST(json_parse_string("{:,}") == NULL); TEST(json_parse_string("{,}") == NULL); TEST(json_parse_string("{,") == NULL); TEST(json_parse_string("{:") == NULL); TEST(json_parse_string("{") == NULL); TEST(json_parse_string("}") == NULL); TEST(json_parse_string("x") == NULL); TEST(json_parse_string("{:\"no name\"}") == NULL); TEST(json_parse_string("[,\"no first value\"]") == NULL); TEST(json_parse_string("{\"key\"\"value\"}") == NULL); TEST(json_parse_string("{\"a\"}") == NULL); TEST(json_parse_string("[\"\\u00zz\"]") == NULL); /* invalid utf value */ TEST(json_parse_string("[\"\\u00\"]") == NULL); /* invalid utf value */ TEST(json_parse_string("[\"\\u\"]") == NULL); /* invalid utf value */ TEST(json_parse_string("[\"\\\"]") == NULL); /* control character */ TEST(json_parse_string("[\"\"\"]") == NULL); /* control character */ TEST(json_parse_string("[\"\0\"]") == NULL); /* control character */ TEST(json_parse_string("[\"\a\"]") == NULL); /* control character */ TEST(json_parse_string("[\"\b\"]") == NULL); /* control character */ TEST(json_parse_string("[\"\t\"]") == NULL); /* control character */ TEST(json_parse_string("[\"\n\"]") == NULL); /* control character */ TEST(json_parse_string("[\"\f\"]") == NULL); /* control character */ TEST(json_parse_string("[\"\r\"]") == NULL); /* control character */ TEST(json_parse_string("[0x2]") == NULL); /* hex */ 
TEST(json_parse_string("[0X2]") == NULL); /* HEX */ TEST(json_parse_string("[07]") == NULL); /* octals */ TEST(json_parse_string("[0070]") == NULL); TEST(json_parse_string("[07.0]") == NULL); TEST(json_parse_string("[-07]") == NULL); TEST(json_parse_string("[-007]") == NULL); TEST(json_parse_string("[-07.0]") == NULL); TEST(json_parse_string("[\"\\uDF67\\uD834\"]") == NULL); /* wrong order surrogate pair */ TEST(json_parse_string("[1.7976931348623157e309]") == NULL); TEST(json_parse_string("[-1.7976931348623157e309]") == NULL); TEST(g_malloc_count == 0); } void test_suite_4() { const char *filename = "test_2.txt"; JSON_Value *a = NULL, *a_copy = NULL; a = json_parse_file(get_file_path(filename)); TEST(json_value_equals(a, a)); /* test equality test */ a_copy = json_value_deep_copy(a); TEST(a_copy != NULL); TEST(json_value_equals(a, a_copy)); } void test_suite_5(void) { double zero = 0.0; /* msvc is silly (workaround for error C2124) */ JSON_Value *val_from_file = json_parse_file(get_file_path("test_5.txt")); JSON_Value *val = NULL, *val_with_parent; JSON_Object *obj = NULL; JSON_Array *interests_arr = NULL; JSON_Value *remove_test_val = NULL; JSON_Array *remove_test_arr = NULL; val = json_value_init_object(); TEST(val != NULL); obj = json_value_get_object(val); TEST(obj != NULL); TEST(json_object_set_string(obj, "first", "John") == JSONSuccess); TEST(json_object_set_string(obj, "last", "Doe") == JSONSuccess); TEST(json_object_set_number(obj, "age", 25) == JSONSuccess); TEST(json_object_set_boolean(obj, "registered", 1) == JSONSuccess); TEST(json_object_set_value(obj, "interests", json_value_init_array()) == JSONSuccess); interests_arr = json_object_get_array(obj, "interests"); TEST(interests_arr != NULL); TEST(json_array_append_string(interests_arr, "Writing") == JSONSuccess); TEST(json_array_append_string(interests_arr, "Mountain Biking") == JSONSuccess); TEST(json_array_replace_string(interests_arr, 0, "Reading") == JSONSuccess); 
TEST(json_object_dotset_string(obj, "favorites.color", "blue") == JSONSuccess); TEST(json_object_dotset_string(obj, "favorites.sport", "running") == JSONSuccess); TEST(json_object_dotset_string(obj, "favorites.fruit", "apple") == JSONSuccess); TEST(json_object_dotremove(obj, "favorites.fruit") == JSONSuccess); TEST(json_object_set_string(obj, "utf string", "lorem ipsum") == JSONSuccess); TEST(json_object_set_string(obj, "utf-8 string", "あいうえお") == JSONSuccess); TEST(json_object_set_string(obj, "surrogate string", "lorem𝄞ipsum𝍧lorem") == JSONSuccess); TEST(json_object_set_string_with_len(obj, "string with null", "abc\0def", 7) == JSONSuccess); TEST(json_object_set_string(obj, "windows path", "C:\\Windows\\Path") == JSONSuccess); TEST(json_value_equals(val_from_file, val)); TEST(json_object_set_string(obj, NULL, "") == JSONFailure); TEST(json_object_set_string(obj, "last", NULL) == JSONFailure); TEST(json_object_set_string(obj, NULL, NULL) == JSONFailure); TEST(json_object_set_value(obj, NULL, NULL) == JSONFailure); TEST(json_object_dotset_string(obj, NULL, "") == JSONFailure); TEST(json_object_dotset_string(obj, "favorites.color", NULL) == JSONFailure); TEST(json_object_dotset_string(obj, NULL, NULL) == JSONFailure); TEST(json_object_dotset_value(obj, NULL, NULL) == JSONFailure); TEST(json_array_append_string(NULL, "lorem") == JSONFailure); TEST(json_array_append_value(interests_arr, NULL) == JSONFailure); TEST(json_array_append_value(NULL, NULL) == JSONFailure); TEST(json_array_remove(NULL, 0) == JSONFailure); TEST(json_array_replace_value(interests_arr, 0, NULL) == JSONFailure); TEST(json_array_replace_string(NULL, 0, "lorem") == JSONFailure); TEST(json_array_replace_string(interests_arr, 100, "not existing") == JSONFailure); TEST(json_array_append_string(json_object_get_array(obj, "interests"), NULL) == JSONFailure); TEST(json_array_append_string(interests_arr, "Writing") == JSONSuccess); TEST(json_array_remove(interests_arr, 0) == JSONSuccess); 
TEST(json_array_remove(interests_arr, 1) == JSONSuccess); TEST(json_array_remove(interests_arr, 0) == JSONSuccess); TEST(json_array_remove(interests_arr, 0) == JSONFailure); /* should be empty by now */ val_with_parent = json_value_init_null(); TEST(json_object_set_value(obj, "x", val_with_parent) == JSONSuccess); TEST(json_object_set_value(obj, "x", val_with_parent) == JSONFailure); val_with_parent = json_value_init_null(); TEST(json_array_append_value(interests_arr, val_with_parent) == JSONSuccess); TEST(json_array_append_value(interests_arr, val_with_parent) == JSONFailure); val_with_parent = json_value_init_null(); TEST(json_array_replace_value(interests_arr, 0, val_with_parent) == JSONSuccess); TEST(json_array_replace_value(interests_arr, 0, val_with_parent) == JSONFailure); TEST(json_object_remove(obj, "interests") == JSONSuccess); /* UTF-8 tests */ TEST(json_object_set_string(obj, "correct string", "κόσμε") == JSONSuccess); TEST(json_object_set_string(obj, "boundary 1", "\xed\x9f\xbf") == JSONSuccess); TEST(json_object_set_string(obj, "boundary 2", "\xee\x80\x80") == JSONSuccess); TEST(json_object_set_string(obj, "boundary 3", "\xef\xbf\xbd") == JSONSuccess); TEST(json_object_set_string(obj, "boundary 4", "\xf4\x8f\xbf\xbf") == JSONSuccess); TEST(json_object_set_string(obj, "first continuation byte", "\x80") == JSONFailure); TEST(json_object_set_string(obj, "last continuation byte", "\xbf") == JSONFailure); TEST(json_object_set_string(obj, "impossible sequence 1", "\xfe") == JSONFailure); TEST(json_object_set_string(obj, "impossible sequence 2", "\xff") == JSONFailure); TEST(json_object_set_string(obj, "impossible sequence 3", "\xfe\xfe\xff\xff") == JSONFailure); TEST(json_object_set_string(obj, "overlong 1", "\xc0\xaf") == JSONFailure); TEST(json_object_set_string(obj, "overlong 2", "\xc1\xbf") == JSONFailure); TEST(json_object_set_string(obj, "overlong 3", "\xe0\x80\xaf") == JSONFailure); TEST(json_object_set_string(obj, "overlong 4", "\xe0\x9f\xbf") == 
JSONFailure); TEST(json_object_set_string(obj, "overlong 5", "\xf0\x80\x80\xaf") == JSONFailure); TEST(json_object_set_string(obj, "overlong 6", "\xf0\x8f\xbf\xbf") == JSONFailure); TEST(json_object_set_string(obj, "overlong 7", "\xf0\x8f\xbf\xbf") == JSONFailure); TEST(json_object_set_string(obj, "overlong null 1", "\xc0\x80") == JSONFailure); TEST(json_object_set_string(obj, "overlong null 2", "\xe0\x80\x80") == JSONFailure); TEST(json_object_set_string(obj, "overlong null 3", "\xf0\x80\x80\x80") == JSONFailure); TEST(json_object_set_string(obj, "overlong null 4", "\xf8\x80\x80\x80\x80") == JSONFailure); TEST(json_object_set_string(obj, "overlong null 5", "\xfc\x80\x80\x80\x80\x80") == JSONFailure); TEST(json_object_set_string(obj, "single surrogate 1", "\xed\xa0\x80") == JSONFailure); TEST(json_object_set_string(obj, "single surrogate 2", "\xed\xaf\xbf") == JSONFailure); TEST(json_object_set_string(obj, "single surrogate 3", "\xed\xbf\xbf") == JSONFailure); /* Testing removing values from array, order of the elements should be preserved */ remove_test_val = json_parse_string("[1, 2, 3, 4, 5]"); remove_test_arr = json_array(remove_test_val); json_array_remove(remove_test_arr, 2); TEST(json_value_equals(remove_test_val, json_parse_string("[1, 2, 4, 5]"))); json_array_remove(remove_test_arr, 0); TEST(json_value_equals(remove_test_val, json_parse_string("[2, 4, 5]"))); json_array_remove(remove_test_arr, 2); TEST(json_value_equals(remove_test_val, json_parse_string("[2, 4]"))); /* Testing nan and inf */ TEST(json_object_set_number(obj, "num", 0.0 / zero) == JSONFailure); TEST(json_object_set_number(obj, "num", 1.0 / zero) == JSONFailure); } void test_suite_6(void) { const char *filename = "test_2.txt"; JSON_Value *a = NULL; JSON_Value *b = NULL; a = json_parse_file(get_file_path(filename)); b = json_parse_file(get_file_path(filename)); TEST(json_value_equals(a, b)); json_object_set_string(json_object(a), "string", "eki"); TEST(!json_value_equals(a, b)); a = 
json_value_deep_copy(b); TEST(json_value_equals(a, b)); json_array_append_number(json_object_get_array(json_object(b), "string array"), 1337); TEST(!json_value_equals(a, b)); } void test_suite_7(void) { JSON_Value *val_from_file = json_parse_file(get_file_path("test_5.txt")); JSON_Value *schema = json_value_init_object(); JSON_Object *schema_obj = json_value_get_object(schema); JSON_Array *interests_arr = NULL; json_object_set_string(schema_obj, "first", ""); json_object_set_string(schema_obj, "last", ""); json_object_set_number(schema_obj, "age", 0); json_object_set_value(schema_obj, "interests", json_value_init_array()); interests_arr = json_object_get_array(schema_obj, "interests"); json_array_append_string(interests_arr, ""); json_object_set_null(schema_obj, "favorites"); TEST(json_validate(schema, val_from_file) == JSONSuccess); json_object_set_string(schema_obj, "age", ""); TEST(json_validate(schema, val_from_file) == JSONFailure); } void test_suite_8(void) { const char *filename = "test_2.txt"; const char *temp_filename = "test_2_serialized.txt"; JSON_Value *a = NULL; JSON_Value *b = NULL; char *buf = NULL; size_t serialization_size = 0; a = json_parse_file(get_file_path(filename)); TEST(json_serialize_to_file(a, get_file_path(temp_filename)) == JSONSuccess); b = json_parse_file(get_file_path(temp_filename)); TEST(json_value_equals(a, b)); remove(temp_filename); serialization_size = json_serialization_size(a); buf = json_serialize_to_string(a); TEST((strlen(buf)+1) == serialization_size); } void test_suite_9(void) { const char *filename = "test_2_pretty.txt"; const char *temp_filename = "test_2_serialized_pretty.txt"; char *file_contents = NULL; char *serialized = NULL; JSON_Value *a = NULL; JSON_Value *b = NULL; size_t serialization_size = 0; a = json_parse_file(get_file_path(filename)); TEST(json_serialize_to_file_pretty(a, get_file_path(temp_filename)) == JSONSuccess); b = json_parse_file(get_file_path(temp_filename)); TEST(json_value_equals(a, b)); 
remove(temp_filename); serialization_size = json_serialization_size_pretty(a); serialized = json_serialize_to_string_pretty(a); TEST((strlen(serialized)+1) == serialization_size); file_contents = read_file(get_file_path(filename)); TEST(STREQ(file_contents, serialized)); } void test_suite_10(void) { JSON_Value *val; char *serialized; g_malloc_count = 0; val = json_parse_file(get_file_path("test_1_1.txt")); json_value_free(val); val = json_parse_file(get_file_path("test_1_3.txt")); json_value_free(val); val = json_parse_file(get_file_path("test_2.txt")); serialized = json_serialize_to_string_pretty(val); json_free_serialized_string(serialized); json_value_free(val); val = json_parse_file(get_file_path("test_2_pretty.txt")); json_value_free(val); TEST(g_malloc_count == 0); } void test_suite_11() { const char * array_with_slashes = "[\"a/b/c\"]"; const char * array_with_escaped_slashes = "[\"a\\/b\\/c\"]"; char *serialized = NULL; JSON_Value *value = json_parse_string(array_with_slashes); serialized = json_serialize_to_string(value); TEST(STREQ(array_with_escaped_slashes, serialized)); json_set_escape_slashes(0); serialized = json_serialize_to_string(value); TEST(STREQ(array_with_slashes, serialized)); json_set_escape_slashes(1); serialized = json_serialize_to_string(value); TEST(STREQ(array_with_escaped_slashes, serialized)); } void test_memory_leaks() { g_malloc_count = 0; TEST(json_object_set_string(NULL, "lorem", "ipsum") == JSONFailure); TEST(json_object_set_number(NULL, "lorem", 42) == JSONFailure); TEST(json_object_set_boolean(NULL, "lorem", 0) == JSONFailure); TEST(json_object_set_null(NULL, "lorem") == JSONFailure); TEST(json_parse_string("{\"\\u0000\"") == NULL); TEST(g_malloc_count == 0); } void test_failing_allocations() { const char *filename = "test_2.txt"; JSON_Value *root_value = NULL; JSON_Object *root_object = NULL; int i = 0; int n = 0; char key_val_buf[32]; json_set_allocation_functions(failing_malloc, failing_free); printf("Testing failing 
allocations: "); while (1) { /* printf("Failing at allocation %d\n", n); */ g_failing_alloc.allocation_to_fail = n; g_failing_alloc.alloc_count = 0; g_failing_alloc.total_count = 0; g_failing_alloc.has_failed = 0; g_failing_alloc.should_fail = 1; n++; root_value = json_parse_file(get_file_path(filename)); if (g_failing_alloc.has_failed) { if (root_value) { printf("Allocation has failed but parsing succeeded after allocation %d\n", n - 1); g_tests_failed++; return; } } if (root_value) { root_object = json_object(root_value); for (i = 0; i < 64; i++) { sprintf(key_val_buf, "%d", i); json_object_set_string(root_object, key_val_buf, key_val_buf); } for (i = 0; i < 64; i++) { sprintf(key_val_buf, "%d", i); json_object_set_string(root_object, key_val_buf, key_val_buf); } json_object_dotset_number(root_object, "ala.ma.kota", 123); json_object_dotremove(root_object, "ala.ma.kota"); } json_value_free(root_value); if (g_failing_alloc.alloc_count != 0) { printf("Leak after failing allocation %d\n", n - 1); g_tests_failed++; return; } if (!g_failing_alloc.has_failed) { break; } } json_set_allocation_functions(NULL, NULL); printf("OK (tested %d failing allocations)\n", n - 1); g_tests_passed++; } void print_commits_info(const char *username, const char *repo) { JSON_Value *root_value; JSON_Array *commits; JSON_Object *commit; size_t i; char curl_command[512]; char cleanup_command[256]; char output_filename[] = "commits.json"; /* it ain't pretty, but it's not a libcurl tutorial */ sprintf(curl_command, "curl -s \"https://api.github.com/repos/%s/%s/commits\" > %s", username, repo, output_filename); sprintf(cleanup_command, "rm -f %s", output_filename); system(curl_command); /* parsing json and validating output */ root_value = json_parse_file(get_file_path(output_filename)); if (json_value_get_type(root_value) != JSONArray) { system(cleanup_command); return; } /* getting array from root value and printing commit info */ commits = json_value_get_array(root_value); printf("%-10.10s 
%-10.10s %s\n", "Date", "SHA", "Author"); for (i = 0; i < json_array_get_count(commits); i++) { commit = json_array_get_object(commits, i); printf("%.10s %.10s %s\n", json_object_dotget_string(commit, "commit.author.date"), json_object_get_string(commit, "sha"), json_object_dotget_string(commit, "commit.author.name")); } /* cleanup code */ json_value_free(root_value); system(cleanup_command); } void persistence_example(void) { JSON_Value *schema = json_parse_string("{\"name\":\"\"}"); JSON_Value *user_data = json_parse_file(get_file_path("user_data.json")); char buf[256]; const char *name = NULL; if (user_data == NULL || json_validate(schema, user_data) != JSONSuccess) { puts("Enter your name:"); scanf("%s", buf); user_data = json_value_init_object(); json_object_set_string(json_object(user_data), "name", buf); json_serialize_to_file(user_data, "user_data.json"); } name = json_object_get_string(json_object(user_data), "name"); printf("Hello, %s.", name); json_value_free(schema); json_value_free(user_data); return; } void serialization_example(void) { JSON_Value *root_value = json_value_init_object(); JSON_Object *root_object = json_value_get_object(root_value); char *serialized_string = NULL; json_object_set_string(root_object, "name", "John Smith"); json_object_set_number(root_object, "age", 25); json_object_dotset_string(root_object, "address.city", "Cupertino"); json_object_dotset_value(root_object, "contact.emails", json_parse_string("[\"email@example.com\", \"email2@example.com\"]")); serialized_string = json_serialize_to_string_pretty(root_value); puts(serialized_string); json_free_serialized_string(serialized_string); json_value_free(root_value); } static char * read_file(const char * file_path) { FILE *fp = NULL; size_t size_to_read = 0; size_t size_read = 0; long pos; char *file_contents; fp = fopen(file_path, "r"); if (!fp) { assert(0); return NULL; } fseek(fp, 0L, SEEK_END); pos = ftell(fp); if (pos < 0) { fclose(fp); assert(0); return NULL; } 
size_to_read = pos; rewind(fp); file_contents = (char*)malloc(sizeof(char) * (size_to_read + 1)); if (!file_contents) { fclose(fp); assert(0); return NULL; } size_read = fread(file_contents, 1, size_to_read, fp); if (size_read == 0 || ferror(fp)) { fclose(fp); free(file_contents); assert(0); return NULL; } fclose(fp); file_contents[size_read] = '\0'; return file_contents; } const char* get_file_path(const char *filename) { static char path_buf[2048] = { 0 }; memset(path_buf, 0, sizeof(path_buf)); sprintf(path_buf, "%s/%s", g_tests_path, filename); return path_buf; } static void *counted_malloc(size_t size) { void *res = malloc(size); if (res != NULL) { g_malloc_count++; } return res; } static void counted_free(void *ptr) { if (ptr != NULL) { g_malloc_count--; } free(ptr); } static void *failing_malloc(size_t size) { void *res = NULL; if (g_failing_alloc.should_fail && g_failing_alloc.total_count >= g_failing_alloc.allocation_to_fail) { g_failing_alloc.has_failed = 1; return NULL; } res = malloc(size); if (res != NULL) { g_failing_alloc.total_count++; g_failing_alloc.alloc_count++; } return res; } static void failing_free(void *ptr) { if (ptr != NULL) { g_failing_alloc.alloc_count--; } free(ptr); } pg_auto_failover-1.6.3/src/bin/lib/parson/tests/000077500000000000000000000000001414244367200216265ustar00rootroot00000000000000pg_auto_failover-1.6.3/src/bin/lib/parson/tests/test_1_1.txt000066400000000000000000000027101414244367200240060ustar00rootroot00000000000000 [ "JSON Test Pattern pass1", {"object with 1 member":["array with 1 element"]}, {}, [], -42, true, false, null, { "integer": 1234567890, "real": -9876.543210, "e": 0.123456789e-12, "E": 1.234567890E+34, "": 23456789012E66, "zero": 0, "one": 1, "space": " ", "quote": "\"", "backslash": "\\", "controls": "\b\f\n\r\t", "slash": "/ & \/", "alpha": "abcdefghijklmnopqrstuvwyz", "ALPHA": "ABCDEFGHIJKLMNOPQRSTUVWYZ", "digit": "0123456789", "0123456789": "digit", "special": "`1~!@#$%^&*()_+-={':[,]}|;.?", "nullchar": 
"abc\u0000def", "hex": "\u0123\u4567\u89AB\uCDEF\uabcd\uef4A", "true": true, "false": false, "null": null, "array":[ ], "object":{ }, "address": "50 St. James Street", "url": "http://www.JSON.org/", "comment": "// /* */": " ", " s p a c e d " :[1,2 , 3 , 4 , 5 , 6 ,7 ],"compact":[1,2,3,4,5,6,7], "jsontext": "{\"object with 1 member\":[\"array with 1 element\"]}", "quotes": "" \u0022 %22 0x22 034 "", "\/\\\"\uCAFE\uBABE\uAB98\uFCDE\ubcda\uef4A\b\f\n\r\t`1~!@#$%^&*()_+-=[]{}|;:',./<>?" : "A key can be any string" }, 0.5 ,98.6 , 99.44 , 1066, 1e1, 0.1e1, 1e-1, 1e00,2e+00,2e-00 ,"rosebud"]pg_auto_failover-1.6.3/src/bin/lib/parson/tests/test_1_2.txt000066400000000000000000000100141414244367200240030ustar00rootroot00000000000000[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[
[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[["Too deep"]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]
]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]pg_auto_failover-1.6.3/src/bin/lib/parson/tests/test_1_3.txt000066400000000000000000000002241414244367200240060ustar00rootroot00000000000000{ "JSON Test Pattern pass3": { "The outermost value": "must be an object or array.", "In this test": "It is an object." 
} } pg_auto_failover-1.6.3/src/bin/lib/parson/tests/test_2.txt000066400000000000000000000021741414244367200235730ustar00rootroot00000000000000{ "string" : "lorem ipsum", "utf string" : "\u006corem\u0020ipsum", "utf-8 string": "あいうえお", "surrogate string": "lorem\uD834\uDD1Eipsum\uD834\uDF67lorem", "string with null": "abc\u0000def", "positive one" : 1, "negative one" : -1, "pi" : 3.14, "hard to parse number" : -3.14e-4, "big int": 2147483647, "big uint": 4294967295, "double underflow": 6.9041432094973937e-310, "boolean true" : true, "boolean false" : false, "null" : null, "string array" : ["lorem", "ipsum"], "x^2 array" : [0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100], "/*" : null, "object" : { "nested string" : "str", "nested true" : true, "nested false" : false, "nested null" : null, "nested number" : 123, "nested array" : ["lorem", "ipsum"], "nested object" : {"lorem": "ipsum"} }, "*/" : null, "/**/" : "comment", "//" : "comment", "url" : "https:\/\/www.example.com\/search?q=12345", "escaped chars" : "\" \\ \/", "empty object" : {}, "empty array" : [] } pg_auto_failover-1.6.3/src/bin/lib/parson/tests/test_2_comments.txt000066400000000000000000000027241414244367200255010ustar00rootroot00000000000000/* *Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor *ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud *dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
*/ // Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor { /* lorem ipsum */ "string" : "lorem ipsum", // lorem ipsum "utf string" : "\u006corem\u0020ipsum", // lorem ipsum // "utf-8 string": "あいうえお", // /* lorem ipsum */ "surrogate string": "lorem\uD834\uDD1Eipsum\uD834\uDF67lorem", "string with null": "abc\u0000def", "positive one" : 1, "negative one" : -1, "pi" : 3.14, "hard to parse number" : -3.14e-4, "big int": 2147483647, "big uint": 4294967295, "double underflow": 6.9041432094973937e-310, "boolean true" : true, "boolean false" : false, "null" : null, "string array" : ["lorem",/*in array*/"ipsum"], "x^2 array" : [0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100], /* "x^2 array" : [], */ "/*" : null, "object" : { "nested string" : "str", "nested true" : /* lorem ipsum */ true, "nested false" : false, "nested null" : null, // lorem ipsum "nested number" : 123, "nested array" : ["lorem", "ipsum"], "nested object" : {"lorem": "ipsum"} }, "*/" : null, "/**/" : "comment", "//" : "comment", "url" : "https:\/\/www.example.com\/search?q=12345", "escaped chars" : "\" \\ \/", "empty object" : {}, "empty array" : [] } /**/ //pg_auto_failover-1.6.3/src/bin/lib/parson/tests/test_2_pretty.txt000066400000000000000000000022411414244367200251750ustar00rootroot00000000000000{ "string": "lorem ipsum", "utf string": "lorem ipsum", "utf-8 string": "あいうえお", "surrogate string": "lorem𝄞ipsum𝍧lorem", "string with null": "abc\u0000def", "positive one": 1, "negative one": -1, "pi": 3.1400000000000001, "hard to parse number": -0.00031399999999999999, "big int": 2147483647, "big uint": 4294967295, "double underflow": 6.9041432094973937e-310, "boolean true": true, "boolean false": false, "null": null, "string array": [ "lorem", "ipsum" ], "x^2 array": [ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100 ], "\/*": null, "object": { "nested string": "str", "nested true": true, "nested false": false, "nested null": null, "nested number": 123, "nested array": [ "lorem", "ipsum" ] }, 
"*\/": null, "\/**\/": "comment", "\/\/": "comment", "url": "https:\/\/www.example.com\/search?q=12345", "escaped chars": "\" \\ \/", "empty object": {}, "empty array": [] }pg_auto_failover-1.6.3/src/bin/lib/parson/tests/test_2_serialized.txt000066400000000000000000000015031414244367200260010ustar00rootroot00000000000000{"string":"lorem ipsum","utf string":"lorem ipsum","utf-8 string":"あいうえお","surrogate string":"lorem𝄞ipsum𝍧lorem","string with null":"abc\u0000def","positive one":1,"negative one":-1,"pi":3.1400000000000001,"hard to parse number":-0.00031399999999999999,"big int":2147483647,"big uint":4294967295,"double underflow":6.9041432094973937e-310,"boolean true":true,"boolean false":false,"null":null,"string array":["lorem","ipsum"],"x^2 array":[0,1,4,9,16,25,36,49,64,81,100],"\/*":null,"object":{"nested string":"str","nested true":true,"nested false":false,"nested null":null,"nested number":123,"nested array":["lorem","ipsum"],"nested object":{"lorem":"ipsum"}},"*\/":null,"\/**\/":"comment","\/\/":"comment","url":"https:\/\/www.example.com\/search?q=12345","escaped chars":"\" \\ \/","empty object":{},"empty array":[]}pg_auto_failover-1.6.3/src/bin/lib/parson/tests/test_2_serialized_pretty.txt000066400000000000000000000022411414244367200274100ustar00rootroot00000000000000{ "string": "lorem ipsum", "utf string": "lorem ipsum", "utf-8 string": "あいうえお", "surrogate string": "lorem𝄞ipsum𝍧lorem", "string with null": "abc\u0000def", "positive one": 1, "negative one": -1, "pi": 3.1400000000000001, "hard to parse number": -0.00031399999999999999, "big int": 2147483647, "big uint": 4294967295, "double underflow": 6.9041432094973937e-310, "boolean true": true, "boolean false": false, "null": null, "string array": [ "lorem", "ipsum" ], "x^2 array": [ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100 ], "\/*": null, "object": { "nested string": "str", "nested true": true, "nested false": false, "nested null": null, "nested number": 123, "nested array": [ "lorem", "ipsum" ] }, "*\/": 
null, "\/**\/": "comment", "\/\/": "comment", "url": "https:\/\/www.example.com\/search?q=12345", "escaped chars": "\" \\ \/", "empty object": {}, "empty array": [] }pg_auto_failover-1.6.3/src/bin/lib/parson/tests/test_5.txt000066400000000000000000000006311414244367200235720ustar00rootroot00000000000000{ "first": "John", "last": "Doe", "age": 25, "registered": true, "interests": [ "Reading", "Mountain Biking" ], "favorites": { "color": "blue", "sport": "running" }, "utf string" : "\u006corem\u0020ipsum", "utf-8 string": "あいうえお", "surrogate string": "lorem\uD834\uDD1Eipsum\uD834\uDF67lorem", "string with null": "abc\u0000def", "windows path": "C:\\Windows\\Path" }pg_auto_failover-1.6.3/src/bin/lib/pg/000077500000000000000000000000001414244367200175705ustar00rootroot00000000000000pg_auto_failover-1.6.3/src/bin/lib/pg/README.md000066400000000000000000000001251414244367200210450ustar00rootroot00000000000000# Postgres code This directory contains PostgreSQL code that we have vendored-in. pg_auto_failover-1.6.3/src/bin/lib/pg/snprintf.c000066400000000000000000001055171414244367200216100ustar00rootroot00000000000000/* * Copyright (c) 1983, 1995, 1996 Eric P. Allman * Copyright (c) 1988, 1993 * The Regents of the University of California. All rights reserved. * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * src/port/snprintf.c */ #include "postgres_fe.h" #include "snprintf.h" #include /* * We used to use the platform's NL_ARGMAX here, but that's a bad idea, * first because the point of this module is to remove platform dependencies * not perpetuate them, and second because some platforms use ridiculously * large values, leading to excessive stack consumption in dopr(). */ #define PG_NL_ARGMAX 31 /* * SNPRINTF, VSNPRINTF and friends * * These versions have been grabbed off the net. They have been * cleaned up to compile properly and support for most of the C99 * specification has been added. Remaining unimplemented features are: * * 1. No locale support: the radix character is always '.' and the ' * (single quote) format flag is ignored. * * 2. No support for the "%n" format specification. * * 3. No support for wide characters ("lc" and "ls" formats). * * 4. No support for "long double" ("Lf" and related formats). * * 5. Space and '#' flags are not implemented. * * In addition, we support some extensions over C99: * * 1. 
Argument order control through "%n$" and "*n$", as required by POSIX. * * 2. "%m" expands to the value of strerror(errno), where errno is the * value that variable had at the start of the call. This is a glibc * extension, but a very useful one. * * * Historically the result values of sprintf/snprintf varied across platforms. * This implementation now follows the C99 standard: * * 1. -1 is returned if an error is detected in the format string, or if * a write to the target stream fails (as reported by fwrite). Note that * overrunning snprintf's target buffer is *not* an error. * * 2. For successful writes to streams, the actual number of bytes written * to the stream is returned. * * 3. For successful sprintf/snprintf, the number of bytes that would have * been written to an infinite-size buffer (excluding the trailing '\0') * is returned. snprintf will truncate its output to fit in the buffer * (ensuring a trailing '\0' unless count == 0), but this is not reflected * in the function result. * * snprintf buffer overrun can be detected by checking for function result * greater than or equal to the supplied count. */ /************************************************************** * Original: * Patrick Powell Tue Apr 11 09:48:21 PDT 1995 * A bombproof version of doprnt (dopr) included. * Sigh. This sort of thing is always nasty do deal with. Note that * the version here does not include floating point. (now it does ... tgl) **************************************************************/ /* Prevent recursion */ #undef vsnprintf #undef snprintf #undef vsprintf #undef sprintf #undef vfprintf #undef fprintf #undef vprintf #undef printf /* * Info about where the formatted output is going. * * dopr and subroutines will not write at/past bufend, but snprintf * reserves one byte, ensuring it may place the trailing '\0' there. * * In snprintf, we use nchars to count the number of bytes dropped on the * floor due to buffer overrun. 
The correct result of snprintf is thus * (bufptr - bufstart) + nchars. (This isn't as inconsistent as it might * seem: nchars is the number of emitted bytes that are not in the buffer now, * either because we sent them to the stream or because we couldn't fit them * into the buffer to begin with.) */ typedef struct { char *bufptr; /* next buffer output position */ char *bufstart; /* first buffer element */ char *bufend; /* last+1 buffer element, or NULL */ /* bufend == NULL is for sprintf, where we assume buf is big enough */ FILE *stream; /* eventual output destination, or NULL */ int nchars; /* # chars sent to stream, or dropped */ bool failed; /* call is a failure; errno is set */ } PrintfTarget; /* * Info about the type and value of a formatting parameter. Note that we * don't currently support "long double", "wint_t", or "wchar_t *" data, * nor the '%n' formatting code; else we'd need more types. Also, at this * level we need not worry about signed vs unsigned values. */ typedef enum { ATYPE_NONE = 0, ATYPE_INT, ATYPE_LONG, ATYPE_LONGLONG, ATYPE_DOUBLE, ATYPE_CHARPTR } PrintfArgType; typedef union { int i; long l; long long ll; double d; char *cptr; } PrintfArgValue; static void flushbuffer(PrintfTarget *target); static void dopr(PrintfTarget *target, const char *format, va_list args); /* * Externally visible entry points. * * All of these are just wrappers around dopr(). Note it's essential that * they not change the value of "errno" before reaching dopr(). */ int pg_vsnprintf(char *str, size_t count, const char *fmt, va_list args) { PrintfTarget target; char onebyte[1]; /* * C99 allows the case str == NULL when count == 0. Rather than * special-casing this situation further down, we substitute a one-byte * local buffer. Callers cannot tell, since the function result doesn't * depend on count. 
*/ if (count == 0) { str = onebyte; count = 1; } target.bufstart = target.bufptr = str; target.bufend = str + count - 1; target.stream = NULL; target.nchars = 0; target.failed = false; dopr(&target, fmt, args); *(target.bufptr) = '\0'; return target.failed ? -1 : (target.bufptr - target.bufstart + target.nchars); } int pg_snprintf(char *str, size_t count, const char *fmt,...) { int len; va_list args; va_start(args, fmt); len = pg_vsnprintf(str, count, fmt, args); va_end(args); return len; } int pg_vsprintf(char *str, const char *fmt, va_list args) { PrintfTarget target; target.bufstart = target.bufptr = str; target.bufend = NULL; target.stream = NULL; target.nchars = 0; /* not really used in this case */ target.failed = false; dopr(&target, fmt, args); *(target.bufptr) = '\0'; return target.failed ? -1 : (target.bufptr - target.bufstart + target.nchars); } int pg_sprintf(char *str, const char *fmt,...) { int len; va_list args; va_start(args, fmt); len = pg_vsprintf(str, fmt, args); va_end(args); return len; } int pg_vfprintf(FILE *stream, const char *fmt, va_list args) { PrintfTarget target; char buffer[1024]; /* size is arbitrary */ if (stream == NULL) { errno = EINVAL; return -1; } target.bufstart = target.bufptr = buffer; target.bufend = buffer + sizeof(buffer); /* use the whole buffer */ target.stream = stream; target.nchars = 0; target.failed = false; dopr(&target, fmt, args); /* dump any remaining buffer contents */ flushbuffer(&target); return target.failed ? -1 : target.nchars; } int pg_fprintf(FILE *stream, const char *fmt,...) { int len; va_list args; va_start(args, fmt); len = pg_vfprintf(stream, fmt, args); va_end(args); return len; } int pg_vprintf(const char *fmt, va_list args) { return pg_vfprintf(stdout, fmt, args); } int pg_printf(const char *fmt,...) 
{ int len; va_list args; va_start(args, fmt); len = pg_vfprintf(stdout, fmt, args); va_end(args); return len; } /* * Attempt to write the entire buffer to target->stream; discard the entire * buffer in any case. Call this only when target->stream is defined. */ static void flushbuffer(PrintfTarget *target) { size_t nc = target->bufptr - target->bufstart; /* * Don't write anything if we already failed; this is to ensure we * preserve the original failure's errno. */ if (!target->failed && nc > 0) { size_t written; written = fwrite(target->bufstart, 1, nc, target->stream); target->nchars += written; if (written != nc) target->failed = true; } target->bufptr = target->bufstart; } static bool find_arguments(const char *format, va_list args, PrintfArgValue *argvalues); static void fmtstr(const char *value, int leftjust, int minlen, int maxwidth, int pointflag, PrintfTarget *target); static void fmtptr(const void *value, PrintfTarget *target); static void fmtint(long long value, char type, int forcesign, int leftjust, int minlen, int zpad, int precision, int pointflag, PrintfTarget *target); static void fmtchar(int value, int leftjust, int minlen, PrintfTarget *target); static void fmtfloat(double value, char type, int forcesign, int leftjust, int minlen, int zpad, int precision, int pointflag, PrintfTarget *target); static void dostr(const char *str, int slen, PrintfTarget *target); static void dopr_outch(int c, PrintfTarget *target); static void dopr_outchmulti(int c, int slen, PrintfTarget *target); static int adjust_sign(int is_negative, int forcesign, int *signvalue); static int compute_padlen(int minlen, int vallen, int leftjust); static void leading_pad(int zpad, int signvalue, int *padlen, PrintfTarget *target); static void trailing_pad(int padlen, PrintfTarget *target); /* * While Postgres sources do it the smart way and check HAVE_STRCHRNUL from the * auto-configure output, we just use the Postgres version of strchrnul here. 
*/ static inline const char * pg_strchrnul(const char *s, int c) { while (*s != '\0' && *s != c) s++; return s; } /* * dopr(): the guts of *printf for all cases. */ static void dopr(PrintfTarget *target, const char *format, va_list args) { int save_errno = errno; const char *first_pct = NULL; int ch; bool have_dollar; bool have_star; bool afterstar; int accum; int longlongflag; int longflag; int pointflag; int leftjust; int fieldwidth; int precision; int zpad; int forcesign; int fmtpos; int cvalue; long long numvalue; double fvalue; const char *strvalue; PrintfArgValue argvalues[PG_NL_ARGMAX + 1]; /* * Initially, we suppose the format string does not use %n$. The first * time we come to a conversion spec that has that, we'll call * find_arguments() to check for consistent use of %n$ and fill the * argvalues array with the argument values in the correct order. */ have_dollar = false; while (*format != '\0') { /* Locate next conversion specifier */ if (*format != '%') { /* Scan to next '%' or end of string */ const char *next_pct = pg_strchrnul(format + 1, '%'); /* Dump literal data we just scanned over */ dostr(format, next_pct - format, target); if (target->failed) break; if (*next_pct == '\0') break; format = next_pct; } /* * Remember start of first conversion spec; if we find %n$, then it's * sufficient for find_arguments() to start here, without rescanning * earlier literal text. 
*/ if (first_pct == NULL) first_pct = format; /* Process conversion spec starting at *format */ format++; /* Fast path for conversion spec that is exactly %s */ if (*format == 's') { format++; strvalue = va_arg(args, char *); if (strvalue == NULL) strvalue = "(null)"; dostr(strvalue, strlen(strvalue), target); if (target->failed) break; continue; } fieldwidth = precision = zpad = leftjust = forcesign = 0; longflag = longlongflag = pointflag = 0; fmtpos = accum = 0; have_star = afterstar = false; nextch2: ch = *format++; switch (ch) { case '-': leftjust = 1; goto nextch2; case '+': forcesign = 1; goto nextch2; case '0': /* set zero padding if no nonzero digits yet */ if (accum == 0 && !pointflag) zpad = '0'; /* FALL THRU */ case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': accum = accum * 10 + (ch - '0'); goto nextch2; case '.': if (have_star) have_star = false; else fieldwidth = accum; pointflag = 1; accum = 0; goto nextch2; case '*': if (have_dollar) { /* * We'll process value after reading n$. Note it's OK to * assume have_dollar is set correctly, because in a valid * format string the initial % must have had n$ if * does. */ afterstar = true; } else { /* fetch and process value now */ int starval = va_arg(args, int); if (pointflag) { precision = starval; if (precision < 0) { precision = 0; pointflag = 0; } } else { fieldwidth = starval; if (fieldwidth < 0) { leftjust = 1; fieldwidth = -fieldwidth; } } } have_star = true; accum = 0; goto nextch2; case '$': /* First dollar sign? 
*/ if (!have_dollar) { /* Yup, so examine all conversion specs in format */ if (!find_arguments(first_pct, args, argvalues)) goto bad_format; have_dollar = true; } if (afterstar) { /* fetch and process star value */ int starval = argvalues[accum].i; if (pointflag) { precision = starval; if (precision < 0) { precision = 0; pointflag = 0; } } else { fieldwidth = starval; if (fieldwidth < 0) { leftjust = 1; fieldwidth = -fieldwidth; } } afterstar = false; } else fmtpos = accum; accum = 0; goto nextch2; case 'l': if (longflag) longlongflag = 1; else longflag = 1; goto nextch2; case 'z': #if SIZEOF_SIZE_T == 8 #ifdef HAVE_LONG_INT_64 longflag = 1; #elif defined(HAVE_LONG_LONG_INT_64) longlongflag = 1; #else #error "Don't know how to print 64bit integers" #endif #else /* assume size_t is same size as int */ #endif goto nextch2; case 'h': case '\'': /* ignore these */ goto nextch2; case 'd': case 'i': if (!have_star) { if (pointflag) precision = accum; else fieldwidth = accum; } if (have_dollar) { if (longlongflag) numvalue = argvalues[fmtpos].ll; else if (longflag) numvalue = argvalues[fmtpos].l; else numvalue = argvalues[fmtpos].i; } else { if (longlongflag) numvalue = va_arg(args, long long); else if (longflag) numvalue = va_arg(args, long); else numvalue = va_arg(args, int); } fmtint(numvalue, ch, forcesign, leftjust, fieldwidth, zpad, precision, pointflag, target); break; case 'o': case 'u': case 'x': case 'X': if (!have_star) { if (pointflag) precision = accum; else fieldwidth = accum; } if (have_dollar) { if (longlongflag) numvalue = (unsigned long long) argvalues[fmtpos].ll; else if (longflag) numvalue = (unsigned long) argvalues[fmtpos].l; else numvalue = (unsigned int) argvalues[fmtpos].i; } else { if (longlongflag) numvalue = (unsigned long long) va_arg(args, long long); else if (longflag) numvalue = (unsigned long) va_arg(args, long); else numvalue = (unsigned int) va_arg(args, int); } fmtint(numvalue, ch, forcesign, leftjust, fieldwidth, zpad, precision, 
pointflag, target); break; case 'c': if (!have_star) { if (pointflag) precision = accum; else fieldwidth = accum; } if (have_dollar) cvalue = (unsigned char) argvalues[fmtpos].i; else cvalue = (unsigned char) va_arg(args, int); fmtchar(cvalue, leftjust, fieldwidth, target); break; case 's': if (!have_star) { if (pointflag) precision = accum; else fieldwidth = accum; } if (have_dollar) strvalue = argvalues[fmtpos].cptr; else strvalue = va_arg(args, char *); /* If string is NULL, silently substitute "(null)" */ if (strvalue == NULL) strvalue = "(null)"; fmtstr(strvalue, leftjust, fieldwidth, precision, pointflag, target); break; case 'p': /* fieldwidth/leftjust are ignored ... */ if (have_dollar) strvalue = argvalues[fmtpos].cptr; else strvalue = va_arg(args, char *); fmtptr((const void *) strvalue, target); break; case 'e': case 'E': case 'f': case 'g': case 'G': if (!have_star) { if (pointflag) precision = accum; else fieldwidth = accum; } if (have_dollar) fvalue = argvalues[fmtpos].d; else fvalue = va_arg(args, double); fmtfloat(fvalue, ch, forcesign, leftjust, fieldwidth, zpad, precision, pointflag, target); break; case 'm': { char errbuf[PG_STRERROR_R_BUFLEN]; const char *errm = strerror_r(save_errno, errbuf, sizeof(errbuf)); dostr(errm, strlen(errm), target); } break; case '%': dopr_outch('%', target); break; default: /* * Anything else --- in particular, '\0' indicating end of * format string --- is bogus. */ goto bad_format; } /* Check for failure after each conversion spec */ if (target->failed) break; } return; bad_format: errno = EINVAL; target->failed = true; } /* * find_arguments(): sort out the arguments for a format spec with %n$ * * If format is valid, return true and fill argvalues[i] with the value * for the conversion spec that has %i$ or *i$. Else return false. 
*/ static bool find_arguments(const char *format, va_list args, PrintfArgValue *argvalues) { int ch; bool afterstar; int accum; int longlongflag; int longflag; int fmtpos; int i; int last_dollar; PrintfArgType argtypes[PG_NL_ARGMAX + 1]; /* Initialize to "no dollar arguments known" */ last_dollar = 0; MemSet(argtypes, 0, sizeof(argtypes)); /* * This loop must accept the same format strings as the one in dopr(). * However, we don't need to analyze them to the same level of detail. * * Since we're only called if there's a dollar-type spec somewhere, we can * fail immediately if we find a non-dollar spec. Per the C99 standard, * all argument references in the format string must be one or the other. */ while (*format != '\0') { /* Locate next conversion specifier */ if (*format != '%') { /* Unlike dopr, we can just quit if there's no more specifiers */ format = strchr(format + 1, '%'); if (format == NULL) break; } /* Process conversion spec starting at *format */ format++; longflag = longlongflag = 0; fmtpos = accum = 0; afterstar = false; nextch1: ch = *format++; switch (ch) { case '-': case '+': goto nextch1; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': accum = accum * 10 + (ch - '0'); goto nextch1; case '.': accum = 0; goto nextch1; case '*': if (afterstar) return false; /* previous star missing dollar */ afterstar = true; accum = 0; goto nextch1; case '$': if (accum <= 0 || accum > PG_NL_ARGMAX) return false; if (afterstar) { if (argtypes[accum] && argtypes[accum] != ATYPE_INT) return false; argtypes[accum] = ATYPE_INT; last_dollar = Max(last_dollar, accum); afterstar = false; } else fmtpos = accum; accum = 0; goto nextch1; case 'l': if (longflag) longlongflag = 1; else longflag = 1; goto nextch1; case 'z': #if SIZEOF_SIZE_T == 8 #ifdef HAVE_LONG_INT_64 longflag = 1; #elif defined(HAVE_LONG_LONG_INT_64) longlongflag = 1; #else #error "Don't know how to print 64bit integers" #endif #else /* assume size_t is same 
size as int */ #endif goto nextch1; case 'h': case '\'': /* ignore these */ goto nextch1; case 'd': case 'i': case 'o': case 'u': case 'x': case 'X': if (fmtpos) { PrintfArgType atype; if (longlongflag) atype = ATYPE_LONGLONG; else if (longflag) atype = ATYPE_LONG; else atype = ATYPE_INT; if (argtypes[fmtpos] && argtypes[fmtpos] != atype) return false; argtypes[fmtpos] = atype; last_dollar = Max(last_dollar, fmtpos); } else return false; /* non-dollar conversion spec */ break; case 'c': if (fmtpos) { if (argtypes[fmtpos] && argtypes[fmtpos] != ATYPE_INT) return false; argtypes[fmtpos] = ATYPE_INT; last_dollar = Max(last_dollar, fmtpos); } else return false; /* non-dollar conversion spec */ break; case 's': case 'p': if (fmtpos) { if (argtypes[fmtpos] && argtypes[fmtpos] != ATYPE_CHARPTR) return false; argtypes[fmtpos] = ATYPE_CHARPTR; last_dollar = Max(last_dollar, fmtpos); } else return false; /* non-dollar conversion spec */ break; case 'e': case 'E': case 'f': case 'g': case 'G': if (fmtpos) { if (argtypes[fmtpos] && argtypes[fmtpos] != ATYPE_DOUBLE) return false; argtypes[fmtpos] = ATYPE_DOUBLE; last_dollar = Max(last_dollar, fmtpos); } else return false; /* non-dollar conversion spec */ break; case 'm': case '%': break; default: return false; /* bogus format string */ } /* * If we finish the spec with afterstar still set, there's a * non-dollar star in there. */ if (afterstar) return false; /* non-dollar conversion spec */ } /* * Format appears valid so far, so collect the arguments in physical * order. (Since we rejected any non-dollar specs that would have * collected arguments, we know that dopr() hasn't collected any yet.) 
*/ for (i = 1; i <= last_dollar; i++) { switch (argtypes[i]) { case ATYPE_NONE: return false; case ATYPE_INT: argvalues[i].i = va_arg(args, int); break; case ATYPE_LONG: argvalues[i].l = va_arg(args, long); break; case ATYPE_LONGLONG: argvalues[i].ll = va_arg(args, long long); break; case ATYPE_DOUBLE: argvalues[i].d = va_arg(args, double); break; case ATYPE_CHARPTR: argvalues[i].cptr = va_arg(args, char *); break; } } return true; } static void fmtstr(const char *value, int leftjust, int minlen, int maxwidth, int pointflag, PrintfTarget *target) { int padlen, vallen; /* amount to pad */ /* * If a maxwidth (precision) is specified, we must not fetch more bytes * than that. */ if (pointflag) vallen = strnlen(value, maxwidth); else vallen = strlen(value); padlen = compute_padlen(minlen, vallen, leftjust); if (padlen > 0) { dopr_outchmulti(' ', padlen, target); padlen = 0; } dostr(value, vallen, target); trailing_pad(padlen, target); } static void fmtptr(const void *value, PrintfTarget *target) { int vallen; char convert[64]; /* we rely on regular C library's sprintf to do the basic conversion */ vallen = sprintf(convert, "%p", value); if (vallen < 0) target->failed = true; else dostr(convert, vallen, target); } static void fmtint(long long value, char type, int forcesign, int leftjust, int minlen, int zpad, int precision, int pointflag, PrintfTarget *target) { unsigned long long uvalue; int base; int dosign; const char *cvt = "0123456789abcdef"; int signvalue = 0; char convert[64]; int vallen = 0; int padlen; /* amount to pad */ int zeropad; /* extra leading zeroes */ switch (type) { case 'd': case 'i': base = 10; dosign = 1; break; case 'o': base = 8; dosign = 0; break; case 'u': base = 10; dosign = 0; break; case 'x': base = 16; dosign = 0; break; case 'X': cvt = "0123456789ABCDEF"; base = 16; dosign = 0; break; default: return; /* keep compiler quiet */ } /* disable MSVC warning about applying unary minus to an unsigned value */ #ifdef _MSC_VER #pragma 
warning(push) #pragma warning(disable: 4146) #endif /* Handle +/- */ if (dosign && adjust_sign((value < 0), forcesign, &signvalue)) uvalue = -(unsigned long long) value; else uvalue = (unsigned long long) value; #ifdef _MSC_VER #pragma warning(pop) #endif /* * SUS: the result of converting 0 with an explicit precision of 0 is no * characters */ if (value == 0 && pointflag && precision == 0) vallen = 0; else { /* * Convert integer to string. We special-case each of the possible * base values so as to avoid general-purpose divisions. On most * machines, division by a fixed constant can be done much more * cheaply than a general divide. */ if (base == 10) { do { convert[sizeof(convert) - (++vallen)] = cvt[uvalue % 10]; uvalue = uvalue / 10; } while (uvalue); } else if (base == 16) { do { convert[sizeof(convert) - (++vallen)] = cvt[uvalue % 16]; uvalue = uvalue / 16; } while (uvalue); } else /* base == 8 */ { do { convert[sizeof(convert) - (++vallen)] = cvt[uvalue % 8]; uvalue = uvalue / 8; } while (uvalue); } } zeropad = Max(0, precision - vallen); padlen = compute_padlen(minlen, vallen + zeropad, leftjust); leading_pad(zpad, signvalue, &padlen, target); if (zeropad > 0) dopr_outchmulti('0', zeropad, target); dostr(convert + sizeof(convert) - vallen, vallen, target); trailing_pad(padlen, target); } static void fmtchar(int value, int leftjust, int minlen, PrintfTarget *target) { int padlen; /* amount to pad */ padlen = compute_padlen(minlen, 1, leftjust); if (padlen > 0) { dopr_outchmulti(' ', padlen, target); padlen = 0; } dopr_outch(value, target); trailing_pad(padlen, target); } static void fmtfloat(double value, char type, int forcesign, int leftjust, int minlen, int zpad, int precision, int pointflag, PrintfTarget *target) { int signvalue = 0; int prec; int vallen; char fmt[8]; char convert[1024]; int zeropadlen = 0; /* amount to pad with zeroes */ int padlen; /* amount to pad with spaces */ /* * We rely on the regular C library's sprintf to do the basic 
conversion, * then handle padding considerations here. * * The dynamic range of "double" is about 1E+-308 for IEEE math, and not * too wildly more than that with other hardware. In "f" format, sprintf * could therefore generate at most 308 characters to the left of the * decimal point; while we need to allow the precision to get as high as * 308+17 to ensure that we don't truncate significant digits from very * small values. To handle both these extremes, we use a buffer of 1024 * bytes and limit requested precision to 350 digits; this should prevent * buffer overrun even with non-IEEE math. If the original precision * request was more than 350, separately pad with zeroes. * * We handle infinities and NaNs specially to ensure platform-independent * output. */ if (precision < 0) /* cover possible overflow of "accum" */ precision = 0; prec = Min(precision, 350); if (isnan(value)) { strcpy(convert, "NaN"); vallen = 3; /* no zero padding, regardless of precision spec */ } else { /* * Handle sign (NaNs have no sign, so we don't do this in the case * above). "value < 0.0" will not be true for IEEE minus zero, so we * detect that by looking for the case where value equals 0.0 * according to == but not according to memcmp. */ static const double dzero = 0.0; if (adjust_sign((value < 0.0 || (value == 0.0 && memcmp(&value, &dzero, sizeof(double)) != 0)), forcesign, &signvalue)) value = -value; if (isinf(value)) { strcpy(convert, "Infinity"); vallen = 8; /* no zero padding, regardless of precision spec */ } else if (pointflag) { zeropadlen = precision - prec; fmt[0] = '%'; fmt[1] = '.'; fmt[2] = '*'; fmt[3] = type; fmt[4] = '\0'; vallen = sprintf(convert, fmt, prec, value); } else { fmt[0] = '%'; fmt[1] = type; fmt[2] = '\0'; vallen = sprintf(convert, fmt, value); } if (vallen < 0) goto fail; /* * Windows, alone among our supported platforms, likes to emit * three-digit exponent fields even when two digits would do. 
Hack * such results to look like the way everyone else does it. */ #ifdef WIN32 if (vallen >= 6 && convert[vallen - 5] == 'e' && convert[vallen - 3] == '0') { convert[vallen - 3] = convert[vallen - 2]; convert[vallen - 2] = convert[vallen - 1]; vallen--; } #endif } padlen = compute_padlen(minlen, vallen + zeropadlen, leftjust); leading_pad(zpad, signvalue, &padlen, target); if (zeropadlen > 0) { /* If 'e' or 'E' format, inject zeroes before the exponent */ char *epos = strrchr(convert, 'e'); if (!epos) epos = strrchr(convert, 'E'); if (epos) { /* pad before exponent */ dostr(convert, epos - convert, target); dopr_outchmulti('0', zeropadlen, target); dostr(epos, vallen - (epos - convert), target); } else { /* no exponent, pad after the digits */ dostr(convert, vallen, target); dopr_outchmulti('0', zeropadlen, target); } } else { /* no zero padding, just emit the number as-is */ dostr(convert, vallen, target); } trailing_pad(padlen, target); return; fail: target->failed = true; } /* * Nonstandard entry point to print a double value efficiently. * * This is approximately equivalent to strfromd(), but has an API more * adapted to what float8out() wants. The behavior is like snprintf() * with a format of "%.ng", where n is the specified precision. * However, the target buffer must be nonempty (i.e. count > 0), and * the precision is silently bounded to a sane range. */ int pg_strfromd(char *str, size_t count, int precision, double value) { PrintfTarget target; int signvalue = 0; int vallen; char fmt[8]; char convert[64]; /* Set up the target like pg_snprintf, but require nonempty buffer */ Assert(count > 0); target.bufstart = target.bufptr = str; target.bufend = str + count - 1; target.stream = NULL; target.nchars = 0; target.failed = false; /* * We bound precision to a reasonable range; the combination of this and * the knowledge that we're using "g" format without padding allows the * convert[] buffer to be reasonably small. 
*/ if (precision < 1) precision = 1; else if (precision > 32) precision = 32; /* * The rest is just an inlined version of the fmtfloat() logic above, * simplified using the knowledge that no padding is wanted. */ if (isnan(value)) { strcpy(convert, "NaN"); vallen = 3; } else { static const double dzero = 0.0; if (value < 0.0 || (value == 0.0 && memcmp(&value, &dzero, sizeof(double)) != 0)) { signvalue = '-'; value = -value; } if (isinf(value)) { strcpy(convert, "Infinity"); vallen = 8; } else { fmt[0] = '%'; fmt[1] = '.'; fmt[2] = '*'; fmt[3] = 'g'; fmt[4] = '\0'; vallen = sprintf(convert, fmt, precision, value); if (vallen < 0) { target.failed = true; goto fail; } #ifdef WIN32 if (vallen >= 6 && convert[vallen - 5] == 'e' && convert[vallen - 3] == '0') { convert[vallen - 3] = convert[vallen - 2]; convert[vallen - 2] = convert[vallen - 1]; vallen--; } #endif } } if (signvalue) dopr_outch(signvalue, &target); dostr(convert, vallen, &target); fail: *(target.bufptr) = '\0'; return target.failed ? -1 : (target.bufptr - target.bufstart + target.nchars); } static void dostr(const char *str, int slen, PrintfTarget *target) { /* fast path for common case of slen == 1 */ if (slen == 1) { dopr_outch(*str, target); return; } while (slen > 0) { int avail; if (target->bufend != NULL) avail = target->bufend - target->bufptr; else avail = slen; if (avail <= 0) { /* buffer full, can we dump to stream? */ if (target->stream == NULL) { target->nchars += slen; /* no, lose the data */ return; } flushbuffer(target); continue; } avail = Min(avail, slen); memmove(target->bufptr, str, avail); target->bufptr += avail; str += avail; slen -= avail; } } static void dopr_outch(int c, PrintfTarget *target) { if (target->bufend != NULL && target->bufptr >= target->bufend) { /* buffer full, can we dump to stream? 
*/ if (target->stream == NULL) { target->nchars++; /* no, lose the data */ return; } flushbuffer(target); } *(target->bufptr++) = c; } static void dopr_outchmulti(int c, int slen, PrintfTarget *target) { /* fast path for common case of slen == 1 */ if (slen == 1) { dopr_outch(c, target); return; } while (slen > 0) { int avail; if (target->bufend != NULL) avail = target->bufend - target->bufptr; else avail = slen; if (avail <= 0) { /* buffer full, can we dump to stream? */ if (target->stream == NULL) { target->nchars += slen; /* no, lose the data */ return; } flushbuffer(target); continue; } avail = Min(avail, slen); memset(target->bufptr, c, avail); target->bufptr += avail; slen -= avail; } } static int adjust_sign(int is_negative, int forcesign, int *signvalue) { if (is_negative) { *signvalue = '-'; return true; } else if (forcesign) *signvalue = '+'; return false; } static int compute_padlen(int minlen, int vallen, int leftjust) { int padlen; padlen = minlen - vallen; if (padlen < 0) padlen = 0; if (leftjust) padlen = -padlen; return padlen; } static void leading_pad(int zpad, int signvalue, int *padlen, PrintfTarget *target) { int maxpad; if (*padlen > 0 && zpad) { if (signvalue) { dopr_outch(signvalue, target); --(*padlen); signvalue = 0; } if (*padlen > 0) { dopr_outchmulti(zpad, *padlen, target); *padlen = 0; } } maxpad = (signvalue != 0); if (*padlen > maxpad) { dopr_outchmulti(' ', *padlen - maxpad, target); *padlen = maxpad; } if (signvalue) { dopr_outch(signvalue, target); if (*padlen > 0) --(*padlen); else if (*padlen < 0) ++(*padlen); } } static void trailing_pad(int padlen, PrintfTarget *target) { if (padlen < 0) dopr_outchmulti(' ', -padlen, target); } pg_auto_failover-1.6.3/src/bin/lib/pg/snprintf.h000066400000000000000000000031641414244367200216100ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * port.h * Header for src/port/ compatibility functions. 
* * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/include/port.h * *------------------------------------------------------------------------- */ #ifndef PG_SNPRINTF_H #define PG_SNPRINTF_H #include "postgres_fe.h" #ifndef USE_REPL_SNPRINTF int pg_vsnprintf(char *str, size_t count, const char *fmt, va_list args); int pg_snprintf(char *str, size_t count, const char *fmt,...) __attribute__((format(printf, 3, 4))); int pg_vsprintf(char *str, const char *fmt, va_list args); int pg_sprintf(char *str, const char *fmt,...) __attribute__((format(printf, 2, 3))); int pg_vfprintf(FILE *stream, const char *fmt, va_list args); int pg_fprintf(FILE *stream, const char *fmt,...) __attribute__((format(printf, 2, 3))); int pg_vprintf(const char *fmt, va_list args); int pg_printf(const char *fmt,...) __attribute__((format(printf, 1, 2))); /* This is also provided by snprintf.c */ int pg_strfromd(char *str, size_t count, int precision, double value); /* Replace strerror() with our own, somewhat more robust wrapper */ extern char *pg_strerror(int errnum); #define strerror pg_strerror /* Likewise for strerror_r(); note we prefer the GNU API for that */ extern char *pg_strerror_r(int errnum, char *buf, size_t buflen); #define strerror_r pg_strerror_r #define PG_STRERROR_R_BUFLEN 256 /* Recommended buffer size for strerror_r */ #endif /* USE_REPL_SNPRINTF */ #endif /* PG_SNPRINTF_H */ pg_auto_failover-1.6.3/src/bin/lib/pg/strerror.c000066400000000000000000000154041414244367200216220ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * strerror.c * Replacements for standard strerror() and strerror_r() functions * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION * src/port/strerror.c * 
*------------------------------------------------------------------------- */ #include "postgres_fe.h" #include "snprintf.h" #include "c.h" /* * Within this file, "strerror" means the platform's function not pg_strerror, * and likewise for "strerror_r" */ #undef strerror #undef strerror_r static char *gnuish_strerror_r(int errnum, char *buf, size_t buflen); static char *get_errno_symbol(int errnum); #ifdef WIN32 static char *win32_socket_strerror(int errnum, char *buf, size_t buflen); #endif /* * A slightly cleaned-up version of strerror() */ char * pg_strerror(int errnum) { static char errorstr_buf[PG_STRERROR_R_BUFLEN]; return pg_strerror_r(errnum, errorstr_buf, sizeof(errorstr_buf)); } /* * A slightly cleaned-up version of strerror_r() */ char * pg_strerror_r(int errnum, char *buf, size_t buflen) { char *str; /* If it's a Windows Winsock error, that needs special handling */ #ifdef WIN32 /* Winsock error code range, per WinError.h */ if (errnum >= 10000 && errnum <= 11999) return win32_socket_strerror(errnum, buf, buflen); #endif /* Try the platform's strerror_r(), or maybe just strerror() */ str = gnuish_strerror_r(errnum, buf, buflen); /* * Some strerror()s return an empty string for out-of-range errno. This * is ANSI C spec compliant, but not exactly useful. Also, we may get * back strings of question marks if libc cannot transcode the message to * the codeset specified by LC_CTYPE. If we get nothing useful, first try * get_errno_symbol(), and if that fails, print the numeric errno. */ if (str == NULL || *str == '\0' || *str == '?') str = get_errno_symbol(errnum); if (str == NULL) { snprintf(buf, buflen, _("operating system error %d"), errnum); str = buf; } return str; } /* * Simple wrapper to emulate GNU strerror_r if what the platform provides is * POSIX. Also, if platform lacks strerror_r altogether, fall back to plain * strerror; it might not be very thread-safe, but tough luck. 
*/ static char * gnuish_strerror_r(int errnum, char *buf, size_t buflen) { #ifdef HAVE_STRERROR_R #ifdef STRERROR_R_INT /* POSIX API */ if (strerror_r(errnum, buf, buflen) == 0) return buf; return NULL; /* let caller deal with failure */ #else /* GNU API */ return strerror_r(errnum, buf, buflen); #endif #else /* !HAVE_STRERROR_R */ char *sbuf = strerror(errnum); if (sbuf == NULL) /* can this still happen anywhere? */ return NULL; /* To minimize thread-unsafety hazard, copy into caller's buffer */ strlcpy(buf, sbuf, buflen); return buf; #endif } /* * Returns a symbol (e.g. "ENOENT") for an errno code. * Returns NULL if the code is unrecognized. */ static char * get_errno_symbol(int errnum) { switch (errnum) { case E2BIG: return "E2BIG"; case EACCES: return "EACCES"; case EADDRINUSE: return "EADDRINUSE"; case EADDRNOTAVAIL: return "EADDRNOTAVAIL"; case EAFNOSUPPORT: return "EAFNOSUPPORT"; #ifdef EAGAIN case EAGAIN: return "EAGAIN"; #endif #ifdef EALREADY case EALREADY: return "EALREADY"; #endif case EBADF: return "EBADF"; #ifdef EBADMSG case EBADMSG: return "EBADMSG"; #endif case EBUSY: return "EBUSY"; case ECHILD: return "ECHILD"; case ECONNABORTED: return "ECONNABORTED"; case ECONNREFUSED: return "ECONNREFUSED"; case ECONNRESET: return "ECONNRESET"; case EDEADLK: return "EDEADLK"; case EDOM: return "EDOM"; case EEXIST: return "EEXIST"; case EFAULT: return "EFAULT"; case EFBIG: return "EFBIG"; case EHOSTDOWN: return "EHOSTDOWN"; case EHOSTUNREACH: return "EHOSTUNREACH"; case EIDRM: return "EIDRM"; case EINPROGRESS: return "EINPROGRESS"; case EINTR: return "EINTR"; case EINVAL: return "EINVAL"; case EIO: return "EIO"; case EISCONN: return "EISCONN"; case EISDIR: return "EISDIR"; #ifdef ELOOP case ELOOP: return "ELOOP"; #endif case EMFILE: return "EMFILE"; case EMLINK: return "EMLINK"; case EMSGSIZE: return "EMSGSIZE"; case ENAMETOOLONG: return "ENAMETOOLONG"; case ENETDOWN: return "ENETDOWN"; case ENETRESET: return "ENETRESET"; case ENETUNREACH: return "ENETUNREACH"; 
case ENFILE: return "ENFILE"; case ENOBUFS: return "ENOBUFS"; case ENODEV: return "ENODEV"; case ENOENT: return "ENOENT"; case ENOEXEC: return "ENOEXEC"; case ENOMEM: return "ENOMEM"; case ENOSPC: return "ENOSPC"; case ENOSYS: return "ENOSYS"; case ENOTCONN: return "ENOTCONN"; case ENOTDIR: return "ENOTDIR"; #if defined(ENOTEMPTY) && (ENOTEMPTY != EEXIST) /* same code on AIX */ case ENOTEMPTY: return "ENOTEMPTY"; #endif case ENOTSOCK: return "ENOTSOCK"; #ifdef ENOTSUP case ENOTSUP: return "ENOTSUP"; #endif case ENOTTY: return "ENOTTY"; case ENXIO: return "ENXIO"; #if defined(EOPNOTSUPP) && (!defined(ENOTSUP) || (EOPNOTSUPP != ENOTSUP)) case EOPNOTSUPP: return "EOPNOTSUPP"; #endif #ifdef EOVERFLOW case EOVERFLOW: return "EOVERFLOW"; #endif case EPERM: return "EPERM"; case EPIPE: return "EPIPE"; case EPROTONOSUPPORT: return "EPROTONOSUPPORT"; case ERANGE: return "ERANGE"; #ifdef EROFS case EROFS: return "EROFS"; #endif case ESRCH: return "ESRCH"; case ETIMEDOUT: return "ETIMEDOUT"; #ifdef ETXTBSY case ETXTBSY: return "ETXTBSY"; #endif #if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN)) case EWOULDBLOCK: return "EWOULDBLOCK"; #endif case EXDEV: return "EXDEV"; } return NULL; } #ifdef WIN32 /* * Windows' strerror() doesn't know the Winsock codes, so handle them this way */ static char * win32_socket_strerror(int errnum, char *buf, size_t buflen) { static HANDLE handleDLL = INVALID_HANDLE_VALUE; if (handleDLL == INVALID_HANDLE_VALUE) { handleDLL = LoadLibraryEx("netmsg.dll", NULL, DONT_RESOLVE_DLL_REFERENCES | LOAD_LIBRARY_AS_DATAFILE); if (handleDLL == NULL) { snprintf(buf, buflen, "winsock error %d (could not load netmsg.dll to translate: error code %lu)", errnum, GetLastError()); return buf; } } ZeroMemory(buf, buflen); if (FormatMessage(FORMAT_MESSAGE_IGNORE_INSERTS | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_FROM_HMODULE, handleDLL, errnum, MAKELANGID(LANG_ENGLISH, SUBLANG_DEFAULT), buf, buflen - 1, NULL) == 0) { /* Failed to get id */ 
snprintf(buf, buflen, "unrecognized winsock error %d", errnum); } return buf; } #endif /* WIN32 */ pg_auto_failover-1.6.3/src/bin/lib/subcommands.c/000077500000000000000000000000001414244367200217165ustar00rootroot00000000000000pg_auto_failover-1.6.3/src/bin/lib/subcommands.c/commandline.c000066400000000000000000000142061414244367200243530ustar00rootroot00000000000000/* * commandline.h * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include "commandline.h" CommandLine *current_command = NULL; static void commandline_pretty_print_subcommands(CommandLine *command, FILE *stream); /* * Implementation of the main subcommands entry point. * * Parses the command line given the Command_t cmd context, and run commands * that match with the subcommand definitions. * * It returns false if the command parsing failed. In that case it will also * output a helpful error message to stderr. */ bool commandline_run(CommandLine *command, int argc, char **argv) { const char *argv0 = "unknown_command"; const char *breadcrumb = NULL; if (argc > 0) { argv0 = argv[0]; } breadcrumb = command->breadcrumb == NULL ? argv0 : command->breadcrumb; /* * If the user gives the --help option at this point, describe the current * command. */ if (argc >= 2 && (streq(argv[1], "--help") || streq(argv[1], "-h"))) { commandline_print_usage(command, stderr); return true; } current_command = command; /* Otherwise let the command parse any options that occur here. */ if (command->getopt != NULL) { int option_count = command->getopt(argc, argv); argc -= option_count; argv += option_count; } else { argc--; argv++; } if (command->run != NULL) { command->run(argc, argv); return true; } if (argc == 0) { /* * We're at the end of the command line already, and command->run is * not set, which means we expected a subcommand to be used, but none * have been given by the user. Inform him. 
*/ fprintf(stderr, "%s: expected a command\n", breadcrumb); } else if (command->subcommands != NULL) { CommandLine **subcommand = command->subcommands; for (; *subcommand != NULL; subcommand++) { if (streq(argv[0], (*subcommand)->name)) { commandline_add_breadcrumb(command, *subcommand); return commandline_run(*subcommand, argc, argv); } } /* if we reach this code, we didn't find a subcommand */ fprintf(stderr, "%s: %s: unknown command\n", breadcrumb, argv[0]); } else { /* This should not be reached */ fprintf(stderr, "%s: an unexpected state was reached during command parsing\n", breadcrumb); } /* * Print the subcommands after the actual error message for easy fixing of * the command */ fprintf(stderr, "\n"); commandline_print_subcommands(command, stderr); return false; } /* * Print help message for the known currently running command. */ void commandline_help(FILE *stream) { if (current_command != NULL) { commandline_print_usage(current_command, stream); } } /* * Helper function to print usage and help message for a command. */ void commandline_print_usage(CommandLine *command, FILE *stream) { const char *breadcrumb = command->breadcrumb == NULL ? command->name : command->breadcrumb; fprintf(stream, "%s:", breadcrumb); if (command->shortDescription) { fprintf(stream, " %s", command->shortDescription); } fprintf(stream, "\n"); if (command->usageSuffix) { fprintf(stream, "usage: %s %s\n", breadcrumb, command->usageSuffix); fprintf(stream, "\n"); } if (command->help) { fprintf(stream, "%s\n", command->help); } if (command->subcommands) { fprintf(stream, "\n"); commandline_print_subcommands(command, stream); } fflush(stream); } /* * Print the list of subcommands accepted from a command. */ void commandline_print_subcommands(CommandLine *command, FILE *stream) { /* the root command doesn't have a breadcrumb at this point */ const char *breadcrumb = command->breadcrumb == NULL ? 
command->name : command->breadcrumb; fprintf(stream, "Available commands:\n %s\n", breadcrumb); commandline_pretty_print_subcommands(command, stream); fprintf(stream, "\n"); } /* * commandline_print_command_tree walks a command tree and prints out its whole * set of commands, recursively. */ void commandline_print_command_tree(CommandLine *command, FILE *stream) { if (command != NULL) { const char *breadcrumb = command->breadcrumb == NULL ? command->name : command->breadcrumb; if (command->subcommands != NULL) { CommandLine **subcommand; fprintf(stream, " %s\n", breadcrumb); commandline_pretty_print_subcommands(command, stream); fprintf(stream, "\n"); for (subcommand = command->subcommands; *subcommand != NULL; subcommand++) { commandline_add_breadcrumb(command, *subcommand); commandline_print_command_tree(*subcommand, stream); } } } } /* * commandline_pretty_print_subcommands pretty prints a list of subcommands. */ static void commandline_pretty_print_subcommands(CommandLine *command, FILE *stream) { if (command->subcommands != NULL) { CommandLine **subcommand; int maxLength = 0; /* pretty printing: reduce maximum length of subcommand names */ for (subcommand = command->subcommands; *subcommand != NULL; subcommand++) { int len = strlen((*subcommand)->name); if (maxLength < len) { maxLength = len; } } for (subcommand = command->subcommands; *subcommand != NULL; subcommand++) { const char *description = ""; if ((*subcommand)->shortDescription != NULL) { description = (*subcommand)->shortDescription; } fprintf(stream, " %c %*s %s\n", (*subcommand)->subcommands ? '+' : ' ', (int) -maxLength, (*subcommand)->name, description); } } } /* * Add command to the breadcrumb of subcommand. 
* * The idea is to be able to print the list of subcommands in the help * messages, as in the following example: * * $ ./foo env get --help * foo env get: short description */ void commandline_add_breadcrumb(CommandLine *command, CommandLine *subcommand) { const char *command_bc = command->breadcrumb ? command->breadcrumb : command->name; int breadcrumbLength = strlen(command_bc); int subcommandLength = strlen(subcommand->name); breadcrumbLength += subcommandLength + 2; subcommand->breadcrumb = (char *) malloc(breadcrumbLength * sizeof(char)); sprintf(subcommand->breadcrumb, "%s %s", command_bc, subcommand->name); } pg_auto_failover-1.6.3/src/bin/lib/subcommands.c/commandline.h000066400000000000000000000025321414244367200243570ustar00rootroot00000000000000/* * commandline.h * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef COMMANDLINE_H #define COMMANDLINE_H #include #include #include #include typedef int (*command_getopt)(int argc, char **argv); typedef void (*command_run)(int argc, char **argv); typedef struct CommandLine { const char *name; const char *shortDescription; const char *usageSuffix; const char *help; command_getopt getopt; command_run run; struct CommandLine **subcommands; char *breadcrumb; } CommandLine; extern CommandLine *current_command; #define make_command_set(name, desc, usage, help, getopt, set) \ { name, desc, usage, help, getopt, NULL, set, NULL } #define make_command(name, desc, usage, help, getopt, run) \ { name, desc, usage, help, getopt, run, NULL, NULL } bool commandline_run(CommandLine *command, int argc, char **argv); void commandline_help(FILE *stream); void commandline_print_usage(CommandLine *command, FILE *stream); void commandline_print_subcommands(CommandLine *command, FILE *stream); void commandline_print_command_tree(CommandLine *command, FILE *stream); void commandline_add_breadcrumb(CommandLine *command, CommandLine *subcommand); #define streq(a, b) (a != 
NULL && b != NULL && strcmp(a, b) == 0) #endif /* COMMANDLINE_H */ pg_auto_failover-1.6.3/src/bin/lib/subcommands.c/runprogram.h000066400000000000000000000357771414244367200243060ustar00rootroot00000000000000/* * runprogram.h * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include #include #include #include #include #include "pqexpbuffer.h" #define BUFSIZE 1024 #define ARGS_INCREMENT 12 #if defined(WIN32) && !defined(__CYGWIN__) #define DEV_NULL "NUL" #else #define DEV_NULL "/dev/null" #endif #define MAX(a, b) (((a) > (b)) ? (a) : (b)) typedef struct { char *program; char **args; bool setsid; /* shall we call setsid() ? */ int error; /* save errno when something's gone wrong */ int returnCode; bool capture; /* do we capture output, or redirect it? */ bool tty; /* do we share our tty? */ /* register a function to process output as it appears */ void (*processBuffer)(const char *buffer, bool error); int stdOutFd; /* redirect stdout to file descriptor */ int stdErrFd; /* redirect stderr to file descriptor */ char *stdOut; char *stdErr; } Program; Program run_program(const char *program, ...); void initialize_program(Program *prog, char **args, bool setsid); void execute_subprogram(Program *prog); void execute_program(Program *prog); void free_program(Program *prog); int snprintf_program_command_line(Program *prog, char *buffer, int size); #ifdef RUN_PROGRAM_IMPLEMENTATION #undef RUN_PROGRAM_IMPLEMENTATION static void exit_internal_error(void); static void dup2_or_exit(int fildes, int fildes2); static void close_or_exit(int fildes); static void read_from_pipes(Program *prog, pid_t childPid, int *outpipe, int *errpipe); static size_t read_into_buf(Program *prog, int filedes, PQExpBuffer buffer, bool error); static void waitprogram(Program *prog, pid_t childPid); /* * Run a program using fork() and exec(), get the stdOut and stdErr output from * the run and then return a 
Program struct instance with the result of running * the program. */ Program run_program(const char *program, ...) { int nb_args = 0; va_list args; const char *param; Program prog = { 0 }; prog.program = strdup(program); prog.returnCode = -1; prog.error = 0; prog.setsid = false; prog.capture = true; prog.tty = false; prog.processBuffer = NULL; prog.stdOutFd = -1; prog.stdErrFd = -1; prog.stdOut = NULL; prog.stdErr = NULL; prog.args = (char **) malloc(ARGS_INCREMENT * sizeof(char *)); prog.args[nb_args++] = prog.program; va_start(args, program); while ((param = va_arg(args, const char *)) != NULL) { if (nb_args % ARGS_INCREMENT == 0) { char **newargs = (char **) malloc((ARGS_INCREMENT * (nb_args / ARGS_INCREMENT + 1)) * sizeof(char *)); for (int i = 0; i < nb_args; i++) { newargs[i] = prog.args[i]; } free(prog.args); prog.args = newargs; } prog.args[nb_args++] = strdup(param); } va_end(args); prog.args[nb_args] = NULL; execute_subprogram(&prog); return prog; } /* * Initialize a program structure that can be executed later, allowing the * caller to manipulate the structure for itself. Safe to change are program, * args and setsid structure slots. 
*/ void initialize_program(Program *prog, char **args, bool setsid) { int argsIndex, nb_args = 0; /* we want to have a deterministic starting point */ *prog = (Program) { 0 }; prog->returnCode = -1; prog->error = 0; prog->setsid = setsid; /* this could be changed by the caller before calling execute_program */ prog->capture = true; prog->tty = false; prog->processBuffer = NULL; prog->stdOutFd = -1; prog->stdErrFd = -1; prog->stdOut = NULL; prog->stdErr = NULL; for (argsIndex = 0; args[argsIndex] != NULL; argsIndex++) { ++nb_args; } /* add another one nb_args for the terminating NULL entry */ prog->args = (char **) malloc(++nb_args * sizeof(char *)); memset(prog->args, 0, nb_args * sizeof(char *)); for (argsIndex = 0; args[argsIndex] != NULL; argsIndex++) { prog->args[argsIndex] = strdup(args[argsIndex]); } prog->program = prog->args[0]; } /* * Run given program with its args, by doing the fork()/exec() dance, and also * capture the subprocess output by installing pipes. We accumulate the output * into a PQExpBuffer when prog->capture is true. 
*/ void execute_subprogram(Program *prog) { pid_t pid; int outpipe[2] = { 0, 0 }; int errpipe[2] = { 0, 0 }; /* first level sanity check */ if (access(prog->program, F_OK | X_OK) == -1) { fprintf(stderr, "Failed to find executable program at \"%s\": %s\n", prog->program, strerror(errno)); prog->returnCode = -1; prog->error = errno; return; } /* Flush stdio channels just before fork, to avoid double-output problems */ fflush(stdout); fflush(stderr); /* create the output capture pipes now */ if (prog->capture) { if (pipe(outpipe) < 0) { prog->returnCode = -1; prog->error = errno; return; } if (pipe(errpipe) < 0) { prog->returnCode = -1; prog->error = errno; return; } } pid = fork(); switch (pid) { case -1: { /* fork failed */ prog->returnCode = -1; prog->error = errno; return; } case 0: { /* fork succeeded, in child */ if (prog->tty == false) { /* * We redirect /dev/null into stdIn rather than closing stdin, * because apparently closing it may cause undefined behavior * if any read was to happen. */ int stdIn = open(DEV_NULL, O_RDONLY); if (stdIn == -1) { (void) exit_internal_error(); } (void) dup2_or_exit(stdIn, STDIN_FILENO); (void) close_or_exit(stdIn); /* * Prepare either for capture the output in pipes, or redirect * to the given open file descriptors. */ if (prog->capture) { (void) dup2_or_exit(outpipe[1], STDOUT_FILENO); (void) dup2(errpipe[1], STDERR_FILENO); (void) close_or_exit(outpipe[0]); (void) close_or_exit(outpipe[1]); (void) close_or_exit(errpipe[0]); (void) close_or_exit(errpipe[1]); } else { (void) dup2_or_exit(prog->stdOutFd, STDOUT_FILENO); (void) dup2_or_exit(prog->stdErrFd, STDERR_FILENO); } } /* * When asked to do so, before creating the child process, we call * setsid() to create our own session group and detach from the * terminal. That's useful when starting a service in the * background. 
*/ if (prog->setsid) { if (setsid() == -1) { prog->returnCode = -1; prog->error = errno; return; } } if (execv(prog->program, prog->args) == -1) { prog->returnCode = -1; prog->error = errno; fprintf(stdout, "%s\n", strerror(errno)); fprintf(stderr, "%s\n", strerror(errno)); exit(EXIT_CODE_INTERNAL_ERROR); } return; } default: { /* fork succeeded, in parent */ if (prog->capture) { read_from_pipes(prog, pid, outpipe, errpipe); } else { (void) waitprogram(prog, pid); } return; } } } /* * Run given program with its args, by using exec(). * * Using exec() means that we replace the currently running program and will * take ownership of its standard input, output and error streams, etc. This * routine is not supposed to ever return, so in case when something goes * wrong, it exits the current process, which is assumed to be a sub-process * started with fork(). * * When prog->tty is true we want to share the parent's program tty with the * subprocess, and then we refrain from doing any redirection of stdin, stdout, * or stderr. */ void execute_program(Program *prog) { if (prog->capture) { fprintf(stderr, "BUG: can't execute_program and capture the output"); return; } /* first level sanity check */ if (access(prog->program, F_OK | X_OK) == -1) { fprintf(stderr, "Failed to find executable program at \"%s\": %s\n", prog->program, strerror(errno)); prog->returnCode = -1; prog->error = errno; return; } if (prog->tty == false) { /* * We redirect /dev/null into stdIn rather than closing stdin, because * apparently closing it may cause undefined behavior if any read was * to happen. 
*/ int stdIn = open(DEV_NULL, O_RDONLY); /* Avoid double-output problems */ fflush(stdout); fflush(stderr); (void) dup2_or_exit(stdIn, STDIN_FILENO); (void) close_or_exit(stdIn); (void) dup2_or_exit(prog->stdOutFd, STDOUT_FILENO); (void) dup2_or_exit(prog->stdErrFd, STDERR_FILENO); } /* * When asked to do so, before creating the child process, we call * setsid() to create our own session group and detach from the * terminal. That's useful when starting a service in the * background. */ if (prog->setsid) { if (setsid() == -1) { prog->returnCode = -1; prog->error = errno; return; } } if (execv(prog->program, prog->args) == -1) { prog->returnCode = -1; prog->error = errno; (void) exit_internal_error(); } /* now the parent should waitpid() and may use waitprogram() */ } /* * Free our memory. */ void free_program(Program *prog) { /* don't free prog->program, it's the same pointer as prog->args[0] */ for (int i = 0; prog->args[i] != NULL; i++) { free(prog->args[i]); } free(prog->args); if (prog->stdOut != NULL) { free(prog->stdOut); } if (prog->stdErr != NULL) { free(prog->stdErr); } } /* * exit_internal_error prints the strerror of the current errno to both stdin * and stdout and exits with the exit code EXIT_CODE_INTERNAL_ERROR. */ static void exit_internal_error() { fprintf(stdout, "%s\n", strerror(errno)); fprintf(stderr, "%s\n", strerror(errno)); exit(EXIT_CODE_INTERNAL_ERROR); } /* * dup2_or_exit calls dup2() on given arguments (file descriptors) and exits * when dup2() fails. */ static void dup2_or_exit(int fildes, int fildes2) { if (dup2(fildes, fildes2) == -1) { (void) exit_internal_error(); } } /* * close_or_exit calls close() on given file descriptor and exits when close() * fails. */ static void close_or_exit(int fildes) { if (close(fildes) == -1) { (void) exit_internal_error(); } } /* * read_from_pipes reads the output from the child process and sets the Program * slots stdOut and stdErr with the accumulated output we read. 
*/ static void read_from_pipes(Program *prog, pid_t childPid, int *outpipe, int *errpipe) { bool doneReading = false; int countFdsReadyToRead, nfds; /* see man select(3) */ fd_set readFileDescriptorSet; ssize_t bytes_out = BUFSIZE, bytes_err = BUFSIZE; PQExpBuffer outbuf, errbuf; /* We read from the other side of the pipe, close that part. */ close(outpipe[1]); close(errpipe[1]); nfds = MAX(outpipe[0], errpipe[0]) + 1; /* * Ok. the child process is running, let's read the pipes content. */ outbuf = createPQExpBuffer(); errbuf = createPQExpBuffer(); while (!doneReading) { FD_ZERO(&readFileDescriptorSet); /* if we read 0 bytes on the previous run, we've reached EOF */ if (bytes_out > 0) { FD_SET(outpipe[0], &readFileDescriptorSet); } if (bytes_err > 0) { FD_SET(errpipe[0], &readFileDescriptorSet); } countFdsReadyToRead = select(nfds, &readFileDescriptorSet, NULL, NULL, NULL); if (countFdsReadyToRead == -1) { switch (errno) { case EAGAIN: case EINTR: { /* just loop again */ break; } case EBADF: case EINVAL: case ENOMEM: default: { /* that's unexpected, act as if doneReading */ log_error("Failed to read from command \"%s\": %s", prog->program, strerror(errno)); doneReading = true; break; } } } else if (countFdsReadyToRead == 0) { continue; } else { if (FD_ISSET(outpipe[0], &readFileDescriptorSet)) { bytes_out = read_into_buf(prog, outpipe[0], outbuf, false); if (bytes_out == -1 && errno != 0) { prog->returnCode = -1; prog->error = errno; } } if (FD_ISSET(errpipe[0], &readFileDescriptorSet)) { bytes_err = read_into_buf(prog, errpipe[0], errbuf, true); if (bytes_err == -1 && errno != 0) { prog->returnCode = -1; prog->error = errno; } } doneReading = (bytes_out < BUFSIZE && bytes_err < BUFSIZE); } } if (outbuf->len > 0) { prog->stdOut = strndup(outbuf->data, outbuf->len); } if (errbuf->len > 0) { prog->stdErr = strndup(errbuf->data, errbuf->len); } destroyPQExpBuffer(outbuf); destroyPQExpBuffer(errbuf); /* now, wait until the child process is done. 
*/ (void) waitprogram(prog, childPid); /* * Now we're done reading from both stdOut and stdErr of the child * process, so close the file descriptors and prepare the char * * strings output in our Program structure. * * We must close the pipe after the child process has exited, * or the program may be terminated by SIGPIPE, i.e. writing to * an closed pipe. */ close(outpipe[0]); close(errpipe[0]); } /* * Wait until our Program is done. */ static void waitprogram(Program *prog, pid_t childPid) { int status; do { if (waitpid(childPid, &status, WUNTRACED) == -1) { prog->returnCode = -1; prog->error = errno; return; } } while (!WIFEXITED(status) && !WIFSIGNALED(status)); if (WIFEXITED(status)) { prog->returnCode = WEXITSTATUS(status); } else if (WIFSIGNALED(status)) { int signo = WTERMSIG(status); /* standard exit value with fatal error signal `n`: 128 + n */ prog->returnCode = 128 + signo; } else { log_fatal("unknown exit status: 0X%X", status); prog->returnCode = -1; } } /* * Read from a file descriptor and directly appends to our buffer string. */ static size_t read_into_buf(Program *prog, int filedes, PQExpBuffer buffer, bool error) { char temp_buffer[BUFSIZE+1] = { 0 }; size_t bytes = read(filedes, temp_buffer, BUFSIZE); if (bytes > 0) { /* terminate the buffer after the length we read */ temp_buffer[bytes] = '\0'; appendPQExpBufferStr(buffer, temp_buffer); if (prog->processBuffer) { (*prog->processBuffer)(temp_buffer, error); } } return bytes; } /* * Writes the full command line of the given program into the given * pre-allocated buffer of given size, and returns how many bytes would have * been written in the buffer if it was large enough, like snprintf would do. 
*/ int snprintf_program_command_line(Program *prog, char *buffer, int size) { char *currentPtr = buffer; int index, remainingBytes = BUFSIZE; if (prog->args[0] == NULL) { return 0; } for (index = 0; prog->args[index] != NULL; index++) { int n; /* replace an empty char buffer with '' */ if (prog->args[index][0] == '\0') { n = snprintf(currentPtr, remainingBytes, " ''"); } /* single-quote are needed when argument contains special chars */ else if (strchr(prog->args[index], ' ') != NULL || strchr(prog->args[index], '?') != NULL || strchr(prog->args[index], '!') != NULL) { n = snprintf(currentPtr, remainingBytes, " '%s'", prog->args[index]); } else { n = snprintf(currentPtr, remainingBytes, " %s", prog->args[index]); } if (n >= remainingBytes) { return BUFSIZE - remainingBytes + n; } currentPtr += n; remainingBytes -= n; } return BUFSIZE - remainingBytes; } #endif /* RUN_PROGRAM_IMPLEMENTATION */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/000077500000000000000000000000001414244367200205555ustar00rootroot00000000000000pg_auto_failover-1.6.3/src/bin/pg_autoctl/.gitignore000066400000000000000000000000131414244367200225370ustar00rootroot00000000000000pg_autoctl pg_auto_failover-1.6.3/src/bin/pg_autoctl/Makefile000066400000000000000000000070071414244367200222210ustar00rootroot00000000000000# Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the PostgreSQL License. 
PG_AUTOCTL = ./pg_autoctl SRC_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) DEPDIR = $(SRC_DIR)/.deps INCLUDES = $(patsubst ${SRC_DIR}%.h,%.h,$(wildcard ${SRC_DIR}*.h)) SRC = $(patsubst ${SRC_DIR}%.c,%.c,$(wildcard ${SRC_DIR}*.c)) OBJS = $(patsubst %.c,%.o,$(SRC)) OBJS += lib-log.o lib-commandline.o lib-parson.o lib-snprintf.o lib-strerror.o PG_CONFIG ?= pg_config BINDIR ?= $(shell $(PG_CONFIG) --bindir) PG_SNPRINTF = $(wildcard ${SRC_DIR}../lib/pg/snprintf.*) LOG_SRC = $(wildcard ${SRC_DIR}../lib/log/src/log.*) COMMANDLINE_SRC = $(wildcard ${SRC_DIR}../lib/subcommands.c/commandline.*) PARSON_SRC = $(wildcard ${SRC_DIR}../lib/parson/parson.*) COMMON_LIBS = -I${SRC_DIR}../lib/pg COMMON_LIBS += -I${SRC_DIR}../lib/log/src/ COMMON_LIBS += -I${SRC_DIR}../lib/subcommands.c/ COMMON_LIBS += -I${SRC_DIR}../lib/libs/ COMMON_LIBS += -I${SRC_DIR}../lib/parson/ CC = $(shell $(PG_CONFIG) --cc) DEFAULT_CFLAGS = -std=c99 -D_GNU_SOURCE -g DEFAULT_CFLAGS += -I $(shell $(PG_CONFIG) --includedir) DEFAULT_CFLAGS += -I $(shell $(PG_CONFIG) --includedir-server) DEFAULT_CFLAGS += -I $(shell $(PG_CONFIG) --pkgincludedir)/internal DEFAULT_CFLAGS += $(shell $(PG_CONFIG) --cflags) DEFAULT_CFLAGS += -Wformat DEFAULT_CFLAGS += -Wall DEFAULT_CFLAGS += -Werror=implicit-int DEFAULT_CFLAGS += -Werror=implicit-function-declaration DEFAULT_CFLAGS += -Werror=return-type DEFAULT_CFLAGS += -Wno-declaration-after-statement # Needed for OSX DEFAULT_CFLAGS += -Wno-missing-braces DEFAULT_CFLAGS += $(COMMON_LIBS) ifdef USE_SECURITY_FLAGS # Flags taken from: https://liquid.microsoft.com/Web/Object/Read/ms.security/Requirements/Microsoft.Security.SystemsADM.10203#guide SECURITY_CFLAGS=-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -z noexecstack -fpie -Wl,-pie -Wl,-z,relro -Wl,-z,now -Wformat -Wformat-security -Werror=format-security DEFAULT_CFLAGS += $(SECURITY_CFLAGS) endif override CFLAGS := $(DEFAULT_CFLAGS) $(CFLAGS) LIBS = -L $(shell $(PG_CONFIG) --pkglibdir) LIBS += -L $(shell $(PG_CONFIG) 
--libdir) LIBS += $(shell $(PG_CONFIG) --ldflags) LIBS += $(shell $(PG_CONFIG) --libs) LIBS += -lpq LIBS += -lncurses all: $(PG_AUTOCTL) ; # Based on Postgres Makefile for automatic dependency generation # https://github.com/postgres/postgres/blob/1933ae629e7b706c6c23673a381e778819db307d/src/Makefile.global.in#L890-L924 %.o : %.c @if test ! -d $(DEPDIR); then mkdir -p $(DEPDIR); fi $(CC) $(CFLAGS) -c -MMD -MP -MF$(DEPDIR)/$(*F).Po -o $@ $< Po_files := $(wildcard $(DEPDIR)/*.Po) ifneq (,$(Po_files)) include $(Po_files) endif $(PG_AUTOCTL): $(OBJS) $(INCLUDES) $(CC) $(CFLAGS) $(OBJS) $(LDFLAGS) $(LIBS) -o $@ lib-snprintf.o: $(PG_SNPRINTF) $(CC) $(CFLAGS) -c -MMD -MP -MF$(DEPDIR)/$(*F).Po -MT$@ -o $@ ${SRC_DIR}../lib/pg/snprintf.c lib-strerror.o: $(PG_SNPRINTF) $(CC) $(CFLAGS) -c -MMD -MP -MF$(DEPDIR)/$(*F).Po -MT$@ -o $@ ${SRC_DIR}../lib/pg/strerror.c lib-log.o: $(LOG_SRC) $(CC) $(CFLAGS) -c -MMD -MP -MF$(DEPDIR)/$(*F).Po -MT$@ -o $@ ${SRC_DIR}../lib/log/src/log.c lib-commandline.o: $(COMMANDLINE_SRC) $(CC) $(CFLAGS) -c -MMD -MP -MF$(DEPDIR)/$(*F).Po -MT$@ -o $@ ${SRC_DIR}../lib/subcommands.c/commandline.c lib-parson.o: $(PARSON_SRC) $(CC) $(CFLAGS) -c -MMD -MP -MF$(DEPDIR)/$(*F).Po -MT$@ -o $@ ${SRC_DIR}../lib/parson/parson.c clean: rm -f $(OBJS) $(PG_AUTOCTL) rm -rf $(DEPDIR) install: $(PG_AUTOCTL) install -d $(DESTDIR)$(BINDIR) install -m 0755 $(PG_AUTOCTL) $(DESTDIR)$(BINDIR) .PHONY: all monitor clean pg_auto_failover-1.6.3/src/bin/pg_autoctl/README.md000066400000000000000000000165071414244367200220450ustar00rootroot00000000000000# pg_autoctl This directory contains the code the `pg_autoctl` utility, which implements the facilities needed to operate and run a pg_auto_failover installation. Such an installation is made of both a monitor and a keeper, and `pg_autoctl` allows to operate in those two modes. The `pg_autoctl` binary exposes a full command line with sub-commands. 
Most of the commands exposed to the user are compatible with running in the context of a monitor node or a keeper node. ## Code Structure The code is organized in the following way: - files with a name that starts with `cli_` implement the command line facilities, their role is to understand the user's command and then call into the implementation code. - files with a name that starts with `cli_do_` implement the DEBUG command line facilities, that are meant to expose all the `pg_autoctl` facilities in a way that make them easy to invoke from the command line, for better testability of the software, and shorter interaction loops for developers. - files with a name that contains `utils`, such as `env_utils.c`, `file_utils.c`, or `string_utils.c` implement abstractions and facilities used in many places in the rest of the code. Files `parsing.[ch]` complete this list and could have been named `parsing_utils` really. - files with a name that contains `config` are related to handling of the configuration files for either a monitor or a keeper instance, and this is detailed later in this file. - files with a names that start with `ini_` implement our higher level facilities to handle configuration written in the INI format. - files with a names that starts with `pg` implement abstrations used to handle a Postgres service: - the `pgsql` module contains code to query the local Postgres instance by using SQL queries, including the connection and result parsing code. - the `monitor` module contains code to query the monitor Postgres instance by using its SQL API, made with stored procedures, written as a C extension to Postgres. - the `pgctl` module contains code to run Postgres commands such as `pg_controldata`, `pg_basebackup`, or `pg_ctl`. - the `pgsetup` module contains code that discovers the current status of a Postgres PGDATA directory, including whether the Postgres service is currently running, on which port, etc. 
- files with a name starting with `service` implement either process control or a subprocess in the `pg_autoctl` process tree, and the `supervisor.[ch]` files implement the main restart strategy to control our process tree, see later for more details. - the `primary_standby` file implements facilities to control Postgres streaming replication primary and standby nodes and is mainly used from the `fsm_transition.c` operations. - files with a name that contains `fsm` and `state` implement the “client-side” Finite State Machine that controls and implement pg_auto_failover. There are more files needed to implement `pg_autoctl` and the remaining files have specific charters: - the `main.c` file contains the `main(argc, argv)` function and initializes our program. - the `loop.c` file implements the keeper service. - the `keeper_pg_init` module implements the `pg_autoctl create postgres` command, which initializes a Postgres node for pg_auto_failover. - the `monitor_pg_init` module implements the `pg_autoctl create monitor` command, which initializes a monitor node for pg_auto_failover. - the `debian` module contains code that recognize if we're given a debian style cluster, such as created with `pg_createcluster`, and tools to move the configuration files back in PGDATA and allow `pg_autoctl` to own that cluster again. - the `signals` module implements our signal masks and how we react to receiving a SIGHUP or a SIGTERM signal, etc. - the `systemd_config` module uses our INI file abstractions to create a systemd unit configuration file that can be deployed and registered to make `pg_autoctl` a systemd unit service. ## Command Line and Configuration The `pg_autoctl` tool provides a complex set of commands and sub-commands, and handles user-given configuration. 
The configuration is handled in the INI format and can be all managed through the `pg_autoctl config` commands: ``` Available commands: pg_autoctl config check Check pg_autoctl configuration get Get the value of a given pg_autoctl configuration variable set Set the value of a given pg_autoctl configuration variable ``` The modules in `keeper_config` and `monitor_config` define macros allowing to sync values read from the command line option parsing and the INI file together. All the read and edit operations for the configuration of `pg_autoctl` may be done through the `pg_autoctl get|set` function, rather than having to open the configuration file. ## Software Architecture As the `pg_autoctl` tool unifies the management of two different process with two different modes of operation, the internal structure of the software reflects that. - the monitor parts of the code are designed around the monitor data structures: Monitor MonitorConfig LocalPostgresServer PostgresSetup - the keeper parts of the code are designed around the keeper data structures: Keeper KeeperConfig KeeperStateData LocalPostgresServer PostgresSetup We already see that we have common modules that are needed in both the keeper and the monitor, that both have to manage a local Postgres instance. ## Process Supervision and Process Tree The `pg_autoctl` owns the Postgres service as a sub-process. 
A typical process tree for the monitor looks like the following: ``` -+= 84202 dim ./src/bin/pg_autoctl/pg_autoctl run |-+- 84205 dim pg_autoctl: start/stop postgres | \-+- 84212 dim /Applications/Postgres.app/Contents/Versions/12/bin/postgres -D /private/tmp/plop/m -p 4000 -h * | |--= 84213 dim postgres: logger | |--= 84215 dim postgres: checkpointer | |--= 84216 dim postgres: background writer | |--= 84217 dim postgres: walwriter | |--= 84218 dim postgres: autovacuum launcher | |--= 84219 dim postgres: stats collector | |--= 84220 dim postgres: pg_auto_failover monitor | |--= 84221 dim postgres: logical replication launcher | |--= 84222 dim postgres: pg_auto_failover monitor worker | |--= 84223 dim postgres: pg_auto_failover monitor worker | |--= 84228 dim postgres: dim template1 [local] idle | \--= 84229 dim postgres: autoctl_node pg_auto_failover [local] idle \--- 84206 dim pg_autoctl: monitor listener ``` The main process start by `pg_autoctl` is a supervisor process. Its job is to loop around `waitpid()` and notice when sub-processes are finished. When the termination of them is not expected, the supervisor then restarts them. Given this process structure, the lifetime of the Postgres service is tied to that of the `pg_autoctl` service. Yet, we might want to restart the `pg_autoctl` code (to install a bugfix, for instance) and avoid restarting Postgres. To that end, the supervisor process starts its services by using `fork()` and then `exec("/path/to/pg_autoctl")`. This allows to later easily stop and restart that sub-process and load the new binary from disk, then loaded also the bug fixes that possibly come with the new version. pg_auto_failover-1.6.3/src/bin/pg_autoctl/azure.c000066400000000000000000001634451414244367200220640ustar00rootroot00000000000000/* * src/bin/pg_autoctl/azure.c * Implementation of a CLI which lets you call `az` cli commands to prepare * a pg_auto_failover demo or QA environment. * * Copyright (c) Microsoft Corporation. 
All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include #include #include #include #include "postgres_fe.h" #include "pqexpbuffer.h" #include "snprintf.h" #include "azure.h" #include "cli_common.h" #include "cli_do_root.h" #include "cli_root.h" #include "commandline.h" #include "config.h" #include "env_utils.h" #include "log.h" #include "parsing.h" #include "pgsql.h" #include "pidfile.h" #include "signals.h" #include "string_utils.h" #include "runprogram.h" char azureCLI[MAXPGPATH] = { 0 }; static int azure_run_command(Program *program); static pid_t azure_start_command(Program *program); static bool azure_wait_for_commands(int count, pid_t pidArray[]); static bool run_ssh(const char *username, const char *ip); static bool run_ssh_command(const char *username, const char *ip, bool tty, const char *command); static bool start_ssh_command(const char *username, const char *ip, const char *command); static bool azure_git_toplevel(char *srcDir, size_t size); static bool start_rsync_command(const char *username, const char *ip, const char *srcDir); static bool azure_rsync_vms(AzureRegionResources *azRegion); static bool azure_fetch_resource_list(const char *group, AzureRegionResources *azRegion); static bool azure_fetch_vm_addresses(const char *group, const char *vm, AzureVMipAddresses *addresses); /* log_program_output logs the output of the given program. 
*/ static void log_program_output(Program *prog, int outLogLevel, int errorLogLevel) { if (prog->stdOut != NULL) { char *outLines[BUFSIZE]; int lineCount = splitLines(prog->stdOut, outLines, BUFSIZE); int lineNumber = 0; for (lineNumber = 0; lineNumber < lineCount; lineNumber++) { log_level(outLogLevel, "%s", outLines[lineNumber]); } } if (prog->stdErr != NULL) { char *errorLines[BUFSIZE]; int lineCount = splitLines(prog->stdErr, errorLines, BUFSIZE); int lineNumber = 0; for (lineNumber = 0; lineNumber < lineCount; lineNumber++) { log_level(errorLogLevel, "%s", errorLines[lineNumber]); } } } /* * azure_run_command runs a command line using the azure CLI command, and when * dryRun is true instead of running the command it only shows the command it * would run as the output of the pg_autoctl command. */ static int azure_run_command(Program *program) { int returnCode; char command[BUFSIZE] = { 0 }; (void) snprintf_program_command_line(program, command, sizeof(command)); if (dryRun) { appendPQExpBuffer(azureScript, "\n%s", command); /* fake successful execution */ return 0; } log_debug("%s", command); (void) execute_subprogram(program); returnCode = program->returnCode; if (returnCode != 0) { (void) log_program_output(program, LOG_INFO, LOG_ERROR); } free_program(program); return returnCode; } /* * azure_start_command starts a command in the background, as a subprocess of * the current process, and returns the sub-process pid as soon as the * sub-process is started. It's the responsibility of the caller to then * implement waitpid() on the returned pid. 
* * This allows running several commands in parallel, as in the shell sequence: * * $ az vm create & * $ az vm create & * $ az vm create & * $ wait */ static pid_t azure_start_command(Program *program) { pid_t fpid; char command[BUFSIZE] = { 0 }; (void) snprintf_program_command_line(program, command, sizeof(command)); if (dryRun) { appendPQExpBuffer(azureScript, "\n%s &", command); /* fake successful execution */ return 0; } log_debug("%s", command); /* Flush stdio channels just before fork, to avoid double-output problems */ fflush(stdout); fflush(stderr); /* time to create the node_active sub-process */ fpid = fork(); switch (fpid) { case -1: { log_error("Failed to fork a process for command: %s", command); return -1; } case 0: { /* child process runs the command */ int returnCode; /* initialize the semaphore used for locking log output */ if (!semaphore_init(&log_semaphore)) { exit(EXIT_CODE_INTERNAL_ERROR); } /* set our logging facility to use our semaphore as a lock */ (void) log_set_udata(&log_semaphore); (void) log_set_lock(&semaphore_log_lock_function); (void) execute_subprogram(program); returnCode = program->returnCode; log_debug("Command %s exited with return code %d", program->args[0], returnCode); if (returnCode != 0) { (void) log_program_output(program, LOG_INFO, LOG_ERROR); free_program(program); /* the parent will have to use exit status */ (void) semaphore_finish(&log_semaphore); exit(EXIT_CODE_INTERNAL_ERROR); } free_program(program); (void) semaphore_finish(&log_semaphore); exit(EXIT_CODE_QUIT); } default: { /* fork succeeded, in parent */ return fpid; } } } /* * azure_wait_for_commands waits until all processes with pids from the array * are done. 
*/ static bool azure_wait_for_commands(int count, pid_t pidArray[]) { int subprocessCount = count; bool allReturnCodeAreZero = true; while (subprocessCount > 0) { pid_t pid; int status; /* ignore errors */ pid = waitpid(-1, &status, WNOHANG); switch (pid) { case -1: { if (errno == ECHILD) { /* no more childrens */ return subprocessCount == 0; } pg_usleep(100 * 1000); /* 100 ms */ break; } case 0: { /* * We're using WNOHANG, 0 means there are no stopped or * exited children, it's all good. It's the expected case * when everything is running smoothly, so enjoy and sleep * for awhile. */ pg_usleep(100 * 1000); /* 100 ms */ break; } default: { /* * One of the az vm create sub-commands has finished, find * which and if it went all okay. */ int returnCode = WEXITSTATUS(status); /* find which VM is done now */ for (int index = 0; index < count; index++) { if (pidArray[index] == pid) { if (returnCode == 0) { log_debug("Process %d exited successfully", pid); } else { log_error("Process %d exited with code %d", pid, returnCode); allReturnCodeAreZero = false; } } } --subprocessCount; break; } } } return allReturnCodeAreZero; } /* * azure_psleep runs count parallel sleep process at the same time. 
*/ bool azure_psleep(int count, bool force) { char sleep[MAXPGPATH] = { 0 }; pid_t pidArray[26] = { 0 }; bool saveDryRun = dryRun; if (!search_path_first("sleep", sleep, LOG_ERROR)) { log_fatal("Failed to find program sleep in PATH"); return false; } if (force) { dryRun = false; } for (int i = 0; i < count; i++) { char *args[3]; int argsIndex = 0; Program program = { 0 }; args[argsIndex++] = sleep; args[argsIndex++] = "5"; args[argsIndex++] = NULL; (void) initialize_program(&program, args, false); pidArray[i] = azure_start_command(&program); } if (force) { dryRun = saveDryRun; } if (!azure_wait_for_commands(count, pidArray)) { log_fatal("Failed to sleep concurrently with %d processes", count); return false; } return true; } /* * azure_get_remote_ip gets the local IP address by using the command `curl * ifconfig.me` */ bool azure_get_remote_ip(char *ipAddress, size_t ipAddressSize) { Program program; char curl[MAXPGPATH] = { 0 }; if (!search_path_first("curl", curl, LOG_ERROR)) { log_fatal("Failed to find program curl in PATH"); return false; } program = run_program(curl, "ifconfig.me", NULL); if (program.returnCode != 0) { (void) log_program_output(&program, LOG_INFO, LOG_ERROR); free_program(&program); return false; } else { /* we expect a single line of output, no end-of-line */ strlcpy(ipAddress, program.stdOut, ipAddressSize); free_program(&program); return true; } } /* * azure_create_group creates a new resource group on Azure. 
*/ bool azure_create_group(const char *name, const char *location) { char *args[16]; int argsIndex = 0; Program program = { 0 }; args[argsIndex++] = azureCLI; args[argsIndex++] = "group"; args[argsIndex++] = "create"; args[argsIndex++] = "--name"; args[argsIndex++] = (char *) name; args[argsIndex++] = "--location"; args[argsIndex++] = (char *) location; args[argsIndex++] = NULL; (void) initialize_program(&program, args, false); log_info("Creating group \"%s\" in location \"%s\"", name, location); return azure_run_command(&program) == 0; } /* * azure_create_vnet creates a new vnet on Azure. */ bool azure_create_vnet(const char *group, const char *name, const char *prefix) { char *args[16]; int argsIndex = 0; Program program = { 0 }; args[argsIndex++] = azureCLI; args[argsIndex++] = "network"; args[argsIndex++] = "vnet"; args[argsIndex++] = "create"; args[argsIndex++] = "--resource-group"; args[argsIndex++] = (char *) group; args[argsIndex++] = "--name"; args[argsIndex++] = (char *) name; args[argsIndex++] = "--address-prefix"; args[argsIndex++] = (char *) prefix; args[argsIndex++] = NULL; (void) initialize_program(&program, args, false); log_info("Creating network vnet \"%s\" using address prefix \"%s\"", name, prefix); return azure_run_command(&program) == 0; } /* * azure_create_vnet creates a new vnet on Azure. */ bool azure_create_nsg(const char *group, const char *name) { char *args[16]; int argsIndex = 0; Program program = { 0 }; args[argsIndex++] = azureCLI; args[argsIndex++] = "network"; args[argsIndex++] = "nsg"; args[argsIndex++] = "create"; args[argsIndex++] = "--resource-group"; args[argsIndex++] = (char *) group; args[argsIndex++] = "--name"; args[argsIndex++] = (char *) name; args[argsIndex++] = NULL; (void) initialize_program(&program, args, false); log_info("Creating network nsg \"%s\"", name); return azure_run_command(&program) == 0; } /* * azure_create_vnet creates a new network security rule. 
*/ bool azure_create_nsg_rule(const char *group, const char *nsgName, const char *name, const char *ipAddress) { char *args[38]; int argsIndex = 0; Program program = { 0 }; args[argsIndex++] = azureCLI; args[argsIndex++] = "network"; args[argsIndex++] = "nsg"; args[argsIndex++] = "rule"; args[argsIndex++] = "create"; args[argsIndex++] = "--resource-group"; args[argsIndex++] = (char *) group; args[argsIndex++] = "--nsg-name"; args[argsIndex++] = (char *) nsgName; args[argsIndex++] = "--name"; args[argsIndex++] = (char *) name; args[argsIndex++] = "--access"; args[argsIndex++] = "allow"; args[argsIndex++] = "--protocol"; args[argsIndex++] = "Tcp"; args[argsIndex++] = "--direction"; args[argsIndex++] = "Inbound"; args[argsIndex++] = "--priority"; args[argsIndex++] = "100"; args[argsIndex++] = "--source-address-prefixes"; args[argsIndex++] = (char *) ipAddress; args[argsIndex++] = "--source-port-range"; args[argsIndex++] = dryRun ? "\"*\"" : "*"; args[argsIndex++] = "--destination-address-prefix"; args[argsIndex++] = dryRun ? "\"*\"" : "*"; args[argsIndex++] = "--destination-port-ranges"; args[argsIndex++] = "22"; args[argsIndex++] = "5432"; args[argsIndex++] = NULL; (void) initialize_program(&program, args, false); log_info("Creating network nsg rules \"%s\" for our IP address \"%s\" " "for ports 22 and 5432", name, ipAddress); return azure_run_command(&program) == 0; } /* * azure_create_subnet creates a new subnet on Azure. 
*/ bool azure_create_subnet(const char *group, const char *vnet, const char *name, const char *prefixes, const char *nsg) { char *args[16]; int argsIndex = 0; Program program = { 0 }; args[argsIndex++] = azureCLI; args[argsIndex++] = "network"; args[argsIndex++] = "vnet"; args[argsIndex++] = "subnet"; args[argsIndex++] = "create"; args[argsIndex++] = "--resource-group"; args[argsIndex++] = (char *) group; args[argsIndex++] = "--vnet-name"; args[argsIndex++] = (char *) vnet; args[argsIndex++] = "--name"; args[argsIndex++] = (char *) name; args[argsIndex++] = "--address-prefixes"; args[argsIndex++] = (char *) prefixes; args[argsIndex++] = "--network-security-group"; args[argsIndex++] = (char *) nsg; args[argsIndex++] = NULL; (void) initialize_program(&program, args, false); log_info("Creating network subnet \"%s\" using address prefix \"%s\"", name, prefixes); return azure_run_command(&program) == 0; } /* * az_group_delete runs the command az group delete. */ bool az_group_delete(const char *group) { char *args[16]; int argsIndex = 0; Program program = { 0 }; args[argsIndex++] = azureCLI; args[argsIndex++] = "group"; args[argsIndex++] = "delete"; args[argsIndex++] = "--name"; args[argsIndex++] = (char *) group; args[argsIndex++] = "--yes"; args[argsIndex++] = NULL; (void) initialize_program(&program, args, false); log_info("Deleting azure resource group \"%s\"", group); return azure_run_command(&program) == 0; } /* * azure_prepare_node_name is a utility function that prepares a node name to * use for a VM in our pg_auto_failover deployment in a target Azure region. 
* * In the resource group "ha-demo-dim-paris" when creating a monitor (index 0), * an app VM (index 27), and 2 pg nodes VMs we would have the following names: * * - [0] ha-demo-dim-paris-monitor * - [1] ha-demo-dim-paris-a * - [2] ha-demo-dim-paris-b * - [27] ha-demo-dim-paris-app */ static void azure_prepare_node(AzureRegionResources *azRegion, int index) { char vmsuffix[] = "abcdefghijklmnopqrstuvwxyz"; if (index == 0) { sformat(azRegion->vmArray[index].name, sizeof(azRegion->vmArray[index].name), "%s-monitor", azRegion->group); } else if (index == MAX_VMS_PER_REGION - 1) { sformat(azRegion->vmArray[index].name, sizeof(azRegion->vmArray[index].name), "%s-app", azRegion->group); } else { sformat(azRegion->vmArray[index].name, sizeof(azRegion->vmArray[index].name), "%s-%c", azRegion->group, vmsuffix[index - 1]); } } /* * azure_node_index_from_name is the complement to azure_prepare_node. * Given a VM name such as ha-demo-dim-paris-monitor or ha-demo-dim-paris-a, * the function returns respectively 0 and 1, which is the array index where we * want to find information about the VM (name, IP addresses, etc) in an array * of VMs. */ static int azure_node_index_from_name(const char *group, const char *name) { int groupNameLen = strlen(group); char *ptr; if (strncmp(name, group, groupNameLen) != 0 || strlen(name) < (groupNameLen + 1)) { log_error("VM name \"%s\" does not start with group name \"%s\"", name, group); return -1; } /* skip group name and dash: ha-demo-dim-paris- */ ptr = (char *) name + groupNameLen + 1; /* * ha-demo-dim-paris-monitor is always index 0 * ha-demo-dim-paris-app is always index 27 (last in the array) * ha-demo-dim-paris-a is index 1 * ha-demo-dim-paris-b is index 2 * ... 
* ha-demo-dim-paris-z is index 26 */ if (strcmp(ptr, "monitor") == 0) { return 0; } else if (strcmp(ptr, "app") == 0) { return MAX_VMS_PER_REGION - 1; } else { if (strlen(ptr) != 1) { log_error("Failed to parse VM index from name \"%s\"", name); return -1; } /* 'a' is 1, 'b' is 2, etc */ return *ptr - 'a' + 1; } } /* * azure_create_vm creates a Virtual Machine in our azure resource group. */ bool azure_create_vm(AzureRegionResources *azRegion, const char *name, const char *image, const char *username) { char *args[26]; int argsIndex = 0; Program program = { 0 }; char publicIpAddressName[BUFSIZE] = { 0 }; sformat(publicIpAddressName, BUFSIZE, "%s-ip", name); args[argsIndex++] = azureCLI; args[argsIndex++] = "vm"; args[argsIndex++] = "create"; args[argsIndex++] = "--resource-group"; args[argsIndex++] = (char *) azRegion->group; args[argsIndex++] = "--name"; args[argsIndex++] = (char *) name; args[argsIndex++] = "--vnet-name"; args[argsIndex++] = (char *) azRegion->vnet; args[argsIndex++] = "--subnet"; args[argsIndex++] = (char *) azRegion->subnet; args[argsIndex++] = "--nsg"; args[argsIndex++] = (char *) azRegion->nsg; args[argsIndex++] = "--public-ip-address"; args[argsIndex++] = (char *) publicIpAddressName; args[argsIndex++] = "--image"; args[argsIndex++] = (char *) image; args[argsIndex++] = "--admin-username"; args[argsIndex++] = (char *) username; args[argsIndex++] = "--generate-ssh-keys"; args[argsIndex++] = NULL; (void) initialize_program(&program, args, false); log_info("Creating %s virtual machine \"%s\" with user \"%s\"", image, name, username); return azure_start_command(&program); } /* * azure_create_vms creates several azure virtual machine in parallel and waits * until all the commands have finished. 
 */
bool
azure_create_vms(AzureRegionResources *azRegion,
				 const char *image,
				 const char *username)
{
	int pending = 0;
	pid_t pidArray[MAX_VMS_PER_REGION] = { 0 };

	/* we read from left to right, have the smaller number on the left */
	if (26 < azRegion->nodes)
	{
		log_error("pg_autoctl only supports up to 26 VMs per region");
		return false;
	}

	log_info("Creating Virtual Machines for %s%d Postgres nodes, in parallel",
			 azRegion->monitor ? "a monitor and " : " ",
			 azRegion->nodes);

	/* index == 0 for the monitor, then 1..count for the other nodes */
	for (int index = 0; index <= azRegion->nodes; index++)
	{
		/* skip index 0 when we're not creating a monitor */
		if (index == 0 && !azRegion->monitor)
		{
			continue;
		}

		/* skip VMs that already exist, unless --script is used */
		if (!dryRun &&
			!IS_EMPTY_STRING_BUFFER(azRegion->vmArray[index].name) &&
			!IS_EMPTY_STRING_BUFFER(azRegion->vmArray[index].public) &&
			!IS_EMPTY_STRING_BUFFER(azRegion->vmArray[index].private))
		{
			log_info("Skipping creation of VM \"%s\", "
					 "which already exists with public IP address %s",
					 azRegion->vmArray[index].name,
					 azRegion->vmArray[index].public);
			continue;
		}

		(void) azure_prepare_node(azRegion, index);

		/*
		 * NOTE(review): azure_create_vm is declared to return bool, yet the
		 * result is stored in a pid_t array; it looks like the array is only
		 * used to count and wait for children -- confirm against
		 * azure_wait_for_commands.
		 */
		pidArray[index] =
			azure_create_vm(azRegion,
							azRegion->vmArray[index].name,
							image,
							username);

		++pending;
	}

	/* also create the application node VM when asked to */
	if (azRegion->appNodes > 0)
	{
		int index = MAX_VMS_PER_REGION - 1;

		if (!dryRun &&
			!IS_EMPTY_STRING_BUFFER(azRegion->vmArray[index].name) &&
			!IS_EMPTY_STRING_BUFFER(azRegion->vmArray[index].public) &&
			!IS_EMPTY_STRING_BUFFER(azRegion->vmArray[index].private))
		{
			log_info("Skipping creation of VM \"%s\", "
					 "which already exists with public IP address %s",
					 azRegion->vmArray[index].name,
					 azRegion->vmArray[index].public);
		}
		else
		{
			(void) azure_prepare_node(azRegion, index);

			pidArray[index] =
				azure_create_vm(azRegion,
								azRegion->vmArray[index].name,
								image,
								username);

			++pending;
		}
	}

	/* now wait for the child processes to be done */
	if (dryRun && pending > 0)
	{
		appendPQExpBuffer(azureScript, "\nwait");
	}
	else
	{
		if (!azure_wait_for_commands(pending, pidArray))
		{
			log_fatal("Failed to create all %d azure VMs, "
					  "see above for details", pending);
			return false;
		}
	}

	return true;
}


/*
 * azure_git_toplevel calls `git rev-parse --show-toplevel` and uses the result
 * as the directory to rsync to our VMs when provisionning from sources.
 */
static bool
azure_git_toplevel(char *srcDir, size_t size)
{
	Program program;
	char git[MAXPGPATH] = { 0 };

	if (!search_path_first("git", git, LOG_ERROR))
	{
		log_fatal("Failed to find program git in PATH");
		return false;
	}

	program = run_program(git, "rev-parse", "--show-toplevel", NULL);

	if (program.returnCode != 0)
	{
		(void) log_program_output(&program, LOG_INFO, LOG_ERROR);
		free_program(&program);
		return false;
	}
	else
	{
		char *outLines[BUFSIZE];

		/*
		 * git rev-parse --show-toplevel outputs a single line
		 *
		 * NOTE(review): outLines[0] is used without checking the line count
		 * returned by splitLines -- presumably safe because git succeeded,
		 * but worth confirming.
		 */
		splitLines(program.stdOut, outLines, BUFSIZE);
		strlcpy(srcDir, outLines[0], size);

		free_program(&program);
		return true;
	}
}


/*
 * start_rsync_command is used to sync our local source directory with a remote
 * place on a target VM.
*/ static bool start_rsync_command(const char *username, const char *ip, const char *srcDir) { char *args[16]; int argsIndex = 0; Program program = { 0 }; char ssh[MAXPGPATH] = { 0 }; char essh[MAXPGPATH] = { 0 }; char rsync[MAXPGPATH] = { 0 }; char sourceDir[MAXPGPATH] = { 0 }; char rsync_remote[MAXPGPATH] = { 0 }; if (!search_path_first("rsync", rsync, LOG_ERROR)) { log_fatal("Failed to find program rsync in PATH"); return false; } if (!search_path_first("ssh", ssh, LOG_ERROR)) { log_fatal("Failed to find program ssh in PATH"); return false; } /* use our usual ssh options even when using it through rsync */ sformat(essh, sizeof(essh), "%s -o '%s' -o '%s' -o '%s'", ssh, "StrictHostKeyChecking=no", "UserKnownHostsFile /dev/null", "LogLevel=quiet"); /* we need the rsync remote as one string */ sformat(rsync_remote, sizeof(rsync_remote), "%s@%s:/home/%s/pg_auto_failover/", username, ip, username); /* we need to ensure that the source directory terminates with a "/" */ if (strcmp(strrchr(srcDir, '/'), "/") != 0) { sformat(sourceDir, sizeof(sourceDir), "%s/", srcDir); } else { strlcpy(sourceDir, srcDir, sizeof(sourceDir)); } args[argsIndex++] = rsync; args[argsIndex++] = "-a"; args[argsIndex++] = "-e"; args[argsIndex++] = essh; args[argsIndex++] = "--exclude='.git'"; args[argsIndex++] = "--exclude='*.o'"; args[argsIndex++] = "--exclude='*.deps'"; args[argsIndex++] = "--exclude='./src/bin/pg_autoctl/pg_autoctl'"; args[argsIndex++] = sourceDir; args[argsIndex++] = rsync_remote; args[argsIndex++] = NULL; (void) initialize_program(&program, args, false); return azure_start_command(&program); } /* * azure_rsync_vms runs the rsync command for target VMs in parallel. 
 */
static bool
azure_rsync_vms(AzureRegionResources *azRegion)
{
	int pending = 0;
	pid_t pidArray[MAX_VMS_PER_REGION] = { 0 };

	char srcDir[MAXPGPATH] = { 0 };

	/* rsync from the top-level directory of the local git repository */
	if (!azure_git_toplevel(srcDir, sizeof(srcDir)))
	{
		/* errors have already been logged */
		return false;
	}

	log_info("Syncing local directory \"%s\" to %d Azure VMs",
			 srcDir,
			 azRegion->nodes + azRegion->monitor + azRegion->appNodes);

	/* index == 0 for the monitor, then 1..count for the other nodes */
	for (int index = 0; index <= azRegion->nodes; index++)
	{
		/* skip index 0 when we're not creating a monitor */
		if (index == 0 && !azRegion->monitor)
		{
			continue;
		}

		(void) azure_prepare_node(azRegion, index);

		pidArray[index] =
			start_rsync_command("ha-admin",
								azRegion->vmArray[index].public,
								srcDir);

		++pending;
	}

	/* also provision the application node VM when asked to */
	if (azRegion->appNodes > 0)
	{
		int index = MAX_VMS_PER_REGION - 1;

		(void) azure_prepare_node(azRegion, index);

		pidArray[index] =
			start_rsync_command("ha-admin",
								azRegion->vmArray[index].public,
								srcDir);

		++pending;
	}

	/*
	 * now wait for the child processes to be done
	 *
	 * NOTE(review): unlike azure_create_vms, the dryRun branch appends
	 * "wait" even when pending is zero -- confirm whether that's intended.
	 */
	if (dryRun)
	{
		appendPQExpBuffer(azureScript, "\nwait");
	}
	else
	{
		if (!azure_wait_for_commands(pending, pidArray))
		{
			log_fatal("Failed to provision all %d azure VMs, "
					  "see above for details", pending);
			return false;
		}
	}

	return true;
}


/*
 * azure_build_pg_autoctl runs `make all` then `make install` on all the target
 * VMs in parallel, using an ssh command line.
 */
static bool
azure_build_pg_autoctl(AzureRegionResources *azRegion)
{
	int pending = 0;
	pid_t pidArray[MAX_VMS_PER_REGION] = { 0 };

	/*
	 * NOTE(review): the pg_config path is hard-coded to Postgres 11 here,
	 * while provisioning elsewhere defaults AZ_PG_VERSION to 13 -- confirm
	 * this is intended for source builds.
	 */
	char *buildCommand =
		"make PG_CONFIG=/usr/lib/postgresql/11/bin/pg_config "
		"-C pg_auto_failover -s clean all "
		" && "
		"sudo make PG_CONFIG=/usr/lib/postgresql/11/bin/pg_config "
		"BINDIR=/usr/local/bin -C pg_auto_failover install";

	log_info("Building pg_auto_failover from sources on %d Azure VMs",
			 azRegion->nodes + azRegion->monitor + azRegion->appNodes);
	log_info("%s", buildCommand);

	/* index == 0 for the monitor, then 1..count for the other nodes */
	for (int index = 0; index <= azRegion->nodes; index++)
	{
		/* skip index 0 when we're not creating a monitor */
		if (index == 0 && !azRegion->monitor)
		{
			continue;
		}

		(void) azure_prepare_node(azRegion, index);

		pidArray[index] =
			start_ssh_command("ha-admin",
							  azRegion->vmArray[index].public,
							  buildCommand);

		++pending;
	}

	/* also provision the application node VM when asked to */
	if (azRegion->appNodes > 0)
	{
		int index = MAX_VMS_PER_REGION - 1;

		(void) azure_prepare_node(azRegion, index);

		pidArray[index] =
			start_ssh_command("ha-admin",
							  azRegion->vmArray[index].public,
							  buildCommand);

		++pending;
	}

	/* now wait for the child processes to be done */
	if (dryRun)
	{
		appendPQExpBuffer(azureScript, "\nwait");
	}
	else
	{
		if (!azure_wait_for_commands(pending, pidArray))
		{
			log_fatal("Failed to provision all %d azure VMs, "
					  "see above for details", pending);
			return false;
		}
	}

	return true;
}


/*
 * azure_prepare_target_versions prepares the environment variables that we
 * need to grasp for provisioning our target Azure VMs.
We use the following
 * environment variables:
 *
 * AZ_PG_VERSION ?= 13
 * AZ_PGAF_DEB_VERSION ?= 1.6
 * AZ_PGAF_DEB_REVISION ?= 1.6.3-1
 */
bool
azure_prepare_target_versions(KeyVal *env)
{
	char *keywords[] = {
		"AZ_PG_VERSION",
		"AZ_PGAF_DEB_VERSION",
		"AZ_PGAF_DEB_REVISION"
	};

	/* set our static set of 3 variables from the environment */
	env->count = 3;

	/* default values, used when the environment does not override them */
	sformat(env->values[0], MAXCONNINFO, "13");         /* AZ_PG_VERSION */
	sformat(env->values[1], MAXCONNINFO, "1.6");        /* AZ_PGAF_DEB_VERSION */
	sformat(env->values[2], MAXCONNINFO, "1.6.3-1");    /* AZ_PGAF_DEB_REVISION */

	for (int i = 0; i < 3; i++)
	{
		/* install the environment variable name as the keyword */
		strlcpy(env->keywords[i], keywords[i], MAXCONNINFO);

		/* pick values from the environment when they exist */
		if (env_exists(env->keywords[i]))
		{
			if (!get_env_copy(env->keywords[i], env->values[i], MAXCONNINFO))
			{
				/* errors have already been logged */
				return false;
			}
		}
	}

	return true;
}


/*
 * azure_prepare_debian_command prepares the debian command to install our
 * target pg_auto_failover package on the Azure VMs.
 *
 * sudo apt-get install -q -y \
 *    postgresql-13-auto-failover-1.5=1.5.2-1 \
 *    pg-auto-failover-cli-1.5=1.5.2-1
 *
 * We are using environment variables to fill in the actual version numbers,
 * and we hard-code some defaults in case the environment has not been
 * provided for.
 */
static bool
azure_prepare_debian_install_command(char *command, size_t size)
{
	/* re-use our generic data structure from Postgres URI parsing */
	KeyVal env = { 0 };

	if (!azure_prepare_target_versions(&env))
	{
		/* errors have already been logged */
		return false;
	}

	/* pin both debian packages to the exact same debian revision */
	sformat(command, size,
			"sudo apt-get install -q -y "
			" postgresql-%s-auto-failover-%s=%s"
			" pg-auto-failover-cli-%s=%s",
			env.values[0],      /* AZ_PG_VERSION */
			env.values[1],      /* AZ_PGAF_DEB_VERSION */
			env.values[2],      /* AZ_PGAF_DEB_REVISION */
			env.values[1],      /* AZ_PGAF_DEB_VERSION */
			env.values[2]);     /* AZ_PGAF_DEB_REVISION */

	return true;
}


/*
 * azure_prepare_debian_install_postgres_command prepares the debian command to
 * install our target Postgres version when building from sources.
 *
 * sudo apt-get build-dep -q -y postgresql-11
 */
static bool
azure_prepare_debian_install_postgres_command(char *command, size_t size)
{
	/* re-use our generic data structure from Postgres URI parsing */
	KeyVal env = { 0 };

	if (!azure_prepare_target_versions(&env))
	{
		/* errors have already been logged */
		return false;
	}

	sformat(command, size,
			"sudo apt-get build-dep -q -y postgresql-%s",
			env.values[0]); /* AZ_PG_VERSION */

	return true;
}


/*
 * azure_prepare_debian_build_dep_postgres_command_command prepares the debian
 * command to install our target Postgres version when building from sources.
 *
 * As we don't have deb-src for pg_auto_failover packages, we do the list
 * manually, and we add also rsync to be able to push sources from the local
 * git repository.
* * sudo apt-get install -q -y \ * postgresql-server-dev-all libkrb5-dev postgresql-11 rsync */ static bool azure_prepare_debian_build_dep_postgres_command(char *command, size_t size) { /* re-use our generic data structure from Postgres URI parsing */ KeyVal env = { 0 }; if (!azure_prepare_target_versions(&env)) { /* errors have already been logged */ return false; } sformat(command, size, "sudo apt-get install -q -y " "postgresql-server-dev-all " "postgresql-%s " "libkrb5-dev " "rsync ", /* AZ_PG_VERSION */ env.values[0]); return true; } /* * azure_provision_vm runs the command `az vm run-command invoke` with our * provisioning script. */ bool azure_provision_vm(const char *group, const char *name, bool fromSource) { char *args[26]; int argsIndex = 0; Program program = { 0 }; char aptGetInstall[BUFSIZE] = { 0 }; char aptGetInstallPostgres[BUFSIZE] = { 0 }; char aptGetBuildDepPostgres[BUFSIZE] = { 0 }; const char *scriptsFromPackage[] = { "curl https://install.citusdata.com/community/deb.sh | sudo bash", "sudo apt-get install -q -y postgresql-common", "echo 'create_main_cluster = false' " "| sudo tee -a /etc/postgresql-common/createcluster.conf", aptGetInstall, "sudo usermod -a -G postgres ha-admin", NULL }; const char *scriptsFromSource[] = { "curl https://install.citusdata.com/community/deb.sh | sudo bash", "sudo apt-get install -q -y postgresql-common", "echo 'create_main_cluster = false' " "| sudo tee -a /etc/postgresql-common/createcluster.conf", aptGetInstallPostgres, aptGetBuildDepPostgres, "sudo usermod -a -G postgres ha-admin", NULL }; char **scripts = fromSource ? 
(char **) scriptsFromSource : (char **) scriptsFromPackage; char *quotedScripts[10][BUFSIZE] = { 0 }; if (!azure_prepare_debian_install_command(aptGetInstall, BUFSIZE)) { /* errors have already been logged */ return false; } if (!azure_prepare_debian_install_postgres_command(aptGetInstallPostgres, BUFSIZE)) { /* errors have already been logged */ return false; } if (!azure_prepare_debian_build_dep_postgres_command(aptGetBuildDepPostgres, BUFSIZE)) { /* errors have already been logged */ return false; } args[argsIndex++] = azureCLI; args[argsIndex++] = "vm"; args[argsIndex++] = "run-command"; args[argsIndex++] = "invoke"; args[argsIndex++] = "--resource-group"; args[argsIndex++] = (char *) group; args[argsIndex++] = "--name"; args[argsIndex++] = (char *) name; args[argsIndex++] = "--command-id"; args[argsIndex++] = "RunShellScript"; args[argsIndex++] = "--scripts"; if (dryRun) { for (int i = 0; scripts[i] != NULL; i++) { sformat((char *) quotedScripts[i], BUFSIZE, "\"%s\"", scripts[i]); args[argsIndex++] = (char *) quotedScripts[i]; } } else { for (int i = 0; scripts[i] != NULL; i++) { args[argsIndex++] = (char *) scripts[i]; } } args[argsIndex++] = NULL; (void) initialize_program(&program, args, false); log_info("Provisioning Virtual Machine \"%s\"", name); return azure_start_command(&program); } /* * azure_provision_vms provisions several azure virtual machine in parallel and * waits until all the commands have finished. 
 */
bool
azure_provision_vms(AzureRegionResources *azRegion, bool fromSource)
{
	int pending = 0;
	pid_t pidArray[MAX_VMS_PER_REGION] = { 0 };

	char aptGetInstall[BUFSIZE] = { 0 };

	/* we read from left to right, have the smaller number on the left */
	if (26 < azRegion->nodes)
	{
		log_error("pg_autoctl only supports up to 26 VMs per region");
		return false;
	}

	log_info("Provisioning %d Virtual Machines in parallel",
			 azRegion->nodes + azRegion->monitor + azRegion->appNodes);

	/* prepared here only so that the exact install command gets logged */
	if (!azure_prepare_debian_install_command(aptGetInstall, BUFSIZE))
	{
		/* errors have already been logged */
		return false;
	}

	log_info("Using: %s", aptGetInstall);

	/* index == 0 for the monitor, then 1..count for the other nodes */
	for (int index = 0; index <= azRegion->nodes; index++)
	{
		/* skip index 0 when we're not creating a monitor */
		if (index == 0 && azRegion->monitor == 0)
		{
			continue;
		}

		(void) azure_prepare_node(azRegion, index);

		pidArray[index] =
			azure_provision_vm(azRegion->group,
							   azRegion->vmArray[index].name,
							   fromSource);

		++pending;
	}

	/* also provision the application node VM when asked to */
	if (azRegion->appNodes > 0)
	{
		int index = MAX_VMS_PER_REGION - 1;

		(void) azure_prepare_node(azRegion, index);

		pidArray[index] =
			azure_provision_vm(azRegion->group,
							   azRegion->vmArray[index].name,
							   fromSource);

		++pending;
	}

	/* now wait for the child processes to be done */
	if (dryRun)
	{
		appendPQExpBuffer(azureScript, "\nwait");
	}
	else
	{
		if (!azure_wait_for_commands(pending, pidArray))
		{
			log_fatal("Failed to provision all %d azure VMs, "
					  "see above for details", pending);
			return false;
		}
	}

	return true;
}


/*
 * azure_resource_list runs the command azure resource list.
*
 * az resource list --output table --query "[?resourceGroup=='ha-demo-dim-paris'].{ name: name, flavor: kind, resourceType: type, region: location }"
 */
bool
azure_resource_list(const char *group)
{
	char *args[16];
	int argsIndex = 0;
	bool success = true;

	Program program = { 0 };

	char query[BUFSIZE] = { 0 };
	char command[BUFSIZE] = { 0 };

	/* JMESPath query, filters on the target resource group */
	sformat(query, BUFSIZE,
			"[?resourceGroup=='%s']"
			".{ name: name, flavor: kind, resourceType: type, region: location }",
			group);

	args[argsIndex++] = azureCLI;
	args[argsIndex++] = "resource";
	args[argsIndex++] = "list";
	args[argsIndex++] = "--output";
	args[argsIndex++] = "table";
	args[argsIndex++] = "--query";
	args[argsIndex++] = (char *) query;
	args[argsIndex++] = NULL;

	(void) initialize_program(&program, args, false);
	(void) snprintf_program_command_line(&program, command, sizeof(command));

	log_info("%s", command);

	/* run synchronously, then relay the table output to our stdout */
	(void) execute_subprogram(&program);
	success = program.returnCode == 0;

	if (success)
	{
		fformat(stdout, "%s", program.stdOut);
	}
	else
	{
		(void) log_program_output(&program, LOG_INFO, LOG_ERROR);
	}

	free_program(&program);

	return success;
}


/*
 * azure_fetch_resource_list fetches existing resource names for a short list
 * of known objects in a target azure resource group.
*/ static bool azure_fetch_resource_list(const char *group, AzureRegionResources *azRegion) { char *args[16]; int argsIndex = 0; bool success = true; Program program = { 0 }; char query[BUFSIZE] = { 0 }; char command[BUFSIZE] = { 0 }; sformat(query, BUFSIZE, "[?resourceGroup=='%s'].{ name: name, resourceType: type }", group); args[argsIndex++] = azureCLI; args[argsIndex++] = "resource"; args[argsIndex++] = "list"; args[argsIndex++] = "--output"; args[argsIndex++] = "json"; args[argsIndex++] = "--query"; args[argsIndex++] = (char *) query; args[argsIndex++] = NULL; (void) initialize_program(&program, args, false); (void) snprintf_program_command_line(&program, command, sizeof(command)); log_info("Fetching resources that might already exist from a previous run"); log_info("%s", command); (void) execute_subprogram(&program); success = program.returnCode == 0; if (success) { /* parson insists on having fresh heap allocated memory, apparently */ char *jsonString = strdup(program.stdOut); JSON_Value *js = json_parse_string(jsonString); JSON_Array *jsArray = json_value_get_array(js); int count = json_array_get_count(jsArray); if (js == NULL) { log_error("Failed to parse JSON string: %s", program.stdOut); return false; } log_info("Found %d Azure resources already created in group \"%s\"", count, group); for (int index = 0; index < count; index++) { JSON_Object *jsObj = json_array_get_object(jsArray, index); char *name = (char *) json_object_get_string(jsObj, "name"); char *type = (char *) json_object_get_string(jsObj, "resourceType"); if (streq(type, "Microsoft.Network/virtualNetworks")) { strlcpy(azRegion->vnet, name, sizeof(azRegion->vnet)); log_info("Found existing vnet \"%s\"", azRegion->vnet); } else if (streq(type, "Microsoft.Network/networkSecurityGroups")) { strlcpy(azRegion->nsg, name, sizeof(azRegion->nsg)); log_info("Found existing nsg \"%s\"", azRegion->nsg); } else if (streq(type, "Microsoft.Compute/virtualMachines")) { int index = 
azure_node_index_from_name(group, name); strlcpy(azRegion->vmArray[index].name, name, NAMEDATALEN); log_info("Found existing VM \"%s\"", name); } else { /* ignore the resource Type listed */ log_debug("Unknown resource type: \"%s\" with name \"%s\"", type, name); } } free(jsonString); } else { (void) log_program_output(&program, LOG_INFO, LOG_ERROR); } free_program(&program); return success; } /* * azure_show_ip_addresses shows public and private IP addresses for our list * of nodes created in a specific resource group. * * az vm list-ip-addresses -g ha-demo-dim-paris --query '[] [] . { name: virtualMachine.name, "public address": virtualMachine.network.publicIpAddresses[0].ipAddress, "private address": virtualMachine.network.privateIpAddresses[0] }' -o table */ bool azure_show_ip_addresses(const char *group) { char *args[16]; int argsIndex = 0; bool success = true; Program program = { 0 }; char query[BUFSIZE] = { 0 }; char command[BUFSIZE] = { 0 }; sformat(query, BUFSIZE, "[] [] . { name: virtualMachine.name, " "\"public address\": " "virtualMachine.network.publicIpAddresses[0].ipAddress, " "\"private address\": " "virtualMachine.network.privateIpAddresses[0] }"); args[argsIndex++] = azureCLI; args[argsIndex++] = "vm"; args[argsIndex++] = "list-ip-addresses"; args[argsIndex++] = "--resource-group"; args[argsIndex++] = (char *) group; args[argsIndex++] = "--query"; args[argsIndex++] = (char *) query; args[argsIndex++] = "-o"; args[argsIndex++] = "table"; args[argsIndex++] = NULL; (void) initialize_program(&program, args, false); (void) snprintf_program_command_line(&program, command, sizeof(command)); log_info("%s", command); (void) execute_subprogram(&program); success = program.returnCode == 0; if (success) { fformat(stdout, "%s", program.stdOut); } else { (void) log_program_output(&program, LOG_INFO, LOG_ERROR); } free_program(&program); return success; } /* * azure_fetch_ip_addresses fetches IP address (both public and private) for * VMs created in an Azure 
resource group, and fill-in the given array. */ bool azure_fetch_ip_addresses(const char *group, AzureVMipAddresses *vmArray) { char *args[16]; int argsIndex = 0; bool success = true; Program program = { 0 }; char query[BUFSIZE] = { 0 }; char command[BUFSIZE] = { 0 }; sformat(query, BUFSIZE, "[] [] . { name: virtualMachine.name, " "\"public address\": " "virtualMachine.network.publicIpAddresses[0].ipAddress, " "\"private address\": " "virtualMachine.network.privateIpAddresses[0] }"); args[argsIndex++] = azureCLI; args[argsIndex++] = "vm"; args[argsIndex++] = "list-ip-addresses"; args[argsIndex++] = "--resource-group"; args[argsIndex++] = (char *) group; args[argsIndex++] = "--query"; args[argsIndex++] = (char *) query; args[argsIndex++] = "-o"; args[argsIndex++] = "json"; args[argsIndex++] = NULL; (void) initialize_program(&program, args, false); (void) snprintf_program_command_line(&program, command, sizeof(command)); if (dryRun) { appendPQExpBuffer(azureScript, "\n%s", command); return true; } log_info("%s", command); (void) execute_subprogram(&program); success = program.returnCode == 0; if (success) { JSON_Value *js = json_parse_string(program.stdOut); JSON_Array *jsArray = json_value_get_array(js); int count = json_array_get_count(jsArray); for (int index = 0; index < count; index++) { JSON_Object *jsObj = json_array_get_object(jsArray, index); char *str = NULL; int vmIndex = -1; str = (char *) json_object_get_string(jsObj, "name"); vmIndex = azure_node_index_from_name(group, str); if (vmIndex == -1) { /* errors have already been logged */ return false; } strlcpy(vmArray[vmIndex].name, str, NAMEDATALEN); str = (char *) json_object_get_string(jsObj, "private address"); strlcpy(vmArray[vmIndex].private, str, BUFSIZE); str = (char *) json_object_get_string(jsObj, "public address"); strlcpy(vmArray[vmIndex].public, str, BUFSIZE); log_debug( "Parsed VM %d as \"%s\" with public IP %s and private IP %s", vmIndex, vmArray[vmIndex].name, vmArray[vmIndex].public, 
vmArray[vmIndex].private); } } else { (void) log_program_output(&program, LOG_INFO, LOG_ERROR); } free_program(&program); return success; } /* * run_ssh runs the ssh command to the specified IP address as the given * username, sharing the current terminal tty. */ static bool run_ssh(const char *username, const char *ip) { char *args[16]; int argsIndex = 0; Program program = { 0 }; char ssh[MAXPGPATH] = { 0 }; char command[BUFSIZE] = { 0 }; if (!search_path_first("ssh", ssh, LOG_ERROR)) { log_fatal("Failed to find program ssh in PATH"); return false; } args[argsIndex++] = ssh; args[argsIndex++] = "-o"; args[argsIndex++] = "StrictHostKeyChecking=no"; args[argsIndex++] = "-o"; args[argsIndex++] = "UserKnownHostsFile /dev/null"; args[argsIndex++] = "-o"; args[argsIndex++] = "LogLevel=quiet"; args[argsIndex++] = "-l"; args[argsIndex++] = (char *) username; args[argsIndex++] = (char *) ip; args[argsIndex++] = NULL; (void) initialize_program(&program, args, false); program.capture = false; /* don't capture output */ program.tty = true; /* allow sharing the parent's tty */ (void) snprintf_program_command_line(&program, command, sizeof(command)); log_info("%s", command); (void) execute_subprogram(&program); return true; } /* * run_ssh_command runs the given command on the remote machine given by ip * address, as the given username. 
 */
static bool
run_ssh_command(const char *username,
				const char *ip,
				bool tty,
				const char *command)
{
	char *args[16];
	int argsIndex = 0;

	Program program = { 0 };

	char ssh[MAXPGPATH] = { 0 };
	char ssh_command[BUFSIZE] = { 0 };

	if (!search_path_first("ssh", ssh, LOG_ERROR))
	{
		log_fatal("Failed to find program ssh in PATH");
		return false;
	}

	args[argsIndex++] = ssh;

	/* only allocate a pseudo-terminal when the caller asks for one */
	if (tty)
	{
		args[argsIndex++] = "-t";
	}

	args[argsIndex++] = "-o";
	args[argsIndex++] = "StrictHostKeyChecking=no";
	args[argsIndex++] = "-o";
	args[argsIndex++] = "UserKnownHostsFile /dev/null";
	args[argsIndex++] = "-o";
	args[argsIndex++] = "LogLevel=quiet";
	args[argsIndex++] = "-l";
	args[argsIndex++] = (char *) username;
	args[argsIndex++] = (char *) ip;
	args[argsIndex++] = "--";
	args[argsIndex++] = (char *) command;
	args[argsIndex++] = NULL;

	(void) initialize_program(&program, args, false);

	program.capture = false;    /* don't capture output */
	program.tty = true;         /* allow sharing the parent's tty */

	(void) snprintf_program_command_line(&program, ssh_command, BUFSIZE);

	/* in --script mode, only append the command to the script */
	if (dryRun)
	{
		appendPQExpBuffer(azureScript, "\n%s", ssh_command);
		return true;
	}

	log_info("%s", ssh_command);

	(void) execute_subprogram(&program);

	return true;
}


/*
 * start_ssh_command starts the given command on the remote machine given by ip
 * address, as the given username.
*/ static bool start_ssh_command(const char *username, const char *ip, const char *command) { char *args[16]; int argsIndex = 0; Program program = { 0 }; char ssh[MAXPGPATH] = { 0 }; char ssh_command[BUFSIZE] = { 0 }; if (!search_path_first("ssh", ssh, LOG_ERROR)) { log_fatal("Failed to find program ssh in PATH"); return false; } args[argsIndex++] = ssh; args[argsIndex++] = "-o"; args[argsIndex++] = "StrictHostKeyChecking=no"; args[argsIndex++] = "-o"; args[argsIndex++] = "UserKnownHostsFile /dev/null"; args[argsIndex++] = "-o"; args[argsIndex++] = "LogLevel=quiet"; args[argsIndex++] = "-l"; args[argsIndex++] = (char *) username; args[argsIndex++] = (char *) ip; args[argsIndex++] = "--"; args[argsIndex++] = (char *) command; args[argsIndex++] = NULL; (void) initialize_program(&program, args, false); (void) snprintf_program_command_line(&program, ssh_command, BUFSIZE); if (dryRun) { appendPQExpBuffer(azureScript, "\n%s", ssh_command); return true; } return azure_start_command(&program); } /* * azure_fetch_vm_addresses fetches a given VM addresses. */ static bool azure_fetch_vm_addresses(const char *group, const char *vm, AzureVMipAddresses *addresses) { char groupName[BUFSIZE] = { 0 }; char vmName[BUFSIZE] = { 0 }; int vmIndex = -1; AzureVMipAddresses vmAddresses[MAX_VMS_PER_REGION] = { 0 }; /* if the vmName is already complete, just use it already */ if (strstr(vm, group) == NULL) { sformat(vmName, sizeof(vmName), "%s-%s", group, vm); } else { sformat(vmName, sizeof(vmName), "%s", vm); } vmIndex = azure_node_index_from_name(group, vmName); if (vmIndex == -1) { /* errors have already been logged */ return false; } /* * It takes as much time fetching all the IP addresses at once compared to * fetching a single IP address, so we always fetch them all internally. 
*/ if (!azure_fetch_ip_addresses(group, vmAddresses)) { /* errors have already been logged */ return false; } if (IS_EMPTY_STRING_BUFFER(vmAddresses[vmIndex].name)) { log_error( "Failed to find Virtual Machine \"%s\" in resource group \"%s\"", vmName, groupName); return false; } /* copy the structure wholesale to the target address */ *addresses = vmAddresses[vmIndex]; return true; } /* * azure_vm_ssh runs an ssh command to the given VM public IP address. */ bool azure_vm_ssh(const char *group, const char *vm) { AzureVMipAddresses addresses = { 0 }; if (!azure_fetch_vm_addresses(group, vm, &addresses)) { /* errors have already been logged */ return false; } return run_ssh("ha-admin", addresses.public); } /* * azure_vm_ssh runs an ssh command to the given VM public IP address. */ bool azure_vm_ssh_command(const char *group, const char *vm, bool tty, const char *command) { AzureVMipAddresses addresses = { 0 }; if (!azure_fetch_vm_addresses(group, vm, &addresses)) { /* errors have already been logged */ return false; } return run_ssh_command("ha-admin", addresses.public, tty, command); } /* * azure_create_region creates a region on Azure and prepares it for * pg_auto_failover demo/QA activities. * * We need to create a vnet, a subnet, a network security group with a rule * that opens ports 22 (ssh) and 5432 (Postgres) for direct access from the * current IP address of the "client" machine where this pg_autoctl command is * being run. */ bool azure_create_region(AzureRegionResources *azRegion) { AzureRegionResources azRegionFound = { 0 }; /* * Fetch Azure objects that might have already been created in the target * resource group, we're going to re-use them, allowing the command to be * run several times in a row and then "fix itself", or at least continue * from where it failed. 
*/ if (!dryRun) { if (!azure_fetch_resource_list(azRegion->group, &azRegionFound)) { /* errors have already been logged */ return false; } } /* * First create the resource group in the target location. */ if (!azure_create_group(azRegion->group, azRegion->location)) { /* errors have already been logged */ return false; } /* never skip a step when --script is used */ if (dryRun || IS_EMPTY_STRING_BUFFER(azRegionFound.vnet)) { if (!azure_create_vnet(azRegion->group, azRegion->vnet, azRegion->vnetPrefix)) { /* errors have already been logged */ return false; } } else { log_info("Skipping creation of vnet \"%s\" which already exist", azRegion->vnet); } /* * Get our IP address as seen by the outside world. */ if (!azure_get_remote_ip(azRegion->ipAddress, sizeof(azRegion->ipAddress))) { /* errors have already been logged */ return false; } /* * Create the network security group. */ if (dryRun || IS_EMPTY_STRING_BUFFER(azRegionFound.nsg)) { if (!azure_create_nsg(azRegion->group, azRegion->nsg)) { /* errors have already been logged */ return false; } } else { log_info("Skipping creation of nsg \"%s\" which already exist", azRegion->nsg); } /* * Create the network security rules for SSH and Postgres protocols. * * Some objects won't show up in the list from azure_fetch_resource_list * and it would be quite surprising that we find everything but those, so * we skip their creation even though we don't see them in azRegionFound. */ if (dryRun || IS_EMPTY_STRING_BUFFER(azRegionFound.nsg)) { if (!azure_create_nsg_rule(azRegion->group, azRegion->nsg, azRegion->rule, azRegion->ipAddress)) { /* errors have already been logged */ return false; } } else { log_info("Skipping creation of nsg rule \"%s\", " "because nsg \"%s\" already exists", azRegion->rule, azRegion->nsg); } /* * Create the network subnet using previous network security group. 
*/ if (dryRun || IS_EMPTY_STRING_BUFFER(azRegionFound.vnet)) { if (!azure_create_subnet(azRegion->group, azRegion->vnet, azRegion->subnet, azRegion->subnetPrefix, azRegion->nsg)) { /* errors have already been logged */ return false; } } else { log_info("Skipping creation of subnet \"%s\" for prefix \"%s\", " "because vnet \"%s\" already exists", azRegion->subnet, azRegion->subnetPrefix, azRegion->vnet); } /* * Now is time to create the virtual machines. */ return azure_provision_nodes(azRegion); } /* * azure_drop_region runs the command az group delete --name ... --yes */ bool azure_drop_region(AzureRegionResources *azRegion) { return az_group_delete(azRegion->group); } /* * azure_provision_nodes creates the pg_autoctl VM nodes that we need, and * provision them with our provisioning script. */ bool azure_provision_nodes(AzureRegionResources *azRegion) { if (!azure_fetch_ip_addresses(azRegion->group, azRegion->vmArray)) { /* errors have already been logged */ return false; } if (azRegion->monitor > 0 || azRegion->nodes > 0) { /* * Here we run the following commands: * * $ az vm create --name a & * $ az vm create --name b & * $ wait * * $ az vm run-command invoke --name a --scripts ... & * $ az vm run-command invoke --name b --scripts ... & * $ wait * * We could optimize our code so that we run the provisioning scripts * for a VM as soon as it's been created, without having to wait until * the other VMs are created. Two things to keep in mind, though: * * - overall, being cleverer here might not be a win as we're going to * have to wait until all the VMs are provisioned anyway * * - in dry-run mode (--script), we still want to produce the more * naive script as shown above, for lack of known advanced control * structures in the target shell (we don't require a specific one). 
*/ if (!azure_create_vms(azRegion, "debian", "ha-admin")) { /* errors have already been logged */ return false; } if (!azure_provision_vms(azRegion, azRegion->fromSource)) { /* errors have already been logged */ return false; } /* * When provisioning from sources, after the OS related steps in * azure_provision_vms, we still need to upload our local sources (this * requires rsync to have been installed in the previous step), and to * build our software from same sources. */ if (azRegion->fromSource) { if (!azure_rsync_vms(azRegion)) { /* errors have already been logged */ return false; } return azure_build_pg_autoctl(azRegion); } } return true; } /* * azure_deploy_monitor deploys pg_autoctl on a monitor node, running both the * pg_autoctl create monitor command and then the systemd integration commands. */ bool azure_deploy_monitor(AzureRegionResources *azRegion) { KeyVal env = { 0 }; char create_monitor[BUFSIZE] = { 0 }; char *systemd = "pg_autoctl -q show systemd --pgdata /home/ha-admin/monitor " "> pgautofailover.service; " "sudo mv pgautofailover.service /etc/systemd/system; " "sudo systemctl daemon-reload; " "sudo systemctl enable pgautofailover; " "sudo systemctl start pgautofailover"; bool tty = false; char *host = azRegion->vmArray[0].public; if (!azure_prepare_target_versions(&env)) { /* errors have already been logged */ return false; } /* build pg_autoctl create monitor command with target Postgres version */ sformat(create_monitor, sizeof(create_monitor), "pg_autoctl create monitor " "--auth trust " "--ssl-self-signed " "--pgdata /home/ha-admin/monitor " "--pgctl /usr/lib/postgresql/%s/bin/pg_ctl", /* AZ_PG_VERSION */ env.values[0]); if (azRegion->monitor == 0) { /* no monitor to deploy, we're done already */ return true; } /* the monitor is always at index 0 in the vmArray */ if (!run_ssh_command("ha-admin", host, tty, create_monitor)) { /* errors have already been logged */ return false; } if (!run_ssh_command("ha-admin", host, tty, systemd)) { /* 
errors have already been logged */ return false; } return true; } /* * azure_deploy_postgres deploys pg_autoctl on a Postgres node, running both * the pg_autoctl create postgres command and then the systemd integration * commands. */ bool azure_deploy_postgres(AzureRegionResources *azRegion, int vmIndex) { KeyVal env = { 0 }; char create_postgres[BUFSIZE] = { 0 }; char *systemd = "pg_autoctl -q show systemd --pgdata /home/ha-admin/pgdata " "> pgautofailover.service; " "sudo mv pgautofailover.service /etc/systemd/system; " "sudo systemctl daemon-reload; " "sudo systemctl enable pgautofailover; " "sudo systemctl start pgautofailover"; bool tty = false; char *host = azRegion->vmArray[vmIndex].public; if (!azure_prepare_target_versions(&env)) { /* errors have already been logged */ return false; } /* build pg_autoctl create monitor command with target Postgres version */ sformat(create_postgres, sizeof(create_postgres), "pg_autoctl create postgres " "--pgctl /usr/lib/postgresql/%s/bin/pg_ctl " "--pgdata /home/ha-admin/pgdata " "--auth trust " "--ssl-self-signed " "--username ha-admin " "--dbname appdb " "--hostname %s " "--name %s-%c " "--monitor " "'postgres://autoctl_node@%s/pg_auto_failover?sslmode=require'", /* AZ_PG_VERSION */ env.values[0], azRegion->vmArray[vmIndex].private, azRegion->region, 'a' + vmIndex - 1, azRegion->vmArray[0].private); /* the monitor is always at index 0 in the vmArray */ if (!run_ssh_command("ha-admin", host, tty, create_postgres)) { /* errors have already been logged */ return false; } if (!run_ssh_command("ha-admin", host, tty, systemd)) { /* errors have already been logged */ return false; } return true; } /* * azure_create_nodes run the pg_autoctl commands that create our nodes, and * then register them with systemd on the remote VMs. 
*/ bool azure_create_nodes(AzureRegionResources *azRegion) { bool success = true; if (!azure_fetch_ip_addresses(azRegion->group, azRegion->vmArray)) { /* errors have already been logged */ return false; } if (azRegion->monitor > 0) { success = success && azure_deploy_monitor(azRegion); } /* * Now prepare all the other nodes, one at a time, so that we have a the * primary, etc. It could also be all at once, but one at a time is good * for a tutorial. */ for (int vmIndex = 1; vmIndex <= azRegion->nodes; vmIndex++) { success = success && azure_deploy_postgres(azRegion, vmIndex); } return success; } /* * azure_deploy_vm deploys a vm given by name ("monitor", "a", ...). */ bool azure_deploy_vm(AzureRegionResources *azRegion, const char *vmName) { int vmIndex = -1; if (!azure_fetch_ip_addresses(azRegion->group, azRegion->vmArray)) { /* errors have already been logged */ return false; } vmIndex = azure_node_index_from_name(azRegion->group, vmName); switch (vmIndex) { case -1: { /* errors have already been logged */ return false; } case 0: { return azure_deploy_monitor(azRegion); } default: { return azure_deploy_postgres(azRegion, vmIndex); } } } /* * azure_ls lists the azure resources we created in a specific resource group. */ bool azure_ls(AzureRegionResources *azRegion) { return azure_resource_list(azRegion->group); } /* * azure_show_ips shows the azure ip addresses for the VMs we created in a * specific resource group. */ bool azure_show_ips(AzureRegionResources *azRegion) { return azure_show_ip_addresses(azRegion->group); } /* * azure_ssh runs the ssh -l ha-admin command for given * node in given azure group, identified as usual with a prefix and a name. 
*/ bool azure_ssh(AzureRegionResources *azRegion, const char *vm) { /* return azure_vm_ssh_command(groupName, vm, true, "watch date -R"); */ return azure_vm_ssh(azRegion->group, vm); } /* * azure_ssh_command runs the ssh -l ha-admin for * given node in given azure group, identified as usual with a prefix and a * name. */ bool azure_ssh_command(AzureRegionResources *azRegion, const char *vm, bool tty, const char *command) { return azure_vm_ssh_command(azRegion->group, vm, tty, command); } /* * azure_sync_source_dir runs rsync in parallel to all the created VMs. */ bool azure_sync_source_dir(AzureRegionResources *azRegion) { if (!azure_fetch_ip_addresses(azRegion->group, azRegion->vmArray)) { /* errors have already been logged */ return false; } if (!azure_rsync_vms(azRegion)) { /* errors have already been logged */ return false; } return azure_build_pg_autoctl(azRegion); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/azure.h000066400000000000000000000105401414244367200220540ustar00rootroot00000000000000/* * src/bin/pg_autoctl/azure.h * Implementation of a CLI which lets you call `az` cli commands to prepare * a pg_auto_failover demo or QA environment. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #ifndef AZURE_H #define AZURE_H #include #include "postgres_fe.h" #include "pqexpbuffer.h" #include "defaults.h" #include "parsing.h" /* global variables from azure.c */ extern bool dryRun; extern PQExpBuffer azureScript; extern char azureCLI[MAXPGPATH]; /* command line parsing */ typedef struct AzureOptions { char prefix[NAMEDATALEN]; char region[NAMEDATALEN]; char location[NAMEDATALEN]; int nodes; int cidr; bool fromSource; bool appNode; bool monitor; bool all; bool watch; } AzureOptions; #define MAX_VMS_PER_REGION 28 /* monitor, then pg ndoes [a-z], then app */ typedef struct AzureVMipAddresses { char name[NAMEDATALEN]; char public[BUFSIZE]; char private[BUFSIZE]; } AzureVMipAddresses; typedef struct AzureRegionResources { char filename[MAXPGPATH]; /* on-disk configuration file path */ char prefix[NAMEDATALEN]; /* ha-demo-dim- */ char region[NAMEDATALEN]; /* nickname, such as paris */ char location[NAMEDATALEN]; /* francecentral, eastus, etc */ char group[NAMEDATALEN]; /* ha-demo-dim-paris */ char vnet[BUFSIZE]; /* ha-demo-dim-paris-vnet */ char nsg[BUFSIZE]; /* ha-demo-dim-paris-nsg */ char rule[BUFSIZE]; /* ha-demo-dim-paris-ssh-and-pg */ char subnet[BUFSIZE]; /* ha-demo-dim-paris-subnet */ char vnetPrefix[BUFSIZE]; /* 10.%d.0.0/16 */ char subnetPrefix[BUFSIZE]; /* 10.%d.%d.0/24 */ char ipAddress[BUFSIZE]; /* our IP address as seen from the outside */ int monitor; /* do we want a monitor in that region? 
*/ int appNodes; /* application nodes count */ int nodes; /* node count */ bool fromSource; AzureVMipAddresses vmArray[MAX_VMS_PER_REGION]; } AzureRegionResources; bool azure_psleep(int count, bool force); bool azure_get_remote_ip(char *ipAddress, size_t ipAddressSize); bool azure_create_group(const char *name, const char *location); bool azure_create_vnet(const char *group, const char *name, const char *prefix); bool azure_create_nsg(const char *group, const char *name); bool azure_create_nsg_rule(const char *group, const char *nsgName, const char *name, const char *prefixes); bool azure_create_subnet(const char *group, const char *vnet, const char *name, const char *prefixes, const char *nsg); bool az_group_delete(const char *group); bool azure_create_vm(AzureRegionResources *azRegion, const char *name, const char *image, const char *username); bool azure_create_vms(AzureRegionResources *azRegion, const char *image, const char *username); bool azure_prepare_target_versions(KeyVal *env); bool azure_provision_vm(const char *group, const char *name, bool fromSource); bool azure_provision_vms(AzureRegionResources *azRegion, bool fromSource); bool azure_fetch_ip_addresses(const char *group, AzureVMipAddresses *vmArray); bool azure_resource_list(const char *group); bool azure_show_ip_addresses(const char *group); bool azure_vm_ssh(const char *group, const char *vm); bool azure_vm_ssh_command(const char *group, const char *vm, bool tty, const char *command); bool azure_create_region(AzureRegionResources *azRegion); bool azure_drop_region(AzureRegionResources *azRegion); bool azure_provision_nodes(AzureRegionResources *azRegion); bool azure_deploy_monitor(AzureRegionResources *azRegion); bool azure_deploy_postgres(AzureRegionResources *azRegion, int vmIndex); bool azure_deploy_vm(AzureRegionResources *azRegion, const char *vmName); bool azure_create_nodes(AzureRegionResources *azRegion); bool azure_ls(AzureRegionResources *azRegion); bool 
azure_show_ips(AzureRegionResources *azRegion); bool azure_ssh(AzureRegionResources *azRegion, const char *vm); bool azure_ssh_command(AzureRegionResources *azRegion, const char *vm, bool tty, const char *command); bool azure_sync_source_dir(AzureRegionResources *azRegion); /* src/bin/pg_autoctl/cli_do_tmux_azure.c */ bool tmux_azure_start_or_attach_session(AzureRegionResources *azRegion); bool tmux_azure_kill_session(AzureRegionResources *azRegion); #endif /* AZURE_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/azure_config.c000066400000000000000000000130251414244367200233750ustar00rootroot00000000000000/* * src/bin/pg_autoctl/azure_config.c * Configuration file for azure QA/test environments * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include "postgres_fe.h" #include "azure.h" #include "azure_config.h" #include "cli_root.h" #include "config.h" #include "defaults.h" #include "ini_file.h" #include "file_utils.h" #include "log.h" #include "string_utils.h" #define OPTION_AZURE_PREFIX(config) \ make_strbuf_option_default("az", "prefix", "prefix", true, NAMEDATALEN, \ config->prefix, "ha-demo-") #define OPTION_AZURE_REGION(config) \ make_strbuf_option_default("az", "region", "region", true, NAMEDATALEN, \ config->region, "paris") #define OPTION_AZURE_LOCATION(config) \ make_strbuf_option_default("az", "location", "location", true, NAMEDATALEN, \ config->location, "francecentral") #define OPTION_AZURE_MONITOR(config) \ make_int_option_default("group", "monitor", "monitor", true, \ &(config->monitor), 1) #define OPTION_AZURE_NODES(config) \ make_int_option_default("group", "nodes", "nodes", true, \ &(config->nodes), 2) #define OPTION_AZURE_APP_NODES(config) \ make_int_option_default("group", "appNodes", NULL, true, \ &(config->appNodes), 0) #define OPTION_AZURE_GROUP(config) \ make_strbuf_option("resource", "group", NULL, false, NAMEDATALEN, \ config->group) #define 
OPTION_AZURE_VNET(config) \ make_strbuf_option("resource", "vnet", NULL, false, NAMEDATALEN, \ config->vnet) #define OPTION_AZURE_NSG(config) \ make_strbuf_option("resource", "nsg", NULL, false, NAMEDATALEN, \ config->nsg) #define OPTION_AZURE_RULE(config) \ make_strbuf_option("resource", "rule", NULL, false, NAMEDATALEN, \ config->rule) #define OPTION_AZURE_SUBNET(config) \ make_strbuf_option("resource", "subnet", NULL, false, NAMEDATALEN, \ config->subnet) #define SET_INI_OPTIONS_ARRAY(config) \ { \ OPTION_AZURE_PREFIX(config), \ OPTION_AZURE_REGION(config), \ OPTION_AZURE_LOCATION(config), \ OPTION_AZURE_MONITOR(config), \ OPTION_AZURE_NODES(config), \ OPTION_AZURE_APP_NODES(config), \ OPTION_AZURE_GROUP(config), \ OPTION_AZURE_VNET(config), \ OPTION_AZURE_NSG(config), \ OPTION_AZURE_RULE(config), \ OPTION_AZURE_SUBNET(config), \ INI_OPTION_LAST \ } /* * azure_config_read_file reads our azure configuration from an INI * configuration file that has been previously created by our pg_autoctl do * azure commands. */ bool azure_config_read_file(AzureRegionResources *azRegion) { IniOption azureOptions[] = SET_INI_OPTIONS_ARRAY(azRegion); log_debug("Reading azure configuration from %s", azRegion->filename); if (!read_ini_file(azRegion->filename, azureOptions)) { log_error("Failed to parse azure configuration file \"%s\"", azRegion->filename); return false; } return true; } /* * azure_config_write write the current azure config to given STREAM. */ bool azure_config_write(FILE *stream, AzureRegionResources *azRegion) { IniOption azureOptions[] = SET_INI_OPTIONS_ARRAY(azRegion); return write_ini_to_stream(stream, azureOptions); } /* * azure_config_write_file writes the current values in given azRegion to the * given filename. 
*/ bool azure_config_write_file(AzureRegionResources *azRegion) { bool success = false; FILE *fileStream = NULL; log_trace("azure_config_write_file \"%s\"", azRegion->filename); fileStream = fopen_with_umask(azRegion->filename, "w", FOPEN_FLAGS_W, 0644); if (fileStream == NULL) { /* errors have already been logged */ return false; } success = azure_config_write(fileStream, azRegion); if (fclose(fileStream) == EOF) { log_error("Failed to write file \"%s\"", azRegion->filename); return false; } return success; } /* * azure_config_prepare prepares the names we use for the different * Azure network objects that we need: vnet, nsg, nsgrule, subnet. */ void azure_config_prepare(AzureOptions *options, AzureRegionResources *azRegion) { /* build the path to our configuration file on-disk */ if (!build_xdg_path(azRegion->filename, XDG_CONFIG, ".", "azure.cfg")) { log_fatal("Failed to prepare azure configuration filename"); exit(EXIT_CODE_INTERNAL_ERROR); } strlcpy(azRegion->prefix, options->prefix, sizeof(azRegion->prefix)); strlcpy(azRegion->region, options->region, sizeof(azRegion->region)); strlcpy(azRegion->location, options->location, sizeof(azRegion->location)); sformat(azRegion->group, sizeof(azRegion->group), "%s-%s", options->prefix, options->region); /* * Prepare our Azure object names from the group objects: vnet, subnet, * nsg, nsg rule. */ sformat(azRegion->vnet, sizeof(azRegion->vnet), "%s-net", azRegion->group); sformat(azRegion->nsg, sizeof(azRegion->nsg), "%s-nsg", azRegion->group); sformat(azRegion->rule, sizeof(azRegion->rule), "%s-ssh-and-pg", azRegion->group); sformat(azRegion->subnet, sizeof(azRegion->subnet), "%s-subnet", azRegion->group); /* transform --monitor and --no-app booleans into integer counts */ azRegion->monitor = options->monitor ? 1 : 0; azRegion->appNodes = options->appNode ? 1 : 0; azRegion->nodes = options->nodes; azRegion->fromSource = options->fromSource; /* * Prepare vnet and subnet IP addresses prefixes. 
*/ sformat(azRegion->vnetPrefix, sizeof(azRegion->vnetPrefix), "10.%d.0.0/16", options->cidr); sformat(azRegion->subnetPrefix, sizeof(azRegion->subnetPrefix), "10.%d.%d.0/24", options->cidr, options->cidr); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/azure_config.h000066400000000000000000000014621414244367200234040ustar00rootroot00000000000000/* * src/bin/pg_autoctl/azure_config.h * Implementation of a CLI which lets you call `az` cli commands to prepare * a pg_auto_failover demo or QA environment. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef AZURE_CONFIG_H #define AZURE_CONFIG_H #include #include "postgres_fe.h" #include "pqexpbuffer.h" #include "azure.h" #include "defaults.h" bool azure_config_read_file(AzureRegionResources *azRegion); bool azure_config_write(FILE *stream, AzureRegionResources *azRegion); bool azure_config_write_file(AzureRegionResources *azRegion); void azure_config_prepare(AzureOptions *options, AzureRegionResources *azRegion); bool azure_get_remote_ip(char *ipAddress, size_t ipAddressSize); #endif /* AZURE_CONFIG_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_common.c000066400000000000000000001522471414244367200230530ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_common.c * Implementation of a CLI which lets you run individual keeper routines * directly * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include "commandline.h" #include "cli_common.h" #include "cli_root.h" #include "commandline.h" #include "env_utils.h" #include "ipaddr.h" #include "keeper.h" #include "keeper_config.h" #include "log.h" #include "monitor.h" #include "monitor_config.h" #include "parsing.h" #include "pgsetup.h" #include "pgsql.h" #include "pidfile.h" #include "state.h" #include "string_utils.h" /* handle command line options for our setup. 
*/ KeeperConfig keeperOptions; bool createAndRun = false; bool outputJSON = false; bool openAppHBAonLAN = false; int ssl_flag = 0; /* stores --node-id, only used with --disable-monitor */ int monitorDisabledNodeId = -1; /* * cli_common_keeper_getopts parses the CLI options for the pg_autoctl create * postgres command, and others such as pg_autoctl do discover. An example of a * long_options parameter would look like this: * * static struct option long_options[] = { * { "pgctl", required_argument, NULL, 'C' }, * { "pgdata", required_argument, NULL, 'D' }, * { "pghost", required_argument, NULL, 'H' }, * { "pgport", required_argument, NULL, 'p' }, * { "listen", required_argument, NULL, 'l' }, * { "proxyport", required_argument, NULL, 'y' }, * { "username", required_argument, NULL, 'U' }, * { "auth", required_argument, NULL, 'A' }, * { "skip-pg-hba", no_argument, NULL, 'S' }, * { "pg-hba-lan", no_argument, NULL, 'L' }, * { "dbname", required_argument, NULL, 'd' }, * { "name", required_argument, NULL, 'a' }, * { "hostname", required_argument, NULL, 'n' }, * { "formation", required_argument, NULL, 'f' }, * { "group", required_argument, NULL, 'g' }, * { "monitor", required_argument, NULL, 'm' }, * { "node-id", required_argument, NULL, 'I' }, * { "disable-monitor", no_argument, NULL, 'M' }, * { "version", no_argument, NULL, 'V' }, * { "verbose", no_argument, NULL, 'v' }, * { "quiet", no_argument, NULL, 'q' }, * { "help", no_argument, NULL, 'h' }, * { "secondary", no_argument, NULL, 'z' } * { "citus-cluster", required_argument, NULL, 'Z' }, * { "candidate-priority", required_argument, NULL, 'P'}, * { "replication-quorum", required_argument, NULL, 'r'}, * { "maximum-backup-rate", required_argument, NULL, 'R' }, * { "help", no_argument, NULL, 0 }, * { "run", no_argument, NULL, 'x' }, * { "ssl-self-signed", no_argument, NULL, 's' }, * { "no-ssl", no_argument, NULL, 'N' }, * { "ssl-ca-file", required_argument, &ssl_flag, SSL_CA_FILE_FLAG }, * { "server-cert", required_argument, 
&ssl_flag, SSL_SERVER_CRT_FLAG }, * { "server-key", required_argument, &ssl_flag, SSL_SERVER_KEY_FLAG }, * { "ssl-mode", required_argument, &ssl_flag, SSL_MODE_FLAG }, * { NULL, 0, NULL, 0 } * }; * */ int cli_common_keeper_getopts(int argc, char **argv, struct option *long_options, const char *optstring, KeeperConfig *options, SSLCommandLineOptions *sslCommandLineOptions) { KeeperConfig LocalOptionConfig = { 0 }; int c, option_index = 0, errors = 0; int verboseCount = 0; /* force some non-zero default values */ LocalOptionConfig.monitorDisabled = false; LocalOptionConfig.groupId = -1; LocalOptionConfig.network_partition_timeout = -1; LocalOptionConfig.prepare_promotion_catchup = -1; LocalOptionConfig.prepare_promotion_walreceiver = -1; LocalOptionConfig.postgresql_restart_failure_timeout = -1; LocalOptionConfig.postgresql_restart_failure_max_retries = -1; LocalOptionConfig.pgSetup.settings.candidatePriority = -1; LocalOptionConfig.pgSetup.settings.replicationQuorum = FAILOVER_NODE_REPLICATION_QUORUM; /* default to a "primary" in citus node_role terms */ LocalOptionConfig.citusRole = CITUS_ROLE_PRIMARY; optind = 0; while ((c = getopt_long(argc, argv, optstring, long_options, &option_index)) != -1) { /* * The switch statement is ready for all the common letters of the * different nodes that `pg_autoctl create` knows how to deal with. The * parameter optstring restrict which letters we are going to actually * parsed, and there's no command that has all of them. 
*/ switch (c) { case 'C': { /* { "pgctl", required_argument, NULL, 'C' } */ strlcpy(LocalOptionConfig.pgSetup.pg_ctl, optarg, MAXPGPATH); log_trace("--pg_ctl %s", LocalOptionConfig.pgSetup.pg_ctl); break; } case 'D': { /* { "pgdata", required_argument, NULL, 'D' } */ strlcpy(LocalOptionConfig.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", LocalOptionConfig.pgSetup.pgdata); break; } case 'H': { /* { "pghost", required_argument, NULL, 'h' } */ strlcpy(LocalOptionConfig.pgSetup.pghost, optarg, _POSIX_HOST_NAME_MAX); log_trace("--pghost %s", LocalOptionConfig.pgSetup.pghost); break; } case 'p': { /* { "pgport", required_argument, NULL, 'p' } */ if (!stringToInt(optarg, &LocalOptionConfig.pgSetup.pgport)) { log_error("Failed to parse --pgport number \"%s\"", optarg); errors++; } log_trace("--pgport %d", LocalOptionConfig.pgSetup.pgport); break; } case 'l': { /* { "listen", required_argument, NULL, 'l' } */ strlcpy(LocalOptionConfig.pgSetup.listen_addresses, optarg, MAXPGPATH); log_trace("--listen %s", LocalOptionConfig.pgSetup.listen_addresses); break; } case 'y': { /* { "proxyport", required_argument, NULL,'y' } */ if (!stringToInt(optarg, &LocalOptionConfig.pgSetup.proxyport)) { log_error("Failed to parse --proxyport number \"%s\"", optarg); errors++; } log_trace("--proxy %d", LocalOptionConfig.pgSetup.proxyport); break; } case 'z': { /* { "secondary", no_argument, NULL, 'z' } */ strlcpy(LocalOptionConfig.citusRoleStr, "secondary", NAMEDATALEN); LocalOptionConfig.citusRole = CITUS_ROLE_SECONDARY; log_trace("--secondary"); break; } case 'Z': { /* { "citus-cluster", required_argument, NULL, 'Z' }, */ strlcpy(LocalOptionConfig.pgSetup.citusClusterName, optarg, NAMEDATALEN); log_trace("--citus-cluster %s", LocalOptionConfig.pgSetup.citusClusterName); break; } case 'U': { /* { "username", required_argument, NULL, 'U' } */ strlcpy(LocalOptionConfig.pgSetup.username, optarg, NAMEDATALEN); log_trace("--username %s", LocalOptionConfig.pgSetup.username); break; } 
case 'A': { /* { "auth", required_argument, NULL, 'A' }, */ if (LocalOptionConfig.pgSetup.hbaLevel == HBA_EDIT_SKIP) { errors++; log_error("Please use either --auth or --skip-pg-hba"); } strlcpy(LocalOptionConfig.pgSetup.authMethod, optarg, NAMEDATALEN); log_trace("--auth %s", LocalOptionConfig.pgSetup.authMethod); if (LocalOptionConfig.pgSetup.hbaLevel == HBA_EDIT_UNKNOWN) { strlcpy(LocalOptionConfig.pgSetup.hbaLevelStr, pgsetup_hba_level_to_string(HBA_EDIT_MINIMAL), sizeof(LocalOptionConfig.pgSetup.hbaLevelStr)); LocalOptionConfig.pgSetup.hbaLevel = HBA_EDIT_MINIMAL; } break; } case 'S': { /* { "skip-pg-hba", required_argument, NULL, 'S' }, */ if (!IS_EMPTY_STRING_BUFFER(LocalOptionConfig.pgSetup.authMethod)) { errors++; log_error("Please use either --auth or --skip-pg-hba"); } /* force default authentication method then */ strlcpy(LocalOptionConfig.pgSetup.authMethod, DEFAULT_AUTH_METHOD, NAMEDATALEN); strlcpy(LocalOptionConfig.pgSetup.hbaLevelStr, pgsetup_hba_level_to_string(HBA_EDIT_SKIP), sizeof(LocalOptionConfig.pgSetup.hbaLevelStr)); LocalOptionConfig.pgSetup.hbaLevel = HBA_EDIT_SKIP; log_trace("--skip-pg-hba"); break; } case 'L': { /* { "pg-hba-lan", required_argument, NULL, 'L' }, */ if (LocalOptionConfig.pgSetup.hbaLevel == HBA_EDIT_SKIP) { errors++; log_error("Please use either --skip-pg-hba or --pg-hba-lan"); } strlcpy(LocalOptionConfig.pgSetup.hbaLevelStr, pgsetup_hba_level_to_string(HBA_EDIT_LAN), sizeof(LocalOptionConfig.pgSetup.hbaLevelStr)); LocalOptionConfig.pgSetup.hbaLevel = HBA_EDIT_LAN; log_trace("--pg-hba-lan"); break; } case 'd': { /* { "dbname", required_argument, NULL, 'd' } */ strlcpy(LocalOptionConfig.pgSetup.dbname, optarg, NAMEDATALEN); log_trace("--dbname %s", LocalOptionConfig.pgSetup.dbname); break; } case 'a': { /* { "name", required_argument, NULL, 'a' }, */ strlcpy(LocalOptionConfig.name, optarg, _POSIX_HOST_NAME_MAX); log_trace("--name %s", LocalOptionConfig.name); break; } case 'n': { /* { "hostname", required_argument, NULL, 
'n' } */ strlcpy(LocalOptionConfig.hostname, optarg, _POSIX_HOST_NAME_MAX); log_trace("--hostname %s", LocalOptionConfig.hostname); break; } case 'f': { /* { "formation", required_argument, NULL, 'f' } */ strlcpy(LocalOptionConfig.formation, optarg, NAMEDATALEN); log_trace("--formation %s", LocalOptionConfig.formation); break; } case 'g': { /* { "group", required_argument, NULL, 'g' } */ if (!stringToInt(optarg, &LocalOptionConfig.groupId)) { log_fatal("--group argument is not a valid group ID: \"%s\"", optarg); exit(EXIT_CODE_BAD_ARGS); } log_trace("--group %d", LocalOptionConfig.groupId); break; } case 'm': { /* { "monitor", required_argument, NULL, 'm' } */ if (!validate_connection_string(optarg)) { log_fatal("Failed to parse --monitor connection string, " "see above for details."); exit(EXIT_CODE_BAD_ARGS); } strlcpy(LocalOptionConfig.monitor_pguri, optarg, MAXCONNINFO); log_trace("--monitor %s", LocalOptionConfig.monitor_pguri); break; } case 'M': { /* { "disable-monitor", required_argument, NULL, 'M' }, */ LocalOptionConfig.monitorDisabled = true; log_trace("--disable-monitor"); break; } case 'I': { /* { "node-id", required_argument, NULL, 'I' }, */ if (!stringToInt(optarg, &monitorDisabledNodeId)) { log_fatal("--node-id argument is not a valid ID: \"%s\"", optarg); exit(EXIT_CODE_BAD_ARGS); } log_trace("--node-id %d", monitorDisabledNodeId); break; } case 'P': { /* { "candidate-priority", required_argument, NULL, 'P'} */ int candidatePriority = strtol(optarg, NULL, 10); if (errno == EINVAL || candidatePriority < 0 || candidatePriority > 100) { log_fatal("--candidate-priority argument is not valid." " Valid values are integers from 0 to 100. 
"); exit(EXIT_CODE_BAD_ARGS); } LocalOptionConfig.pgSetup.settings.candidatePriority = candidatePriority; log_trace("--candidate-priority %d", candidatePriority); break; } case 'r': { /* { "replication-quorum", required_argument, NULL, 'r'} */ bool replicationQuorum = false; if (!parse_bool(optarg, &replicationQuorum)) { log_fatal("--replication-quorum argument is not valid." " Valid values are \"true\" or \"false\"."); exit(EXIT_CODE_BAD_ARGS); } LocalOptionConfig.pgSetup.settings.replicationQuorum = replicationQuorum; log_trace("--replication-quorum %s", boolToString(replicationQuorum)); break; } case 'R': { /* { "maximum-backup-rate", required_argument, NULL, 'R' } */ strlcpy(LocalOptionConfig.maximum_backup_rate, optarg, MAXIMUM_BACKUP_RATE_LEN); log_trace("--maximum-backup-rate %s", LocalOptionConfig.maximum_backup_rate); break; } case 'V': { /* keeper_cli_print_version prints version and exits. */ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } case 'x': { /* { "run", no_argument, NULL, 'x' }, */ createAndRun = true; log_trace("--run"); break; } case 's': { /* { "ssl-self-signed", no_argument, NULL, 's' }, */ if (!cli_getopt_accept_ssl_options(SSL_CLI_SELF_SIGNED, *sslCommandLineOptions)) { errors++; break; } *sslCommandLineOptions = SSL_CLI_SELF_SIGNED; LocalOptionConfig.pgSetup.ssl.active = 1; LocalOptionConfig.pgSetup.ssl.createSelfSignedCert = true; log_trace("--ssl-self-signed"); break; } case 'N': { /* { "no-ssl", no_argument, NULL, 'N' }, */ if (!cli_getopt_accept_ssl_options(SSL_CLI_NO_SSL, *sslCommandLineOptions)) { errors++; break; } *sslCommandLineOptions = SSL_CLI_NO_SSL; LocalOptionConfig.pgSetup.ssl.active = 0; 
LocalOptionConfig.pgSetup.ssl.createSelfSignedCert = false; log_trace("--no-ssl"); break; } /* * { "ssl-ca-file", required_argument, &ssl_flag, SSL_CA_FILE_FLAG } * { "ssl-crl-file", required_argument, &ssl_flag, SSL_CA_FILE_FLAG } * { "server-cert", required_argument, &ssl_flag, SSL_SERVER_CRT_FLAG } * { "server-key", required_argument, &ssl_flag, SSL_SERVER_KEY_FLAG } * { "ssl-mode", required_argument, &ssl_flag, SSL_MODE_FLAG }, */ case 0: { if (ssl_flag != SSL_MODE_FLAG) { if (!cli_getopt_accept_ssl_options(SSL_CLI_USER_PROVIDED, *sslCommandLineOptions)) { errors++; break; } *sslCommandLineOptions = SSL_CLI_USER_PROVIDED; LocalOptionConfig.pgSetup.ssl.active = 1; } if (!cli_getopt_ssl_flags(ssl_flag, optarg, &(LocalOptionConfig.pgSetup))) { errors++; } break; } default: { /* getopt_long already wrote an error message */ errors++; break; } } } /* check --disable-monitor and --node-id */ if (LocalOptionConfig.monitorDisabled && monitorDisabledNodeId == -1) { log_fatal("When using --disable-monitor, also use --node-id"); exit(EXIT_CODE_BAD_ARGS); } if (!LocalOptionConfig.monitorDisabled && monitorDisabledNodeId != -1) { log_fatal("Option --node-id is only accepted with --disable-monitor"); exit(EXIT_CODE_BAD_ARGS); } /* check --secondary and --candidate-priority */ if (LocalOptionConfig.pgSetup.settings.candidatePriority == -1) { /* --candidate-priority has not been used */ if (LocalOptionConfig.citusRole == CITUS_ROLE_SECONDARY) { /* a Citus secondary can't be a target for failover */ LocalOptionConfig.pgSetup.settings.candidatePriority = 0; } else { /* here we install the default candidate priority */ LocalOptionConfig.pgSetup.settings.candidatePriority = FAILOVER_NODE_CANDIDATE_PRIORITY; } } else if (LocalOptionConfig.pgSetup.settings.candidatePriority > 0 && LocalOptionConfig.citusRole == CITUS_ROLE_SECONDARY) { log_fatal("Citus does not support secondary roles that are " "also a candidate for failover: please use --secondary " "with --candidate-priority 0"); 
exit(EXIT_CODE_BAD_ARGS); } /* a --secondary citus worker requires a cluster name */ if (LocalOptionConfig.citusRole == CITUS_ROLE_SECONDARY) { if (IS_EMPTY_STRING_BUFFER(LocalOptionConfig.pgSetup.citusClusterName)) { log_fatal("When using --secondary, also use --citus-cluster"); exit(EXIT_CODE_BAD_ARGS); } } if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } /* the default HBA editing level is MINIMAL, time to install it */ if (LocalOptionConfig.pgSetup.hbaLevel == HBA_EDIT_UNKNOWN) { strlcpy(LocalOptionConfig.pgSetup.hbaLevelStr, pgsetup_hba_level_to_string(HBA_EDIT_MINIMAL), sizeof(LocalOptionConfig.pgSetup.hbaLevelStr)); LocalOptionConfig.pgSetup.hbaLevel = HBA_EDIT_MINIMAL; } /* * Now, all commands need PGDATA validation. */ cli_common_get_set_pgdata_or_exit(&(LocalOptionConfig.pgSetup)); /* * We have a PGDATA setting, prepare our configuration pathnames from it. */ if (!keeper_config_set_pathnames_from_pgdata( &(LocalOptionConfig.pathnames), LocalOptionConfig.pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } /* publish our option parsing now */ *options = LocalOptionConfig; return optind; } /* * cli_create_node_getopts parses the CLI options for the pg_autoctl create * command. 
An example of a long_options parameter would look like this: * * static struct option long_options[] = { * { "pgctl", required_argument, NULL, 'C' }, * { "pgdata", required_argument, NULL, 'D' }, * { "pghost", required_argument, NULL, 'H' }, * { "pgport", required_argument, NULL, 'p' }, * { "listen", required_argument, NULL, 'l' }, * { "proxyport", required_argument, NULL, 'y' }, * { "username", required_argument, NULL, 'U' }, * { "auth", required_argument, NULL, 'A' }, * { "skip-pg-hba", no_argument, NULL, 'S' }, * { "pg-hba-lan", no_argument, NULL, 'L' }, * { "dbname", required_argument, NULL, 'd' }, * { "hostname", required_argument, NULL, 'n' }, * { "formation", required_argument, NULL, 'f' }, * { "group", required_argument, NULL, 'g' }, * { "monitor", required_argument, NULL, 'm' }, * { "disable-monitor", no_argument, NULL, 'M' }, * { "version", no_argument, NULL, 'V' }, * { "verbose", no_argument, NULL, 'v' }, * { "quiet", no_argument, NULL, 'q' }, * { "help", no_argument, NULL, 'h' }, * { "secondary", no_argument, NULL, 'z' }, * { "citus-cluster", required_argument, NULL, 'Z' }, * { "candidate-priority", required_argument, NULL, 'P'}, * { "replication-quorum", required_argument, NULL, 'r'}, * { "help", no_argument, NULL, 0 }, * { "run", no_argument, NULL, 'x' }, * { "ssl-self-signed", no_argument, NULL, 's' }, * { "no-ssl", no_argument, NULL, 'N' }, * { "ssl-ca-file", required_argument, &ssl_flag, SSL_CA_FILE_FLAG }, * { "server-crt", required_argument, &ssl_flag, SSL_SERVER_CRT_FLAG }, * { "server-key", required_argument, &ssl_flag, SSL_SERVER_KEY_FLAG }, * { "ssl-mode", required_argument, &ssl_flag, SSL_MODE_FLAG }, * { NULL, 0, NULL, 0 } * }; * */ int cli_create_node_getopts(int argc, char **argv, struct option *long_options, const char *optstring, KeeperConfig *options) { SSLCommandLineOptions sslCommandLineOptions = SSL_CLI_UNKNOWN; optind = cli_common_keeper_getopts(argc, argv, long_options, optstring, options, &sslCommandLineOptions); /* * We require 
the user to specify an authentication mechanism, or to use * ---skip-pg-hba. Our documentation tutorial will use --auth trust, and we * should make it obvious that this is not the right choice for production. */ if (IS_EMPTY_STRING_BUFFER(options->pgSetup.authMethod)) { log_fatal("Please use either --auth trust|md5|... or --skip-pg-hba"); log_info("pg_auto_failover can be set to edit Postgres HBA rules " "automatically when needed. For quick testing '--auth trust' " "makes it easy to get started, " "consider another authentication mechanism for production."); exit(EXIT_CODE_BAD_ARGS); } /* * If we have --ssl-self-signed, we don't want to have --ssl-ca-file and * others in use anywhere. If we have --no-ssl, same thing. If we have the * SSL files setup, we want to have neither --ssl-self-signed nor the other * SSL files specified. * * We also need to either use the given sslMode or compute our default. */ if (sslCommandLineOptions == SSL_CLI_UNKNOWN) { log_fatal("Explicit SSL choice is required: please use either " "--ssl-self-signed or provide your certificates " "using --ssl-ca-file, --ssl-crl-file, " "--server-key, and --server-cert (or use --no-ssl if you " "are very sure that you do not want encrypted traffic)"); exit(EXIT_CODE_BAD_ARGS); } if (!pgsetup_validate_ssl_settings(&(options->pgSetup))) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } /* * You can't both have a monitor a use --disable-monitor. */ if (!IS_EMPTY_STRING_BUFFER(options->monitor_pguri) && options->monitorDisabled) { log_fatal("Use either --monitor or --disable-monitor, not both."); exit(EXIT_CODE_BAD_ARGS); } else if (!options->monitorDisabled) { if (IS_EMPTY_STRING_BUFFER(options->monitor_pguri) && !(env_exists(PG_AUTOCTL_MONITOR) && get_env_copy(PG_AUTOCTL_MONITOR, options->monitor_pguri, sizeof(options->monitor_pguri)))) { log_fatal("Failed to set the monitor URI: " "use either --monitor postgresql://... 
" "or --disable-monitor"); exit(EXIT_CODE_BAD_ARGS); } } else if (options->monitorDisabled) { /* * We must be able to restore this setup from the configuration file, * and for that we set the pg_autoctl.monitor URI in the file to the * "magic" value PG_AUTOCTL_DISABLED. */ strlcpy(options->monitor_pguri, PG_AUTOCTL_MONITOR_DISABLED, MAXCONNINFO); } return optind; } /* * cli_getopt_accept_ssl_options compute if we can accept the newSSLoption * (such as --no-ssl or --ssl-ca-file) given the previous one we have already * accepted. */ bool cli_getopt_accept_ssl_options(SSLCommandLineOptions newSSLOption, SSLCommandLineOptions currentSSLOptions) { if (currentSSLOptions == SSL_CLI_UNKNOWN) { /* first SSL option being parsed */ return true; } if (currentSSLOptions != newSSLOption) { if (currentSSLOptions == SSL_CLI_USER_PROVIDED || newSSLOption == SSL_CLI_USER_PROVIDED) { log_error( "Using either --no-ssl or --ssl-self-signed " "with user-provided SSL certificates " "is not supported"); return false; } /* * At this point we know that currentSSLOptions and newSSLOption are * different and none of them are SSL_CLI_USER_PROVIDED. */ log_error("Using both --no-ssl and --ssl-self-signed " "is not supported"); return false; } return true; } /* * cli_getopt_ssl_flags parses the SSL related options from the command line. * * { "ssl-ca-file", required_argument, &ssl_flag, SSL_CA_FILE_FLAG } * { "ssl-crl-file", required_argument, &ssl_flag, SSL_CRL_FILE_FLAG } * { "server-cert", required_argument, &ssl_flag, SSL_SERVER_CRT_FLAG } * { "server-key", required_argument, &ssl_flag, SSL_SERVER_KEY_FLAG } * { "ssl-mode", required_argument, &ssl_flag, SSL_MODE_FLAG }, * * As those options are not using any short option (one-char) variant, they all * fall in the case 0, and we can process them thanks to the global variable * ssl_flag, an int. 
*/ bool cli_getopt_ssl_flags(int ssl_flag, char *optarg, PostgresSetup *pgSetup) { switch (ssl_flag) { case SSL_CA_FILE_FLAG: { strlcpy(pgSetup->ssl.caFile, optarg, MAXPGPATH); log_trace("--ssl-ca-file %s", pgSetup->ssl.caFile); break; } case SSL_CRL_FILE_FLAG: { strlcpy(pgSetup->ssl.crlFile, optarg, MAXPGPATH); log_trace("--ssl-crl-file %s", pgSetup->ssl.crlFile); break; } case SSL_SERVER_CRT_FLAG: { strlcpy(pgSetup->ssl.serverCert, optarg, MAXPGPATH); log_trace("--server-cert %s", pgSetup->ssl.serverCert); break; } case SSL_SERVER_KEY_FLAG: { strlcpy(pgSetup->ssl.serverKey, optarg, MAXPGPATH); log_trace("--server-key %s", pgSetup->ssl.serverKey); break; } case SSL_MODE_FLAG: { strlcpy(pgSetup->ssl.sslModeStr, optarg, SSL_MODE_STRLEN); pgSetup->ssl.sslMode = pgsetup_parse_sslmode(optarg); log_trace("--ssl-mode %s", pgsetup_sslmode_to_string(pgSetup->ssl.sslMode)); if (pgSetup->ssl.sslMode == SSL_MODE_UNKNOWN) { log_fatal("Failed to parse ssl mode \"%s\"", optarg); return false; } break; } default: { log_fatal("BUG: unknown ssl flag value: %d", ssl_flag); return false; } } return true; } /* * cli_common_get_set_pgdata_or_exit gets pgdata from either --pgdata or PGDATA * in the environment, and when we have a value for it, then we set it in the * environment. */ void cli_common_get_set_pgdata_or_exit(PostgresSetup *pgSetup) { /* if --pgdata is not given, fetch PGDATA from the environment or exit */ if (IS_EMPTY_STRING_BUFFER(pgSetup->pgdata)) { get_env_pgdata_or_exit(pgSetup->pgdata); } else { /* from now on want PGDATA set in the environment */ setenv("PGDATA", pgSetup->pgdata, 1); } } /* * keeper_cli_getopt_pgdata gets the PGDATA options or environment variable, * either of those must be set for all of pg_autoctl's commands. This parameter * allows to know which PostgreSQL instance we are the keeper of, and also * allows to determine where is our configuration file. 
*/ int cli_getopt_pgdata(int argc, char **argv) { KeeperConfig options = { 0 }; int c, option_index = 0, errors = 0; int verboseCount = 0; bool printVersion = false; static struct option long_options[] = { { "pgdata", required_argument, NULL, 'D' }, { "json", no_argument, NULL, 'J' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; optind = 0; /* * The only command lines that are using keeper_cli_getopt_pgdata are * terminal ones: they don't accept subcommands. In that case our option * parsing can happen in any order and we don't need getopt_long to behave * in a POSIXLY_CORRECT way. * * The unsetenv() call allows getopt_long() to reorder arguments for us. */ unsetenv("POSIXLY_CORRECT"); while ((c = getopt_long(argc, argv, "D:JVvqh", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'J': { outputJSON = true; log_trace("--json"); break; } case 'V': { /* keeper_cli_print_version prints version and exits. 
*/ printVersion = true; break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } default: { /* getopt_long already wrote an error message */ errors++; break; } } } if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } if (printVersion) { keeper_cli_print_version(argc, argv); } /* now that we have the command line parameters, prepare the options */ (void) prepare_keeper_options(&options); /* publish our option parsing in the global variable */ keeperOptions = options; return optind; } /* * prepare_keeper_options finishes the preparation of the keeperOptions that * hosts the command line options. */ void prepare_keeper_options(KeeperConfig *options) { cli_common_get_set_pgdata_or_exit(&(options->pgSetup)); log_debug("Managing PostgreSQL installation at \"%s\"", options->pgSetup.pgdata); if (!keeper_config_set_pathnames_from_pgdata(&options->pathnames, options->pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } /* * The function keeper_cli_getopt_pgdata is only used by commands needing a * configuration file to already exists: * * - `pg_autoctl do ...` are coded is a way that they don't need a * configuration file, instead using their own command line options * parser, so that test files specify the options on the command line, * making it easier to maintain, * * - `pg_autoctl config|create|run` are using this function * keeper_cli_getopt_pgdata and expect the configuration file to exists. * * A typo in PGDATA might be responsible for a failure that is hard to * understand later, because of the way to derive the configuration * filename from the PGDATA value. So we're going to go a little out of our * way and be helpful to the user. 
*/ if (!file_exists(options->pathnames.config)) { log_fatal("Expected configuration file does not exist: \"%s\"", options->pathnames.config); if (!directory_exists(options->pgSetup.pgdata)) { log_warn("HINT: Check your PGDATA setting: \"%s\"", options->pgSetup.pgdata); } exit(EXIT_CODE_BAD_ARGS); } } /* * set_first_pgctl sets the first pg_ctl found in PATH to given KeeperConfig. */ void set_first_pgctl(PostgresSetup *pgSetup) { /* first, use PG_CONFIG when it exists in the environment */ if (set_pg_ctl_from_PG_CONFIG(pgSetup)) { return; } /* then, use PATH and fetch the first entry there for the monitor */ if (search_path_first("pg_ctl", pgSetup->pg_ctl, LOG_WARN)) { if (!pg_ctl_version(pgSetup)) { /* errors have been logged in pg_ctl_version */ exit(EXIT_CODE_PGCTL); } return; } /* then, use PATH and fetch pg_config --bindir from there */ if (set_pg_ctl_from_pg_config(pgSetup)) { return; } /* at this point we don't have any other ways to find a pg_ctl */ exit(EXIT_CODE_PGCTL); } /* * monitor_init_from_pgsetup might be called either from a monitor or * a keeper node. * * First, see if we are on a keeper node with a configuration file for given * PGDATA. If that's the case, then we'll use the pg_autoctl.monitor_pguri * setting from there to contact the monitor. * * Then, if we failed to get the monitor's uri from a keeper's configuration * file, probe the given PGDATA to see if there's a running PostgreSQL instance * there, and if that's the case consider it's a monitor, and build its * connection string from discovered PostgreSQL parameters. 
*/ bool monitor_init_from_pgsetup(Monitor *monitor, PostgresSetup *pgSetup) { ConfigFilePaths pathnames = { 0 }; if (!keeper_config_set_pathnames_from_pgdata(&pathnames, pgSetup->pgdata)) { /* errors have already been logged */ return false; } switch (ProbeConfigurationFileRole(pathnames.config)) { case PG_AUTOCTL_ROLE_MONITOR: { bool missingPgdataIsOk = false; bool pgIsNotRunningIsOk = false; char connInfo[MAXCONNINFO]; MonitorConfig mconfig = { 0 }; if (!monitor_config_init_from_pgsetup(&mconfig, pgSetup, missingPgdataIsOk, pgIsNotRunningIsOk)) { /* errors have already been logged */ return false; } pg_setup_get_local_connection_string(&mconfig.pgSetup, connInfo); monitor_init(monitor, connInfo); break; } case PG_AUTOCTL_ROLE_KEEPER: { KeeperConfig config = { 0 }; Keeper keeper; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = false; log_trace("monitor_init_from_pgsetup: keeper"); /* * the dereference of pgSetup is safe as it only contains literals, * no pointers. keeper_config_read_file expects pgSetup to be set. */ config.pgSetup = *pgSetup; config.pathnames = pathnames; /* * All we need here is a pg_autoctl.monitor URI to connect to. We * don't need that the local PostgreSQL instance has been created * already. 
*/ if (!keeper_config_read_file(&config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged */ return false; } if (config.monitorDisabled) { log_error("This node has disabled monitor, " "pg_autoctl get and set commands are not available."); return false; } if (!monitor_init(&(keeper.monitor), config.monitor_pguri)) { return false; } *monitor = keeper.monitor; *pgSetup = config.pgSetup; break; } default: { log_fatal("Unrecognized configuration file \"%s\"", pathnames.config); return false; } } return true; } /* * exit_unless_role_is_keeper exits when the configured role for the local node * is not a pg_autoctl keeper, meaning either we fail to parse the * configuration file (maybe it doesn't exists), or we parse it correctly and * pg_autoctl.role is "monitor". */ void exit_unless_role_is_keeper(KeeperConfig *kconfig) { if (!keeper_config_set_pathnames_from_pgdata(&kconfig->pathnames, kconfig->pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } switch (ProbeConfigurationFileRole(kconfig->pathnames.config)) { case PG_AUTOCTL_ROLE_MONITOR: { log_fatal("The command `%s` does not apply to a monitor node.", current_command->breadcrumb); exit(EXIT_CODE_BAD_CONFIG); } case PG_AUTOCTL_ROLE_KEEPER: { /* pg_autoctl.role is as expected, we may continue */ break; } default: { log_fatal("Unrecognized configuration file \"%s\"", kconfig->pathnames.config); exit(EXIT_CODE_BAD_CONFIG); } } } /* * Provide help. */ void keeper_cli_help(int argc, char **argv) { CommandLine command = root; if (env_exists(PG_AUTOCTL_DEBUG)) { command = root_with_debug; } (void) commandline_print_command_tree(&command, stdout); } /* * cli_print_version_getopts parses the CLI options for the pg_autoctl version * command, which are the usual suspects. 
*/ int cli_print_version_getopts(int argc, char **argv) { int c, option_index = 0; static struct option long_options[] = { { "json", no_argument, NULL, 'J' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; optind = 0; /* * The only command lines that are using keeper_cli_getopt_pgdata are * terminal ones: they don't accept subcommands. In that case our option * parsing can happen in any order and we don't need getopt_long to behave * in a POSIXLY_CORRECT way. * * The unsetenv() call allows getopt_long() to reorder arguments for us. */ unsetenv("POSIXLY_CORRECT"); while ((c = getopt_long(argc, argv, "JVvqh", long_options, &option_index)) != -1) { switch (c) { case 'J': { outputJSON = true; log_trace("--json"); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } default: { /* * Ignore errors, ignore most of the things, just print the * version and exit(0) */ break; } } } return optind; } /* * keeper_cli_print_version prints the pg_autoctl version and exits with * successful exit code of zero. 
*/ void keeper_cli_print_version(int argc, char **argv) { const char *version = PG_AUTOCTL_VERSION; if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Object *root = json_value_get_object(js); json_object_set_string(root, "pg_autoctl", version); json_object_set_string(root, "pgautofailover", PG_AUTOCTL_EXTENSION_VERSION); json_object_set_string(root, "pg_major", PG_MAJORVERSION); json_object_set_string(root, "pg_version", PG_VERSION); json_object_set_string(root, "pg_version_str", PG_VERSION_STR); json_object_set_number(root, "pg_version_num", (double) PG_VERSION_NUM); (void) cli_pprint_json(js); } else { fformat(stdout, "pg_autoctl version %s\n", version); fformat(stdout, "pg_autoctl extension version %s\n", PG_AUTOCTL_EXTENSION_VERSION); fformat(stdout, "compiled with %s\n", PG_VERSION_STR); fformat(stdout, "compatible with Postgres 10, 11, 12, 13, and 14\n"); } exit(0); } /* * cli_pprint_json pretty prints the given JSON value to stdout and frees the * JSON related memory. */ void cli_pprint_json(JSON_Value *js) { /* output our nice JSON object, pretty printed please */ char *serialized_string = json_serialize_to_string_pretty(js); fformat(stdout, "%s\n", serialized_string); /* free intermediate memory */ json_free_serialized_string(serialized_string); json_value_free(js); } /* * logLevelToString returns the string to use to enable the same logLevel in a * sub-process. * * enum { LOG_TRACE, LOG_DEBUG, LOG_INFO, LOG_WARN, LOG_ERROR, LOG_FATAL }; */ char * logLevelToString(int logLevel) { switch (logLevel) { case LOG_TRACE: { return "-vvv"; } case LOG_DEBUG: { return "-vv"; } case LOG_WARN: case LOG_INFO: { return "-v"; } case LOG_ERROR: case LOG_FATAL: { return "-q"; } } return ""; } /* * cli_common_pgsetup_init prepares a pgSetup instance from either a keeper or * a monitor configuration file. 
*/ bool cli_common_pgsetup_init(ConfigFilePaths *pathnames, PostgresSetup *pgSetup) { KeeperConfig kconfig = keeperOptions; if (!keeper_config_set_pathnames_from_pgdata(&(kconfig.pathnames), kconfig.pgSetup.pgdata)) { /* errors have already been logged */ return false; } /* copy the pathnames over to the caller */ *pathnames = kconfig.pathnames; switch (ProbeConfigurationFileRole(kconfig.pathnames.config)) { case PG_AUTOCTL_ROLE_MONITOR: { MonitorConfig mconfig = { 0 }; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; if (!monitor_config_init_from_pgsetup(&mconfig, &kconfig.pgSetup, missingPgdataIsOk, pgIsNotRunningIsOk)) { /* errors have already been logged */ return false; } /* copy the pgSetup from the config to the Local Postgres instance */ *pgSetup = mconfig.pgSetup; break; } case PG_AUTOCTL_ROLE_KEEPER: { bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; if (!keeper_config_read_file(&kconfig, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged */ return false; } /* copy the pgSetup from the config to the Local Postgres instance */ *pgSetup = kconfig.pgSetup; break; } default: { log_fatal("Unrecognized configuration file \"%s\"", kconfig.pathnames.config); return false; } } return true; } /* * cli_common_ensure_formation reads the formation name from the configuration * file where it's not been given on the command line. When the local node is a * monitor, the target formation should be found on the command line with the * option --formation, otherwise we default to FORMATION_DEFAULT. */ bool cli_common_ensure_formation(KeeperConfig *options) { /* if --formation has been used, we're good */ if (!IS_EMPTY_STRING_BUFFER(options->formation)) { return true; } /* * When --monitor has been used rather than --pgdata, we are operating at a * distance and we don't expect a configuration file to exist. 
*/ if (IS_EMPTY_STRING_BUFFER(options->pgSetup.pgdata)) { strlcpy(options->formation, FORMATION_DEFAULT, sizeof(options->formation)); return true; } switch (ProbeConfigurationFileRole(options->pathnames.config)) { case PG_AUTOCTL_ROLE_MONITOR: { /* on a monitor node, default to using the "default" formation */ strlcpy(options->formation, FORMATION_DEFAULT, sizeof(options->formation)); break; } case PG_AUTOCTL_ROLE_KEEPER: { KeeperConfig config = { 0 }; bool monitorDisabledIsOk = true; /* copy the pathnames to our temporary config struct */ config.pathnames = options->pathnames; if (!keeper_config_read_file_skip_pgsetup(&config, monitorDisabledIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } strlcpy(options->formation, config.formation, sizeof(options->formation)); log_debug("Using --formation \"%s\"", options->formation); break; } default: { log_fatal("Unrecognized configuration file \"%s\"", options->pathnames.config); exit(EXIT_CODE_INTERNAL_ERROR); } } return true; } /* * cli_pg_autoctl_reload signals the pg_autoctl process to reload its * configuration by sending it the SIGHUP signal. */ bool cli_pg_autoctl_reload(const char *pidfile) { pid_t pid; if (read_pidfile(pidfile, &pid)) { if (pid <= 0) { log_error("Failed to reload pg_autoctl: " "pid file \"%s\" contains negative-or-zero pid %d", pidfile, pid); return false; } if (kill(pid, SIGHUP) != 0) { log_error("Failed to send SIGHUP to the pg_autoctl's pid %d: %m", pid); return false; } } return true; } /* * cli_node_metadata_getopts parses the command line options for the * pg_autoctl set node metadata command. 
*/ int cli_node_metadata_getopts(int argc, char **argv) { KeeperConfig options = { 0 }; int c, option_index = 0, errors = 0; int verboseCount = 0; static struct option long_options[] = { { "pgdata", required_argument, NULL, 'D' }, { "name", required_argument, NULL, 'n' }, { "hostname", required_argument, NULL, 'H' }, { "pgport", required_argument, NULL, 'p' }, { "json", no_argument, NULL, 'J' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; /* set default values for our options, when we have some */ options.groupId = -1; options.network_partition_timeout = -1; options.prepare_promotion_catchup = -1; options.prepare_promotion_walreceiver = -1; options.postgresql_restart_failure_timeout = -1; options.postgresql_restart_failure_max_retries = -1; /* do not set a default formation, it should be found in the config file */ optind = 0; while ((c = getopt_long(argc, argv, "D:n:H:p:JVvqh", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'H': { /* { "hostname", required_argument, NULL, 'h' } */ strlcpy(options.hostname, optarg, _POSIX_HOST_NAME_MAX); log_trace("--hostname %s", options.hostname); break; } case 'n': { /* { "name", required_argument, NULL, 'n' } */ strlcpy(options.name, optarg, _POSIX_HOST_NAME_MAX); log_trace("--name %s", options.name); break; } case 'p': { /* { "pgport", required_argument, NULL, 'p' } */ if (!stringToInt(optarg, &options.pgSetup.pgport)) { log_error("Failed to parse --pgport number \"%s\"", optarg); errors++; } log_trace("--pgport %d", options.pgSetup.pgport); break; } case 'V': { /* keeper_cli_print_version prints version and exits. 
*/ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'J': { outputJSON = true; log_trace("--json"); break; } default: { /* getopt_long already wrote an error message */ errors++; } } } if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } /* * Now, all commands need PGDATA validation. */ cli_common_get_set_pgdata_or_exit(&(options.pgSetup)); /* * We have a PGDATA setting, prepare our configuration pathnames from it. */ if (!keeper_config_set_pathnames_from_pgdata(&(options.pathnames), options.pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } /* publish our option parsing now */ keeperOptions = options; return optind; } /* * cli_get_name_getopts parses the command line options for the command * `pg_autoctl get|set` commands and the `pg_autoctl perform promotion` * command, a list of commands which needs to target a node given by name. 
*/ int cli_get_name_getopts(int argc, char **argv) { KeeperConfig options = { 0 }; int c, option_index = 0, errors = 0; int verboseCount = 0; static struct option long_options[] = { { "pgdata", required_argument, NULL, 'D' }, { "monitor", required_argument, NULL, 'm' }, { "formation", required_argument, NULL, 'f' }, { "name", required_argument, NULL, 'a' }, { "json", no_argument, NULL, 'J' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; /* set default values for our options, when we have some */ options.groupId = -1; options.network_partition_timeout = -1; options.prepare_promotion_catchup = -1; options.prepare_promotion_walreceiver = -1; options.postgresql_restart_failure_timeout = -1; options.postgresql_restart_failure_max_retries = -1; optind = 0; /* * The only command lines that are using keeper_cli_getopt_pgdata are * terminal ones: they don't accept subcommands. In that case our option * parsing can happen in any order and we don't need getopt_long to behave * in a POSIXLY_CORRECT way. * * The unsetenv() call allows getopt_long() to reorder arguments for us. 
*/ unsetenv("POSIXLY_CORRECT"); while ((c = getopt_long(argc, argv, "D:f:g:n:Vvqh", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'm': { if (!validate_connection_string(optarg)) { log_fatal("Failed to parse --monitor connection string, " "see above for details."); exit(EXIT_CODE_BAD_ARGS); } strlcpy(options.monitor_pguri, optarg, MAXCONNINFO); log_trace("--monitor %s", options.monitor_pguri); break; } case 'f': { strlcpy(options.formation, optarg, NAMEDATALEN); log_trace("--formation %s", options.formation); break; } case 'a': { /* { "name", required_argument, NULL, 'a' }, */ strlcpy(options.name, optarg, _POSIX_HOST_NAME_MAX); log_trace("--name %s", options.name); break; } case 'V': { /* keeper_cli_print_version prints version and exits. */ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } case 'J': { outputJSON = true; log_trace("--json"); break; } default: { /* getopt_long already wrote an error message */ errors++; } } } if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } /* now that we have the command line parameters, prepare the options */ /* when we have a monitor URI we don't need PGDATA */ if (cli_use_monitor_option(&options)) { if (!IS_EMPTY_STRING_BUFFER(options.pgSetup.pgdata)) { log_warn("Given --monitor URI, the --pgdata option is ignored"); log_info("Connecting to monitor at \"%s\"", options.monitor_pguri); /* the rest of the program needs pgdata actually empty */ bzero((void *) options.pgSetup.pgdata, sizeof(options.pgSetup.pgdata)); } } else { (void) prepare_keeper_options(&options); } /* 
ensure --formation, or get it from the configuration file */ if (!cli_common_ensure_formation(&options)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } /* publish our option parsing in the global variable */ keeperOptions = options; return optind; } /* * cli_use_monitor_option returns true when the --monitor option should be * used, or when PG_AUTOCTL_MONITOR has been set in the environment. In that * case the options->monitor_pguri is also set to the value found in the * environment. */ bool cli_use_monitor_option(KeeperConfig *options) { /* if --monitor is used, then use it */ if (!IS_EMPTY_STRING_BUFFER(options->monitor_pguri)) { return true; } /* otherwise, have a look at the PG_AUTOCTL_MONITOR environment variable */ if (env_exists(PG_AUTOCTL_MONITOR) && get_env_copy(PG_AUTOCTL_MONITOR, options->monitor_pguri, sizeof(options->monitor_pguri)) && !IS_EMPTY_STRING_BUFFER(options->monitor_pguri)) { log_debug("Using environment PG_AUTOCTL_MONITOR \"%s\"", options->monitor_pguri); return true; } /* * Still nothing? well don't use --monitor then. * * Now, on commands that are compatible with using just a monitor and no * local pg_autoctl node, we want to include an error message about the * lack of a --monitor when we also lack --pgdata. */ if (IS_EMPTY_STRING_BUFFER(options->pgSetup.pgdata) && !env_exists("PGDATA")) { log_error("Failed to get value for environment variable '%s', " "which is unset", PG_AUTOCTL_MONITOR); log_warn("This command also supports the --monitor option, which " "is not used here"); } return false; } /* * cli_monitor_init_from_option_or_config initialises a monitor connection * either from the --monitor Postgres URI given on the command line, or from * the configuration file of the local node (monitor or keeper). 
*/ void cli_monitor_init_from_option_or_config(Monitor *monitor, KeeperConfig *kconfig) { if (IS_EMPTY_STRING_BUFFER(kconfig->monitor_pguri)) { if (!monitor_init_from_pgsetup(monitor, &(kconfig->pgSetup))) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } } else { if (!monitor_init(monitor, kconfig->monitor_pguri)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } } } /* * cli_ensure_node_name ensures that we have a node name to continue with, * either from the command line itself, or from the configuration file when * we're dealing with a keeper node. */ void cli_ensure_node_name(Keeper *keeper) { /* if we have a --name option, we're done already */ if (!IS_EMPTY_STRING_BUFFER(keeper->config.name)) { return; } /* we might have --monitor instead of --pgdata */ if (IS_EMPTY_STRING_BUFFER(keeper->config.pgSetup.pgdata)) { log_fatal("Please use either --name or --pgdata " "to target a specific node"); exit(EXIT_CODE_BAD_ARGS); } switch (ProbeConfigurationFileRole(keeper->config.pathnames.config)) { case PG_AUTOCTL_ROLE_MONITOR: { log_fatal("Please use --name to target a specific node"); exit(EXIT_CODE_BAD_ARGS); break; } case PG_AUTOCTL_ROLE_KEEPER: { bool monitorDisabledIsOk = false; if (!keeper_config_read_file_skip_pgsetup(&(keeper->config), monitorDisabledIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } break; } default: { log_fatal("Unrecognized configuration file \"%s\"", keeper->config.pathnames.config); exit(EXIT_CODE_INTERNAL_ERROR); } } } /* * cli_set_groupId sets the kconfig.groupId depending on the --group argument * given on the command line, and if that was not given then figures it out: * * - it could be that we have a single group in the formation, in that case * --group must be zero, so we set it that way, * * - we may have a local keeper node setup thanks to --pgdata, in that case * read the configuration file and grab the groupId from there. 
*/ void cli_set_groupId(Monitor *monitor, KeeperConfig *kconfig) { int groupsCount = 0; if (!monitor_count_groups(monitor, kconfig->formation, &groupsCount)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } if (groupsCount == 0) { /* nothing to be done here */ log_fatal("The monitor currently has no Postgres nodes " "registered in formation \"%s\"", kconfig->formation); exit(EXIT_CODE_BAD_STATE); } /* * When --group was not given, we may proceed when there is only one * possible target group in the formation, which is the case with Postgres * standalone setups. */ if (kconfig->groupId == -1) { /* * When --group is not given and we have a keeper node, we can grab a * default from the configuration file. We have to support the usage * either --monitor or --pgdata. We have a local keeper node/role only * when we have been given --pgdata. */ if (!IS_EMPTY_STRING_BUFFER(kconfig->pgSetup.pgdata)) { pgAutoCtlNodeRole role = ProbeConfigurationFileRole(kconfig->pathnames.config); if (role == PG_AUTOCTL_ROLE_KEEPER) { const bool missingPgdataIsOk = true; const bool pgIsNotRunningIsOk = true; const bool monitorDisabledIsOk = false; if (!keeper_config_read_file(kconfig, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } log_info("Targetting group %d in formation \"%s\"", kconfig->groupId, kconfig->formation); } } } /* * We tried to see if we have a local keeper configuration to grab the * groupId from, what if we don't have a local setup, or the local setup is * not a keeper role. 
*/ if (kconfig->groupId == -1) { if (groupsCount == 1) { /* we have only one group, it's group number zero, proceed */ kconfig->groupId = 0; kconfig->pgSetup.pgKind = NODE_KIND_STANDALONE; } else { log_error("Please use the --group option to target a " "specific group in formation \"%s\"", kconfig->formation); exit(EXIT_CODE_BAD_ARGS); } } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_common.h000066400000000000000000000161201414244367200230450ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_common.h * Implementation of a CLI which lets you run individual keeper routines * directly * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef CLI_COMMON_H #define CLI_COMMON_H #include #include "keeper.h" #include "keeper_config.h" #include "monitor.h" #include "monitor_config.h" extern MonitorConfig monitorOptions; extern KeeperConfig keeperOptions; extern bool createAndRun; extern bool outputJSON; extern bool openAppHBAonLAN; extern bool dropAndDestroy; #define SSL_CA_FILE_FLAG 1 /* root public certificate */ #define SSL_CRL_FILE_FLAG 2 /* certificates revocation list */ #define SSL_SERVER_CRT_FLAG 3 /* server.key (private key) */ #define SSL_SERVER_KEY_FLAG 4 /* server.crt (public certificate) */ #define SSL_MODE_FLAG 5 /* client side sslmode for connection strings */ extern int ssl_flag; extern int monitorDisabledNodeId; #define KEEPER_CLI_SSL_OPTIONS \ " --ssl-self-signed setup network encryption using self signed certificates (does NOT protect against MITM)\n" \ " --ssl-mode use that sslmode in connection strings\n" \ " --ssl-ca-file set the Postgres ssl_ca_file to that file path\n" \ " --ssl-crl-file set the Postgres ssl_crl_file to that file path\n" \ " --no-ssl don't enable network encryption (NOT recommended, prefer --ssl-self-signed)\n" \ " --server-key set the Postgres ssl_key_file to that file path\n" \ " --server-cert set the Postgres ssl_cert_file to that file path\n" #define 
KEEPER_CLI_WORKER_SETUP_OPTIONS \ " --pgctl path to pg_ctl\n" \ " --pgdata path to data directory\n" \ " --pghost PostgreSQL's hostname\n" \ " --pgport PostgreSQL's port number\n" \ " --listen PostgreSQL's listen_addresses\n" \ " --username PostgreSQL's username\n" \ " --dbname PostgreSQL's database name\n" \ " --proxyport Proxy's port number\n" \ " --name pg_auto_failover node name\n" \ " --hostname hostname used to connect from the other nodes\n" \ " --formation pg_auto_failover formation\n" \ " --group pg_auto_failover group Id\n" \ " --monitor pg_auto_failover Monitor Postgres URL\n" \ KEEPER_CLI_SSL_OPTIONS #define KEEPER_CLI_NON_WORKER_SETUP_OPTIONS \ " --pgctl path to pg_ctl\n" \ " --pgdata path to data directory\n" \ " --pghost PostgreSQL's hostname\n" \ " --pgport PostgreSQL's port number\n" \ " --listen PostgreSQL's listen_addresses\n" \ " --username PostgreSQL's username\n" \ " --dbname PostgreSQL's database name\n" \ " --name pg_auto_failover node name\n" \ " --hostname hostname used to connect from the other nodes\n" \ " --formation pg_auto_failover formation\n" \ " --group pg_auto_failover group Id\n" \ " --monitor pg_auto_failover Monitor Postgres URL\n" \ KEEPER_CLI_SSL_OPTIONS #define CLI_PGDATA_OPTION \ " --pgdata path to data directory\n" \ #define CLI_PGDATA_USAGE " [ --pgdata ] [ --json ] " /* cli_do.c */ extern CommandLine do_commands; /* cli_config.c */ extern CommandLine config_commands; /* cli_create_drop_node.c */ extern CommandLine create_monitor_command; extern CommandLine create_postgres_command; extern CommandLine drop_node_command; extern CommandLine drop_monitor_command; extern CommandLine destroy_command; /* cli_get_set_properties.c */ extern CommandLine get_commands; extern CommandLine set_commands; /* cli_enable_disable.c */ extern CommandLine enable_commands; extern CommandLine disable_commands; /* cli_formation.c */ extern CommandLine create_formation_command; extern CommandLine drop_formation_command; /* cli_perform.c */ extern 
CommandLine perform_failover_command; extern CommandLine perform_switchover_command; extern CommandLine *perform_subcommands[]; extern CommandLine perform_commands; /* cli_service.c */ extern CommandLine service_run_command; extern CommandLine service_stop_command; extern CommandLine service_reload_command; extern CommandLine service_status_command; /* cli_show.c */ extern CommandLine show_uri_command; extern CommandLine show_events_command; extern CommandLine show_state_command; extern CommandLine show_settings_command; extern CommandLine show_file_command; extern CommandLine show_standby_names_command; /* cli_watch.c */ extern CommandLine watch_command; /* cli_systemd.c */ extern CommandLine systemd_cat_service_file_command; /* * Handling SSL options on the command line and their inter-compatibility is a * little complex. */ typedef enum { SSL_CLI_UNKNOWN = 0, SSL_CLI_NO_SSL, SSL_CLI_SELF_SIGNED, SSL_CLI_USER_PROVIDED } SSLCommandLineOptions; void keeper_cli_help(int argc, char **argv); int cli_print_version_getopts(int argc, char **argv); void keeper_cli_print_version(int argc, char **argv); void cli_pprint_json(JSON_Value *js); void cli_common_get_set_pgdata_or_exit(PostgresSetup *pgSetup); int cli_common_keeper_getopts(int argc, char **argv, struct option *long_options, const char *optstring, KeeperConfig *options, SSLCommandLineOptions *sslCommandLineOptions); int cli_create_node_getopts(int argc, char **argv, struct option *long_options, const char *optstring, KeeperConfig *options); int cli_getopt_pgdata(int argc, char **argv); void prepare_keeper_options(KeeperConfig *options); void set_first_pgctl(PostgresSetup *pgSetup); bool monitor_init_from_pgsetup(Monitor *monitor, PostgresSetup *pgSetup); void exit_unless_role_is_keeper(KeeperConfig *kconfig); void cli_set_groupId(Monitor *monitor, KeeperConfig *kconfig); /* cli_create_drop_node.c */ bool cli_create_config(Keeper *keeper); void cli_create_pg(Keeper *keeper); bool 
check_or_discover_hostname(KeeperConfig *config); int cli_drop_node_getopts(int argc, char **argv); void cli_drop_node(int argc, char **argv); void keeper_cli_destroy_node(int argc, char **argv); void cli_drop_node_from_monitor(KeeperConfig *config, int64_t *nodeId, int *groupId); void cli_drop_local_node(KeeperConfig *config, bool dropAndDestroy); bool cli_getopt_ssl_flags(int ssl_flag, char *optarg, PostgresSetup *pgSetup); bool cli_getopt_accept_ssl_options(SSLCommandLineOptions newSSLOption, SSLCommandLineOptions currentSSLOptions); char * logLevelToString(int logLevel); bool cli_common_pgsetup_init(ConfigFilePaths *pathnames, PostgresSetup *pgSetup); bool cli_common_ensure_formation(KeeperConfig *options); bool cli_pg_autoctl_reload(const char *pidfile); int cli_node_metadata_getopts(int argc, char **argv); int cli_get_name_getopts(int argc, char **argv); bool cli_use_monitor_option(KeeperConfig *options); void cli_monitor_init_from_option_or_config(Monitor *monitor, KeeperConfig *kconfig); void cli_ensure_node_name(Keeper *keeper); bool discover_hostname(char *hostname, int size, const char *monitorHostname, int monitorPort); /* cli_get_set_properties.c */ void cli_get_formation_settings(int argc, char **argv); #endif /* CLI_COMMON_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_config.c000066400000000000000000000412641414244367200230240ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_config.c * Implementation of pg_autoctl config CLI sub-commands. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include "postgres_fe.h" #include "cli_common.h" #include "commandline.h" #include "defaults.h" #include "ini_file.h" #include "keeper_config.h" #include "keeper.h" #include "monitor.h" #include "monitor_config.h" #include "pidfile.h" static void cli_config_check(int argc, char **argv); static void cli_config_check_pgsetup(PostgresSetup *pgSetup); static void cli_config_check_connections(PostgresSetup *pgSetup, const char *monitor_pguri); static void cli_config_get(int argc, char **argv); static void cli_keeper_config_get(int argc, char **argv); static void cli_monitor_config_get(int argc, char **argv); static void cli_config_set(int argc, char **argv); static void cli_keeper_config_set(int argc, char **argv); static void cli_monitor_config_set(int argc, char **argv); static CommandLine config_check = make_command("check", "Check pg_autoctl configuration", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_config_check); static CommandLine config_get = make_command("get", "Get the value of a given pg_autoctl configuration variable", CLI_PGDATA_USAGE "[ section.option ]", CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_config_get); static CommandLine config_set = make_command("set", "Set the value of a given pg_autoctl configuration variable", CLI_PGDATA_USAGE "section.option [ value ]", CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_config_set); static CommandLine *config[] = { &config_check, &config_get, &config_set, NULL }; CommandLine config_commands = make_command_set("config", "Manages the pg_autoctl configuration", NULL, NULL, NULL, config); /* * cli_config_check reads a configuration file and debug its content as * DEBUG messages. 
*/ static void cli_config_check(int argc, char **argv) { const bool missingPgdataIsOk = true; const bool pgIsNotRunningIsOk = true; const bool monitorDisabledIsOk = true; KeeperConfig config = keeperOptions; if (!keeper_config_set_pathnames_from_pgdata(&config.pathnames, config.pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } switch (ProbeConfigurationFileRole(config.pathnames.config)) { case PG_AUTOCTL_ROLE_MONITOR: { bool missingPgDataIsOk = true; MonitorConfig mconfig = { 0 }; if (!monitor_config_init_from_pgsetup(&mconfig, &config.pgSetup, missingPgdataIsOk, pgIsNotRunningIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (!pg_controldata(&(mconfig.pgSetup), missingPgDataIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_PGCTL); } (void) cli_config_check_pgsetup(&(mconfig.pgSetup)); (void) cli_config_check_connections(&(mconfig.pgSetup), NULL); if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Value *jsPostgres = json_value_init_object(); JSON_Value *jsMConfig = json_value_init_object(); JSON_Object *root = json_value_get_object(js); /* prepare both JSON objects */ if (!pg_setup_as_json(&(mconfig.pgSetup), jsPostgres)) { /* can't happen */ exit(EXIT_CODE_INTERNAL_ERROR); } if (!monitor_config_to_json(&mconfig, jsMConfig)) { log_fatal("Failed to serialize monitor configuration to JSON"); exit(EXIT_CODE_BAD_CONFIG); } /* concatenate JSON objects into a container object */ json_object_set_value(root, "postgres", jsPostgres); json_object_set_value(root, "config", jsMConfig); (void) cli_pprint_json(js); } else { fprintf_pg_setup(stdout, &(mconfig.pgSetup)); } break; } case PG_AUTOCTL_ROLE_KEEPER: { bool missingPgDataIsOk = true; if (!keeper_config_read_file(&config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (!pg_controldata(&(config.pgSetup), missingPgDataIsOk)) { /* errors have 
already been logged */ exit(EXIT_CODE_PGCTL); } (void) cli_config_check_pgsetup(&(config.pgSetup)); (void) cli_config_check_connections( &(config.pgSetup), config.monitorDisabled ? NULL : config.monitor_pguri); if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Value *jsPostgres = json_value_init_object(); JSON_Value *jsKConfig = json_value_init_object(); JSON_Object *root = json_value_get_object(js); /* prepare both JSON objects */ if (!pg_setup_as_json(&(config.pgSetup), jsPostgres)) { /* can't happen */ exit(EXIT_CODE_INTERNAL_ERROR); } if (!keeper_config_to_json(&config, jsKConfig)) { log_fatal("Failed to serialize monitor configuration to JSON"); exit(EXIT_CODE_BAD_CONFIG); } /* concatenate JSON objects into a container object */ json_object_set_value(root, "postgres", jsPostgres); json_object_set_value(root, "config", jsKConfig); (void) cli_pprint_json(js); } else { fprintf_pg_setup(stdout, &(config.pgSetup)); } break; } default: { log_fatal("Unrecognized configuration file \"%s\"", config.pathnames.config); exit(EXIT_CODE_INTERNAL_ERROR); } } } /* * cli_keeper_config_check checks a keeper configuration file. */ static void cli_config_check_pgsetup(PostgresSetup *pgSetup) { int errors = 0; /* * Now check for errors. Rather than using the generic missing_pgdata_is_ok * and pg_not_running_is_ok facility, we do our own error checking here. * One reason is that this command line doesn't provide support for * --pgport and other options, on purpose. Another reason is that we want * to check for everything rather than fail fast. */ char globalControlPath[MAXPGPATH] = { 0 }; /* globalControlFilePath = $PGDATA/global/pg_control */ join_path_components(globalControlPath, pgSetup->pgdata, "global/pg_control"); if (!file_exists(globalControlPath)) { errors++; log_error("postgresql.pgdata does not belong to a PostgreSQL cluster: " "\"%s\"", pgSetup->pgdata); } /* when PostgreSQL is running, pg_setup_init() has connected to it. 
*/ if (pgSetup->pidFile.pid == 0) { errors++; log_error("PostgreSQL is not running"); } /* TODO: check formation, group, hostname on the monitor */ if (errors > 0) { exit(EXIT_CODE_BAD_CONFIG); } log_info("Postgres setup for PGDATA \"%s\" is ok, " "running with PID %d and port %d", pgSetup->pgdata, pgSetup->pidFile.port, pgSetup->pidFile.pid); } /* * cli_config_check_connections checks that the following three connections are * possible: * * 1. connection to the local Postgres server * 2. connection to the Postgres monitor * 3. streaming replication connection string */ static void cli_config_check_connections(PostgresSetup *pgSetup, const char *monitor_pguri) { PGSQL pgsql = { 0 }; char connInfo[MAXCONNINFO] = { 0 }; bool settings_are_ok = false; Monitor monitor = { 0 }; MonitorExtensionVersion version = { 0 }; pg_setup_get_local_connection_string(pgSetup, connInfo); pgsql_init(&pgsql, connInfo, PGSQL_CONN_LOCAL); if (!pgsql_is_in_recovery(&pgsql, &pgSetup->is_in_recovery)) { /* errors have already been logged */ exit(EXIT_CODE_PGSQL); } log_info("Connection to local Postgres ok, using \"%s\"", connInfo); /* * Do not check settings on the monitor node itself. On the monitor, we * don't have a monitor_pguri in the config. */ if (monitor_pguri == NULL) { return; } /* * Check that the Postgres settings for pg_auto_failover are active in the * running Postgres instance. 
*/ if (!pgsql_check_postgresql_settings(&pgsql, false, &settings_are_ok)) { /* errors have already been logged */ exit(EXIT_CODE_PGSQL); } if (settings_are_ok) { log_info("Postgres configuration settings required " "for pg_auto_failover are ok"); } else { log_warn("Failed to check required settings for pg_auto_failover, " "please review your Postgres configuration"); } if (pg_setup_standby_slot_supported(pgSetup, LOG_WARN)) { log_info("Postgres version \"%s\" allows using replication slots " "on the standby nodes", pgSetup->pg_version); } else { log_warn("Postgres version \"%s\" DOES NOT allow using replication " "slots on the standby nodes", pgSetup->pg_version); } /* * Now, on Postgres nodes, check that the monitor uri is valid and that we * can connect to the monitor just fine. This requires having setup the * Postgres HBA rules correctly, which is up to the user when using * --skip-pg-hba. */ if (!monitor_init(&monitor, (char *) monitor_pguri)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } if (!monitor_get_extension_version(&monitor, &version)) { log_fatal("Failed to check version compatibility with the monitor " "extension \"%s\", see above for details", PG_AUTOCTL_MONITOR_EXTENSION_NAME); exit(EXIT_CODE_MONITOR); } /* disconnect from the monitor now */ pgsql_finish(&(monitor.pgsql)); log_info("Connection to monitor ok, using \"%s\"", monitor_pguri); if (strcmp(version.installedVersion, PG_AUTOCTL_EXTENSION_VERSION) == 0) { log_info("Monitor is running version \"%s\", as expected", version.installedVersion); } else { log_info("Monitor is running version \"%s\" " "instead of expected version \"%s\"", version.installedVersion, PG_AUTOCTL_EXTENSION_VERSION); log_warn("Please connect to the monitor node and restart pg_autoctl."); } /* TODO: check streaming replication connections */ } /* * cli_keeper_config_get retrieves the value of a given configuration value, * supporting either a Keeper or a Monitor configuration file. 
*/ static void cli_config_get(int argc, char **argv) { KeeperConfig config = keeperOptions; if (!keeper_config_set_pathnames_from_pgdata(&config.pathnames, config.pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } switch (ProbeConfigurationFileRole(config.pathnames.config)) { case PG_AUTOCTL_ROLE_MONITOR: { (void) cli_monitor_config_get(argc, argv); break; } case PG_AUTOCTL_ROLE_KEEPER: { (void) cli_keeper_config_get(argc, argv); break; } default: { log_fatal("Unrecognized configuration file \"%s\"", config.pathnames.config); exit(EXIT_CODE_INTERNAL_ERROR); } } } /* * keeper_cli_config_get returns the value of a given section.option, or prints * out the whole file to stdout when no argument has been given. */ static void cli_keeper_config_get(int argc, char **argv) { KeeperConfig config = keeperOptions; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; switch (argc) { case 0: { /* no argument, write the config out */ if (!keeper_config_read_file(&config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { exit(EXIT_CODE_PGCTL); } else { if (outputJSON) { JSON_Value *js = json_value_init_object(); if (!keeper_config_to_json(&config, js)) { log_fatal("Failed to serialize configuration to JSON"); exit(EXIT_CODE_BAD_CONFIG); } (void) cli_pprint_json(js); } else { keeper_config_write(stdout, &config); fformat(stdout, "\n"); } } break; } case 1: { /* single argument, find the option and display its value */ char *path = argv[0]; char value[BUFSIZE]; if (keeper_config_get_setting(&config, path, value, BUFSIZE)) { fformat(stdout, "%s\n", value); } else { log_error("Failed to lookup option %s", path); exit(EXIT_CODE_BAD_ARGS); } break; } default: { /* we only support 0 or 1 argument */ commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } } } /* * keeper_cli_config_get returns the value of a given section.option, or prints * out the whole file to stdout when no argument has been 
given. */ static void cli_monitor_config_get(int argc, char **argv) { MonitorConfig mconfig = { 0 }; KeeperConfig kconfig = keeperOptions; bool missing_pgdata_is_ok = true; bool pg_is_not_running_is_ok = true; if (!monitor_config_init_from_pgsetup(&mconfig, &kconfig.pgSetup, missing_pgdata_is_ok, pg_is_not_running_is_ok)) { exit(EXIT_CODE_PGCTL); } switch (argc) { case 0: { if (outputJSON) { JSON_Value *js = json_value_init_object(); if (!monitor_config_to_json(&mconfig, js)) { log_fatal("Failed to serialize configuration to JSON"); exit(EXIT_CODE_BAD_CONFIG); } (void) cli_pprint_json(js); } else { monitor_config_write(stdout, &mconfig); fformat(stdout, "\n"); } break; } case 1: { /* single argument, find the option and display its value */ char *path = argv[0]; char value[BUFSIZE]; if (monitor_config_get_setting(&mconfig, path, value, BUFSIZE)) { fformat(stdout, "%s\n", value); } else { log_error("Failed to lookup option %s", path); exit(EXIT_CODE_BAD_ARGS); } break; } default: { /* we only support 0 or 1 argument */ commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } } } /* * cli_config_set sets the value of a given configuration value, * supporting either a Keeper or a Monitor configuration file. 
*/ static void cli_config_set(int argc, char **argv) { KeeperConfig config = keeperOptions; if (!keeper_config_set_pathnames_from_pgdata(&config.pathnames, config.pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } switch (ProbeConfigurationFileRole(config.pathnames.config)) { case PG_AUTOCTL_ROLE_MONITOR: { (void) cli_monitor_config_set(argc, argv); break; } case PG_AUTOCTL_ROLE_KEEPER: { (void) cli_keeper_config_set(argc, argv); break; } default: { log_fatal("Unrecognized configuration file \"%s\"", config.pathnames.config); exit(EXIT_CODE_INTERNAL_ERROR); } } if (!cli_pg_autoctl_reload(config.pathnames.pid)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } } /* * cli_keeper_config_set sets the given option path to the given value. */ static void cli_keeper_config_set(int argc, char **argv) { KeeperConfig config = keeperOptions; if (argc != 2) { log_error("Two arguments are expected, found %d", argc); exit(EXIT_CODE_BAD_ARGS); } else { /* we print out the value that we parsed, as a double-check */ char value[BUFSIZE]; if (!keeper_config_set_setting(&config, argv[0], argv[1])) { /* we already logged about it */ exit(EXIT_CODE_BAD_CONFIG); } /* first write the new configuration settings to file */ if (!keeper_config_write_file(&config)) { log_fatal("Failed to write pg_autoctl configuration file \"%s\", " "see above for details", config.pathnames.config); exit(EXIT_CODE_BAD_CONFIG); } /* now read the value from just written file */ if (keeper_config_get_setting(&config, argv[0], value, BUFSIZE)) { fformat(stdout, "%s\n", value); } else { log_error("Failed to lookup option %s", argv[0]); exit(EXIT_CODE_BAD_ARGS); } } } /* * cli_monitor_config_set sets the given option path to the given value. 
*/ static void cli_monitor_config_set(int argc, char **argv) { if (argc != 2) { log_error("Two arguments are expected, found %d", argc); exit(EXIT_CODE_BAD_ARGS); } else { /* we print out the value that we parsed, as a double-check */ char value[BUFSIZE]; MonitorConfig mconfig = { 0 }; mconfig.pgSetup = keeperOptions.pgSetup; if (!monitor_config_set_pathnames_from_pgdata(&mconfig)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } /* first write the new configuration settings to file */ if (!monitor_config_set_setting(&mconfig, argv[0], argv[1])) { /* we already logged about it */ exit(EXIT_CODE_BAD_CONFIG); } if (!monitor_config_write_file(&mconfig)) { log_fatal("Failed to write the monitor's configuration file, " "see above"); exit(EXIT_CODE_BAD_CONFIG); } /* now read the value from just written file */ if (monitor_config_get_setting(&mconfig, argv[0], value, BUFSIZE)) { fformat(stdout, "%s\n", value); } else { log_error("Failed to lookup option %s", argv[0]); exit(EXIT_CODE_BAD_ARGS); } } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_create_node.c000066400000000000000000000725271414244367200240350ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_create_node.c * Implementation of the pg_autoctl create and pg_autoctl drop CLI for the * pg_auto_failover nodes (monitor, coordinator, worker, postgres). * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include #include "postgres_fe.h" #include "cli_common.h" #include "commandline.h" #include "env_utils.h" #include "defaults.h" #include "fsm.h" #include "ini_file.h" #include "ipaddr.h" #include "keeper_config.h" #include "keeper_pg_init.h" #include "keeper.h" #include "monitor.h" #include "monitor_config.h" #include "monitor_pg_init.h" #include "pgctl.h" #include "pghba.h" #include "pidfile.h" #include "primary_standby.h" #include "service_keeper.h" #include "service_keeper_init.h" #include "service_monitor.h" #include "service_monitor_init.h" #include "string_utils.h" /* * Global variables that we're going to use to "communicate" in between getopts * functions and their command implementation. We can't pass parameters around. */ MonitorConfig monitorOptions = { 0 }; static int cli_create_postgres_getopts(int argc, char **argv); static void cli_create_postgres(int argc, char **argv); static int cli_create_monitor_getopts(int argc, char **argv); static void cli_create_monitor(int argc, char **argv); static void check_hostname(const char *hostname); CommandLine create_monitor_command = make_command( "monitor", "Initialize a pg_auto_failover monitor node", " [ --pgdata --pgport --pgctl --hostname ] ", " --pgctl path to pg_ctl\n" " --pgdata path to data directory\n" " --pgport PostgreSQL's port number\n" " --hostname hostname by which postgres is reachable\n" " --auth authentication method for connections from data nodes\n" " --skip-pg-hba skip editing pg_hba.conf rules\n" " --run create node then run pg_autoctl service\n" KEEPER_CLI_SSL_OPTIONS, cli_create_monitor_getopts, cli_create_monitor); CommandLine create_postgres_command = make_command( "postgres", "Initialize a pg_auto_failover standalone postgres node", "", " --pgctl path to pg_ctl\n" " --pgdata path to data directory\n" " --pghost PostgreSQL's hostname\n" " --pgport PostgreSQL's port number\n" " --listen PostgreSQL's listen_addresses\n" " --username 
PostgreSQL's username\n" " --dbname PostgreSQL's database name\n" " --name pg_auto_failover node name\n" " --hostname hostname used to connect from the other nodes\n" " --formation pg_auto_failover formation\n" " --monitor pg_auto_failover Monitor Postgres URL\n" " --auth authentication method for connections from monitor\n" " --skip-pg-hba skip editing pg_hba.conf rules\n" " --pg-hba-lan edit pg_hba.conf rules for --dbname in detected LAN\n" KEEPER_CLI_SSL_OPTIONS " --candidate-priority priority of the node to be promoted to become primary\n" " --replication-quorum true if node participates in write quorum\n" " --maximum-backup-rate maximum transfer rate of data transferred from the server during initial sync\n", cli_create_postgres_getopts, cli_create_postgres); /* * cli_create_config manages the whole set of configuration parameters that * pg_autoctl accepts and deals with either creating a configuration file if * necessary, or merges the command line arguments into the pre-existing * configuration file. */ bool cli_create_config(Keeper *keeper) { KeeperConfig *config = &(keeper->config); bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; /* * We support two modes of operations here: * - configuration exists already, we need PGDATA * - configuration doesn't exist already, we need PGDATA, and more */ if (file_exists(config->pathnames.config)) { Monitor *monitor = &(keeper->monitor); KeeperConfig options = { 0 }; KeeperConfig oldConfig = { 0 }; PostgresSetup optionsFullPgSetup = { 0 }; if (!keeper_config_read_file(config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { log_fatal("Failed to read configuration file \"%s\"", config->pathnames.config); exit(EXIT_CODE_BAD_CONFIG); } oldConfig = *config; options = *config; /* * Before merging command line options into the (maybe) pre-existing * configuration file, we should also mix in the environment variables * values in the command line options. 
*/ if (!pg_setup_init(&optionsFullPgSetup, &(options.pgSetup), missingPgdataIsOk, pgIsNotRunningIsOk)) { exit(EXIT_CODE_BAD_ARGS); } options.pgSetup = optionsFullPgSetup; /* * Now that we have loaded the configuration file, apply the command * line options on top of it, giving them priority over the config. */ if (!keeper_config_merge_options(config, &options)) { /* errors have been logged already */ exit(EXIT_CODE_BAD_CONFIG); } /* * If we have registered to the monitor already, then we need to check * if the user is providing new --nodename, --hostname, or --pgport * arguments. After all, they may change their mind of have just * realized that the --pgport they wanted to use is already in use. */ if (!config->monitorDisabled) { if (!monitor_init(monitor, config->monitor_pguri)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } if (file_exists(config->pathnames.state)) { /* * Handle the node metadata options: --name, --hostname, * --pgport. * * When those options have been used, then the configuration * file has been merged with the command line values, and we * can update the metadata for this node on the monitor. */ if (!keeper_set_node_metadata(keeper, &oldConfig)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } /* * Now, at 1.3 to 1.4 upgrade, the monitor assigns a new name to * pg_autoctl nodes, which did not use to have a name before. In * that case, and then pg_autoctl run has been used without * options, our name might be empty here. We then need to fetch * it from the monitor. */ if (!keeper_update_nodename_from_monitor(keeper)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } } } } else { /* set our KeeperConfig from the command line options now. 
*/ (void) keeper_config_init(config, missingPgdataIsOk, pgIsNotRunningIsOk); /* and write our brand new setup to file */ if (!keeper_config_write_file(config)) { log_fatal("Failed to write the pg_autoctl configuration file, " "see above"); exit(EXIT_CODE_BAD_CONFIG); } } return true; } /* * cli_pg_create calls keeper_pg_init where all the magic happens. */ void cli_create_pg(Keeper *keeper) { if (!keeper_pg_init(keeper)) { /* errors have been logged */ exit(EXIT_CODE_BAD_STATE); } } /* * cli_create_postgres_getopts parses command line options and set the global * variable keeperOptions from them, without doing any check. */ static int cli_create_postgres_getopts(int argc, char **argv) { KeeperConfig options = { 0 }; static struct option long_options[] = { { "pgctl", required_argument, NULL, 'C' }, { "pgdata", required_argument, NULL, 'D' }, { "pghost", required_argument, NULL, 'H' }, { "pgport", required_argument, NULL, 'p' }, { "listen", required_argument, NULL, 'l' }, { "username", required_argument, NULL, 'U' }, { "auth", required_argument, NULL, 'A' }, { "skip-pg-hba", no_argument, NULL, 'S' }, { "pg-hba-lan", no_argument, NULL, 'L' }, { "dbname", required_argument, NULL, 'd' }, { "name", required_argument, NULL, 'a' }, { "hostname", required_argument, NULL, 'n' }, { "formation", required_argument, NULL, 'f' }, { "monitor", required_argument, NULL, 'm' }, { "disable-monitor", no_argument, NULL, 'M' }, { "node-id", required_argument, NULL, 'I' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { "candidate-priority", required_argument, NULL, 'P' }, { "replication-quorum", required_argument, NULL, 'r' }, { "maximum-backup-rate", required_argument, NULL, 'R' }, { "run", no_argument, NULL, 'x' }, { "no-ssl", no_argument, NULL, 'N' }, { "ssl-self-signed", no_argument, NULL, 's' }, { "ssl-mode", required_argument, &ssl_flag, SSL_MODE_FLAG }, { "ssl-ca-file", 
required_argument, &ssl_flag, SSL_CA_FILE_FLAG }, { "ssl-crl-file", required_argument, &ssl_flag, SSL_CRL_FILE_FLAG }, { "server-cert", required_argument, &ssl_flag, SSL_SERVER_CRT_FLAG }, { "server-key", required_argument, &ssl_flag, SSL_SERVER_KEY_FLAG }, { NULL, 0, NULL, 0 } }; int optind = cli_create_node_getopts(argc, argv, long_options, "C:D:H:p:l:U:A:SLd:a:n:f:m:MI:RVvqhP:r:xsN", &options); /* publish our option parsing in the global variable */ keeperOptions = options; return optind; } /* * cli_create_postgres prepares a local PostgreSQL instance to be used as a * standalone Postgres instance, not in a Citus formation. */ static void cli_create_postgres(int argc, char **argv) { pid_t pid = 0; Keeper keeper = { 0 }; KeeperConfig *config = &(keeper.config); keeper.config = keeperOptions; if (read_pidfile(config->pathnames.pid, &pid)) { log_fatal("pg_autoctl is already running with pid %d", pid); exit(EXIT_CODE_BAD_STATE); } if (!file_exists(config->pathnames.config)) { /* pg_autoctl create postgres: mark ourselves as a standalone node */ config->pgSetup.pgKind = NODE_KIND_STANDALONE; strlcpy(config->nodeKind, "standalone", NAMEDATALEN); if (!check_or_discover_hostname(config)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } } if (!cli_create_config(&keeper)) { log_error("Failed to initialize our configuration, see above."); exit(EXIT_CODE_BAD_CONFIG); } cli_create_pg(&keeper); } /* * cli_create_monitor_getopts parses the command line options necessary to * initialize a PostgreSQL instance as our monitor. 
*/ static int cli_create_monitor_getopts(int argc, char **argv) { MonitorConfig options = { 0 }; int c, option_index = 0, errors = 0; int verboseCount = 0; SSLCommandLineOptions sslCommandLineOptions = SSL_CLI_UNKNOWN; static struct option long_options[] = { { "pgctl", required_argument, NULL, 'C' }, { "pgdata", required_argument, NULL, 'D' }, { "pgport", required_argument, NULL, 'p' }, { "hostname", required_argument, NULL, 'n' }, { "listen", required_argument, NULL, 'l' }, { "auth", required_argument, NULL, 'A' }, { "skip-pg-hba", no_argument, NULL, 'S' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { "run", no_argument, NULL, 'x' }, { "no-ssl", no_argument, NULL, 'N' }, { "ssl-self-signed", no_argument, NULL, 's' }, { "ssl-mode", required_argument, &ssl_flag, SSL_MODE_FLAG }, { "ssl-ca-file", required_argument, &ssl_flag, SSL_CA_FILE_FLAG }, { "ssl-crl-file", required_argument, &ssl_flag, SSL_CRL_FILE_FLAG }, { "server-cert", required_argument, &ssl_flag, SSL_SERVER_CRT_FLAG }, { "server-key", required_argument, &ssl_flag, SSL_SERVER_KEY_FLAG }, { NULL, 0, NULL, 0 } }; /* hard-coded defaults */ options.pgSetup.pgport = pgsetup_get_pgport(); optind = 0; while ((c = getopt_long(argc, argv, "C:D:p:n:l:A:SVvqhxNs", long_options, &option_index)) != -1) { switch (c) { case 'C': { strlcpy(options.pgSetup.pg_ctl, optarg, MAXPGPATH); log_trace("--pg_ctl %s", options.pgSetup.pg_ctl); break; } case 'D': { strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'p': { if (!stringToInt(optarg, &options.pgSetup.pgport)) { log_fatal("--pgport argument is a valid port number: \"%s\"", optarg); exit(EXIT_CODE_BAD_ARGS); } log_trace("--pgport %d", options.pgSetup.pgport); break; } case 'l': { strlcpy(options.pgSetup.listen_addresses, optarg, MAXPGPATH); log_trace("--listen %s", options.pgSetup.listen_addresses); 
break; } case 'n': { strlcpy(options.hostname, optarg, _POSIX_HOST_NAME_MAX); log_trace("--hostname %s", options.hostname); break; } case 'A': { if (!IS_EMPTY_STRING_BUFFER(options.pgSetup.authMethod)) { errors++; log_error("Please use either --auth or --skip-pg-hba"); } strlcpy(options.pgSetup.authMethod, optarg, NAMEDATALEN); log_trace("--auth %s", options.pgSetup.authMethod); break; } case 'S': { if (!IS_EMPTY_STRING_BUFFER(options.pgSetup.authMethod)) { errors++; log_error("Please use either --auth or --skip-pg-hba"); } /* force default authentication method then */ strlcpy(options.pgSetup.authMethod, DEFAULT_AUTH_METHOD, NAMEDATALEN); options.pgSetup.hbaLevel = HBA_EDIT_SKIP; log_trace("--skip-pg-hba"); break; } case 'V': { /* keeper_cli_print_version prints version and exits. */ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } case 'x': { /* { "run", no_argument, NULL, 'x' }, */ createAndRun = true; log_trace("--run"); break; } case 's': { /* { "ssl-self-signed", no_argument, NULL, 's' }, */ if (!cli_getopt_accept_ssl_options(SSL_CLI_SELF_SIGNED, sslCommandLineOptions)) { errors++; break; } sslCommandLineOptions = SSL_CLI_SELF_SIGNED; options.pgSetup.ssl.active = 1; options.pgSetup.ssl.createSelfSignedCert = true; log_trace("--ssl-self-signed"); break; } case 'N': { /* { "no-ssl", no_argument, NULL, 'N' }, */ if (!cli_getopt_accept_ssl_options(SSL_CLI_NO_SSL, sslCommandLineOptions)) { errors++; break; } sslCommandLineOptions = SSL_CLI_NO_SSL; options.pgSetup.ssl.active = 0; options.pgSetup.ssl.createSelfSignedCert = false; log_trace("--no-ssl"); break; } /* * { "ssl-ca-file", required_argument, &ssl_flag, SSL_CA_FILE_FLAG } * { "ssl-crl-file", 
required_argument, &ssl_flag, SSL_CA_FILE_FLAG } * { "server-cert", required_argument, &ssl_flag, SSL_SERVER_CRT_FLAG } * { "server-key", required_argument, &ssl_flag, SSL_SERVER_KEY_FLAG } * { "ssl-mode", required_argument, &ssl_flag, SSL_MODE_FLAG }, */ case 0: { if (ssl_flag != SSL_MODE_FLAG) { if (!cli_getopt_accept_ssl_options(SSL_CLI_USER_PROVIDED, sslCommandLineOptions)) { errors++; break; } sslCommandLineOptions = SSL_CLI_USER_PROVIDED; options.pgSetup.ssl.active = 1; } if (!cli_getopt_ssl_flags(ssl_flag, optarg, &(options.pgSetup))) { errors++; } break; } default: { /* getopt_long already wrote an error message */ commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); break; } } } if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } /* * We're not using pg_setup_init() here: we are following a very different * set of rules. We just want to check: * * - PGDATA is set and the directory does not exist * - PGPORT is either set or defaults to 5432 * * Also we use the first pg_ctl binary found in the PATH, we're not picky * here, we don't have to manage the whole life-time of that PostgreSQL * instance. */ cli_common_get_set_pgdata_or_exit(&(options.pgSetup)); /* * We support two modes of operations here: * - configuration exists already, we need PGDATA * - configuration doesn't exist already, we need PGDATA, and more */ if (!monitor_config_set_pathnames_from_pgdata(&options)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } /* * We require the user to specify an authentication mechanism, or to use * --skip-pg-hba. Our documentation tutorial will use --auth trust, and we * should make it obvious that this is not the right choice for production. */ if (IS_EMPTY_STRING_BUFFER(options.pgSetup.authMethod)) { log_fatal("Please use either --auth trust|md5|... or --skip-pg-hba"); log_info("pg_auto_failover can be set to edit Postgres HBA rules " "automatically when needed. 
For quick testing '--auth trust' " "makes it easy to get started, " "consider another authentication mechanism for production."); exit(EXIT_CODE_BAD_ARGS); } /* * If any --ssl-* option is provided, either we have a root ca file and a * server.key and a server.crt or none of them. Any other combo is a * mistake. */ if (sslCommandLineOptions == SSL_CLI_UNKNOWN) { log_fatal("Explicit SSL choice is required: please use either " "--ssl-self-signed or provide your certificates " "using --ssl-ca-file, --ssl-crl-file, " "--server-key, and --server-cert (or use --no-ssl if you " "are very sure that you do not want encrypted traffic)"); exit(EXIT_CODE_BAD_ARGS); } if (!pgsetup_validate_ssl_settings(&(options.pgSetup))) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } if (IS_EMPTY_STRING_BUFFER(options.pgSetup.pg_ctl)) { set_first_pgctl(&(options.pgSetup)); } if (IS_EMPTY_STRING_BUFFER(options.pgSetup.listen_addresses)) { strlcpy(options.pgSetup.listen_addresses, POSTGRES_DEFAULT_LISTEN_ADDRESSES, MAXPGPATH); } /* publish our option parsing in the global variable */ monitorOptions = options; return optind; } /* * cli_create_monitor_config takes care of the monitor configuration, either * creating it from scratch or merging the pg_autoctl create monitor command * line arguments and options with the pre-existing configuration file (for * when people change their mind or fix an error in the previous command). 
*/ static bool cli_create_monitor_config(Monitor *monitor) { MonitorConfig *config = &(monitor->config); bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; if (file_exists(config->pathnames.config)) { MonitorConfig options = monitor->config; if (!monitor_config_read_file(config, missingPgdataIsOk, pgIsNotRunningIsOk)) { log_fatal("Failed to read configuration file \"%s\"", config->pathnames.config); exit(EXIT_CODE_BAD_CONFIG); } /* * Now that we have loaded the configuration file, apply the command * line options on top of it, giving them priority over the config. */ if (!monitor_config_merge_options(config, &options)) { /* errors have been logged already */ exit(EXIT_CODE_BAD_CONFIG); } } else { /* Take care of the --hostname */ if (IS_EMPTY_STRING_BUFFER(config->hostname)) { if (!ipaddrGetLocalHostname(config->hostname, sizeof(config->hostname))) { char monitorHostname[_POSIX_HOST_NAME_MAX] = { 0 }; strlcpy(monitorHostname, DEFAULT_INTERFACE_LOOKUP_SERVICE_NAME, _POSIX_HOST_NAME_MAX); if (!discover_hostname((char *) &(config->hostname), _POSIX_HOST_NAME_MAX, DEFAULT_INTERFACE_LOOKUP_SERVICE_NAME, DEFAULT_INTERFACE_LOOKUP_SERVICE_PORT)) { log_fatal("Failed to auto-detect the hostname " "of this machine, please provide one " "via --hostname"); exit(EXIT_CODE_BAD_ARGS); } } } else { /* * When provided with a --hostname option, we run some checks on * the user provided value based on Postgres usage for the hostname * in its HBA setup. Both forward and reverse DNS needs to return * meaningful values for the connections to be granted when using a * hostname. * * That said network setup is something complex and we don't * pretend we are able to avoid any and all false negatives in our * checks, so we only WARN when finding something that might be * fishy, and proceed with the setup of the local node anyway. */ (void) check_hostname(config->hostname); } /* set our MonitorConfig from the command line options now. 
*/ (void) monitor_config_init(config, missingPgdataIsOk, pgIsNotRunningIsOk); /* and write our brand new setup to file */ if (!monitor_config_write_file(config)) { log_fatal("Failed to write the monitor's configuration file, " "see above"); exit(EXIT_CODE_BAD_CONFIG); } } return true; } /* * Initialize the PostgreSQL instance that we're using for the Monitor: * * - pg_ctl initdb * - add postgresql-citus.conf to postgresql.conf * - pg_ctl start * - create user autoctl with createdb login; * - create database pg_auto_failover with owner autoctl; * - create extension pgautofailover; * * When this function is called (from monitor_config_init at the CLI level), we * know that PGDATA has been initdb already, and that's about it. * */ static void cli_create_monitor(int argc, char **argv) { pid_t pid = 0; Monitor monitor = { 0 }; MonitorConfig *config = &(monitor.config); monitor.config = monitorOptions; if (read_pidfile(config->pathnames.pid, &pid)) { log_fatal("pg_autoctl is already running with pid %d", pid); exit(EXIT_CODE_BAD_STATE); } if (!cli_create_monitor_config(&monitor)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } /* Initialize our local connection to the monitor */ if (!monitor_local_init(&monitor)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } /* Ok, now we know we have a configuration file, and it's been loaded. */ if (!monitor_pg_init(&monitor)) { /* errors have been logged */ exit(EXIT_CODE_BAD_STATE); } if (!service_monitor_init(&monitor)) { /* errors have been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } } /* * check_or_discover_hostname checks given --hostname or attempt to discover a * suitable default value for the current node when it's not been provided on * the command line. 
*/ bool check_or_discover_hostname(KeeperConfig *config) { /* take care of the hostname */ if (IS_EMPTY_STRING_BUFFER(config->hostname)) { char monitorHostname[_POSIX_HOST_NAME_MAX]; int monitorPort = 0; /* * When --disable-monitor, use the defaults for ipAddr discovery, same * as when creating the monitor node itself. */ if (config->monitorDisabled) { strlcpy(monitorHostname, DEFAULT_INTERFACE_LOOKUP_SERVICE_NAME, _POSIX_HOST_NAME_MAX); monitorPort = DEFAULT_INTERFACE_LOOKUP_SERVICE_PORT; } else if (!hostname_from_uri(config->monitor_pguri, monitorHostname, _POSIX_HOST_NAME_MAX, &monitorPort)) { log_fatal("Failed to determine monitor hostname when parsing " "Postgres URI \"%s\"", config->monitor_pguri); return false; } if (!discover_hostname((char *) &(config->hostname), _POSIX_HOST_NAME_MAX, monitorHostname, monitorPort)) { log_fatal("Failed to auto-detect the hostname of this machine, " "please provide one via --hostname"); return false; } } else { /* * When provided with a --hostname option, we run some checks on the * user provided value based on Postgres usage for the hostname in its * HBA setup. Both forward and reverse DNS needs to return meaningful * values for the connections to be granted when using a hostname. * * That said network setup is something complex and we don't pretend we * are able to avoid any and all false negatives in our checks, so we * only WARN when finding something that might be fishy, and proceed * with the setup of the local node anyway. */ (void) check_hostname(config->hostname); } return true; } /* * discover_hostname discovers a suitable --hostname default value in three * steps: * * 1. First find the local LAN IP address by connecting a socket() to either an * internet service (8.8.8.8:53) or to the monitor's hostname and port, and * then inspecting which local address has been used. * * 2. Use the local IP address obtained in the first step and do a reverse DNS * lookup for it. The answer is our candidate default --hostname. 
* * 3. Do a DNS lookup for the candidate default --hostname. If we get back a IP * address that matches one of the local network interfaces, we keep the * candidate, the DNS lookup that Postgres does at connection time is * expected to then work. * * All this dansing around DNS lookups is necessary in order to mimic Postgres * HBA matching of hostname rules against client IP addresses: the hostname in * the HBA rule is resolved and compared to the client IP address. We want the * --hostname we use to resolve to an IP address that exists on the local * Postgres server. * * Worst case here is that we fail to discover a --hostname and then ask the * user to provide one for us. * * monitorHostname and monitorPort are used to open a socket to that address, * in order to find the right outbound interface. When creating a monitor node, * of course, we don't have the monitorHostname yet: we are trying to discover * it... in that case we use PG_AUTOCTL_DEFAULT_SERVICE_NAME and PORT, which * are the Google DNS service: 8.8.8.8:53, expected to be reachable. */ bool discover_hostname(char *hostname, int size, const char *monitorHostname, int monitorPort) { /* * Try and find a default --hostname. The --hostname is mandatory, so * when not provided for by the user, then failure to discover a * suitable hostname is a fatal error. 
*/ char ipAddr[BUFSIZE]; char localIpAddr[BUFSIZE]; char hostnameCandidate[_POSIX_HOST_NAME_MAX]; ConnectionRetryPolicy retryPolicy = { 0 }; /* retry connecting to the monitor when it's not available */ (void) pgsql_set_monitor_interactive_retry_policy(&retryPolicy); while (!pgsql_retry_policy_expired(&retryPolicy)) { bool mayRetry = false; /* fetch our local address among the network interfaces */ if (fetchLocalIPAddress(ipAddr, BUFSIZE, monitorHostname, monitorPort, LOG_DEBUG, &mayRetry)) { /* success: break out of the retry loop */ break; } if (!mayRetry) { log_fatal("Failed to find a local IP address, " "please provide --hostname."); return false; } int sleepTimeMs = pgsql_compute_connection_retry_sleep_time(&retryPolicy); log_warn("Failed to connect to \"%s\" on port %d " "to discover this machine hostname, " "retrying in %d ms.", monitorHostname, monitorPort, sleepTimeMs); /* we have milliseconds, pg_usleep() wants microseconds */ (void) pg_usleep(sleepTimeMs * 1000); } /* from there on we can take the ipAddr as the default --hostname */ strlcpy(hostname, ipAddr, size); log_debug("discover_hostname: local ip %s", ipAddr); /* do a reverse DNS lookup from our local LAN ip address */ if (!findHostnameFromLocalIpAddress(ipAddr, hostnameCandidate, _POSIX_HOST_NAME_MAX)) { /* errors have already been logged */ log_info("Using local IP address \"%s\" as the --hostname.", ipAddr); return true; } log_debug("discover_hostname: host from ip %s", hostnameCandidate); /* do a DNS lookup of the hostname we got from the IP address */ if (!findHostnameLocalAddress(hostnameCandidate, localIpAddr, BUFSIZE)) { /* errors have already been logged */ log_info("Using local IP address \"%s\" as the --hostname.", ipAddr); return true; } log_debug("discover_hostname: ip from host %s", localIpAddr); /* * ok ipAddr resolves to an hostname that resolved back to a local address, * we should be able to use the hostname in pg_hba.conf */ strlcpy(hostname, hostnameCandidate, size); 
log_info("Using --hostname \"%s\", which resolves to IP address \"%s\"", hostname, localIpAddr); return true; } /* * check_hostname runs some DNS check against the provided --hostname in order * to warn the user in case we might later fail to use it in the Postgres HBA * setup. * * The main trouble we guard against is from HBA authentication. Postgres HBA * check_hostname() does a DNS lookup of the hostname found in the pg_hba.conf * file and then compares the IP addresses obtained to the client IP address, * and refuses the connection where there's no match. */ static void check_hostname(const char *hostname) { char localIpAddress[INET_ADDRSTRLEN]; IPType ipType = ip_address_type(hostname); if (ipType == IPTYPE_NONE) { if (!findHostnameLocalAddress(hostname, localIpAddress, INET_ADDRSTRLEN)) { log_warn( "Failed to resolve hostname \"%s\" to a local IP address, " "automated pg_hba.conf setup might fail.", hostname); } } else { char cidr[BUFSIZE] = { 0 }; char ipaddr[BUFSIZE] = { 0 }; if (!fetchLocalCIDR(hostname, cidr, BUFSIZE)) { log_warn("Failed to find adress \"%s\" in local network " "interfaces, automated pg_hba.conf setup might fail.", hostname); } bool useHostname = false; /* use pghba_check_hostname for log diagnostics */ (void) pghba_check_hostname(hostname, ipaddr, BUFSIZE, &useHostname); } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_do_azure.c000066400000000000000000000320671414244367200233700ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_do_azure.c * Implementation of a CLI which lets you call `az` cli commands to prepare * a pg_auto_failover demo or QA environment. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include #include #include #include "postgres_fe.h" #include "pqexpbuffer.h" #include "snprintf.h" #include "azure.h" #include "azure_config.h" #include "cli_common.h" #include "cli_do_root.h" #include "cli_root.h" #include "commandline.h" #include "config.h" #include "env_utils.h" #include "log.h" #include "pidfile.h" #include "signals.h" #include "string_utils.h" #include "runprogram.h" static AzureOptions azOptions = { 0 }; static AzureRegionResources azRegion = { 0 }; bool dryRun = false; PQExpBuffer azureScript = NULL; static void outputAzureScript(void); /* * cli_print_version_getopts parses the CLI options for the pg_autoctl version * command, which are the usual suspects. */ int cli_do_azure_getopts(int argc, char **argv) { int c, option_index = 0, errors = 0; int verboseCount = 0; bool printVersion = false; AzureOptions options = { 0 }; static struct option long_options[] = { { "prefix", required_argument, NULL, 'p' }, { "region", required_argument, NULL, 'r' }, { "location", required_argument, NULL, 'l' }, { "from-source", no_argument, NULL, 's' }, { "nodes", required_argument, NULL, 'N' }, { "no-monitor", no_argument, NULL, 'M' }, { "no-app", no_argument, NULL, 'n' }, { "all", no_argument, NULL, 'A' }, { "script", no_argument, NULL, 'S' }, { "watch", no_argument, NULL, 'T' }, { "az", no_argument, NULL, 'Z' }, { "cidr", no_argument, NULL, 'c' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; optind = 0; /* set our defaults */ options.cidr = 11; /* 10.11.0.0/16 and 10.11.11.0/24 */ options.nodes = 2; options.fromSource = false; options.appNode = true; options.monitor = true; options.all = false; options.watch = false; strlcpy(options.prefix, "ha-demo", sizeof(options.prefix)); /* * The only command lines that are using keeper_cli_getopt_pgdata are * terminal ones: they don't accept 
subcommands. In that case our option * parsing can happen in any order and we don't need getopt_long to behave * in a POSIXLY_CORRECT way. * * The unsetenv() call allows getopt_long() to reorder arguments for us. */ unsetenv("POSIXLY_CORRECT"); while ((c = getopt_long(argc, argv, "p:n:l:N:MAWSTVvqh", long_options, &option_index)) != -1) { switch (c) { /* { "prefix", required_argument, NULL, 'p' }, */ case 'p': { strlcpy(options.prefix, optarg, NAMEDATALEN); log_trace("--prefix %s", options.prefix); break; } /* { "region", required_argument, NULL, 'r' }, */ case 'r': { strlcpy(options.region, optarg, NAMEDATALEN); log_trace("--region %s", options.region); break; } /* { "location", required_argument, NULL, 'l' }, */ case 'l': { strlcpy(options.location, optarg, NAMEDATALEN); log_trace("--location %s", options.location); break; } /* { "az", no_argument, NULL, 'Z' }, */ case 'Z': { strlcpy(azureCLI, optarg, NAMEDATALEN); log_trace("--az %s", azureCLI); break; } /* { "cidr", no_argument, NULL, 'c' }, */ case 'c': { if (!stringToInt(optarg, &options.cidr)) { log_error("Failed to parse --cidr number \"%s\"", optarg); errors++; } else if (options.cidr < 1 || options.cidr > 254) { log_error("Failed to parse --cidr number \"%s\"", optarg); errors++; } else { log_trace("--cidr %d", options.cidr); } break; } /* { "nodes", required_argument, NULL, 'N' }, */ case 'N': { if (!stringToInt(optarg, &options.nodes)) { log_error("Failed to parse --nodes number \"%s\"", optarg); errors++; } log_trace("--nodes %d", options.nodes); break; } /* { "no-monitor", no_argument, NULL, 'M' }, */ case 'M': { options.monitor = false; log_trace("--no-monitor"); break; } /* { "no-app", no_argument, NULL, 'n' }, */ case 'n': { options.appNode = false; log_trace("--no-app"); break; } /* { "from-source", required_argument, NULL, 's' }, */ case 's': { options.fromSource = true; log_trace("--from-source"); break; } /* { "all", no_argument, NULL, 'A' }, */ case 'A': { options.all = true; 
log_trace("--monitor"); break; } /* { "script", no_argument, NULL, 'S' }, */ case 'S': { dryRun = true; log_trace("--script"); break; } /* { "watch", no_argument, NULL, 'T' }, */ case 'T': { options.watch = true; log_trace("--watch"); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } case 'V': { /* keeper_cli_print_version prints version and exits. */ printVersion = true; break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } default: { /* getopt_long already wrote an error message */ errors++; break; } } } if (IS_EMPTY_STRING_BUFFER(options.prefix)) { ++errors; log_fatal("--prefix is a mandatory option"); } if (IS_EMPTY_STRING_BUFFER(azureCLI)) { if (!search_path_first("az", azureCLI, LOG_ERROR)) { ++errors; log_fatal("Failed to find program \"%s\" in PATH", "az"); } } else { if (!file_exists(azureCLI)) { ++errors; log_fatal("No such file or directory: \"%s\"", azureCLI); } } if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } if (printVersion) { keeper_cli_print_version(argc, argv); } /* * From command line options parsing, prepare a AzureRegionResources in our * static place. * * If a configuration file exists already, it takes precendence, because we * have probably already created all the resources on Azure and deployed * things there. * * If no configuration file exists already, we create one filled with the * options given in the command line. 
*/ (void) azure_config_prepare(&options, &azRegion); if (file_exists(azRegion.filename)) { log_info("Reading configuration from \"%s\"", azRegion.filename); if (!azure_config_read_file(&azRegion)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } /* maybe late we will merge new options in the pre-existing file */ log_warn("Ignoring command line options, " "configuration file takes precedence"); log_info("Using --prefix \"%s\" --region \"%s\" --location \"%s\"", azRegion.prefix, azRegion.region, azRegion.location); } else { if (!azure_config_write_file(&azRegion)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } } /* when a configuration file already exists, it provides the location */ if (IS_EMPTY_STRING_BUFFER(azRegion.location)) { log_fatal("--location is a mandatory option"); exit(EXIT_CODE_BAD_ARGS); } /* * In --script mode (or dry run) we generate a script with the commands we * would run instead of actually running them. */ if (dryRun) { azureScript = createPQExpBuffer(); if (azureScript == NULL) { log_error("Failed to allocate memory"); exit(EXIT_CODE_INTERNAL_ERROR); } appendPQExpBuffer(azureScript, "# azure commands for pg_auto_failover demo"); } /* publish parsed options */ azOptions = options; return optind; } /* * outputAzureScript writes the azure script to stdout. */ static void outputAzureScript() { if (dryRun) { fformat(stdout, "%s\n", azureScript->data); destroyPQExpBuffer(azureScript); } } /* * cli_do_azure_create_environment creates an Azure region with some nodes and * network rules for a demo or QA context of pg_auto_failover, then provision * those VMs with the needed software, and then create pg_auto_failover nodes * from that, in a tmux session for interactive QA. */ void cli_do_azure_create_environment(int argc, char **argv) { /* * azure_create_region creates the resources we need (VMs, network, access * rules, etc) and then provision the VMs with the needed software. 
*/ if (!azure_create_region(&azRegion)) { exit(EXIT_CODE_INTERNAL_ERROR); } (void) outputAzureScript(); /* * tmux_azure_start_or_attach_session then creates a tmux session with a * shell window for each VM in the Azure resource group, and in each * session in parallel runs the pg_autoctl create commands, and then add * the setup to systemd. * * Another tmux window is created to run pg_autoctl show state in a watch * loop. * * An extra window is created for interactive tinkering with the QA * environment thus provided. */ if (!tmux_azure_start_or_attach_session(&azRegion)) { exit(EXIT_CODE_INTERNAL_ERROR); } } /* * cli_do_azure_create_region creates an Azure region with some nodes and * network rules for a demo or QA context of pg_auto_failover. */ void cli_do_azure_create_region(int argc, char **argv) { if (!azure_create_region(&azRegion)) { exit(EXIT_CODE_INTERNAL_ERROR); } (void) outputAzureScript(); } /* * cli_do_azure_drop_region drops the azure resource group that has been * created to host the azure resources in use for the environment. */ void cli_do_azure_drop_region(int argc, char **argv) { bool success = true; if (!azure_drop_region(&azRegion)) { log_warn("Configuration file \"%s\" has not been deleted", azRegion.filename); exit(EXIT_CODE_INTERNAL_ERROR); } log_info("Killing tmux sessions \"%s\"", azRegion.group); success = success && tmux_azure_kill_session(&azRegion); log_info("Removing azure configuration file \"%s\"", azRegion.filename); if (!unlink_file(azRegion.filename)) { log_fatal("Failed to remove azure configuration file"); exit(EXIT_CODE_INTERNAL_ERROR); } (void) outputAzureScript(); } /* * cli_do_azure_deploy deploys the pg_autoctl services in the target VM, given * by name (such as "monitor" or "a" or "b", etc). 
*/ void cli_do_azure_deploy(int argc, char **argv) { if (argc != 1) { (void) commandline_print_usage(&do_azure_ssh, stderr); exit(EXIT_CODE_BAD_ARGS); } if (!azure_deploy_vm(&azRegion, argv[0])) { exit(EXIT_CODE_INTERNAL_ERROR); } (void) outputAzureScript(); } /* * cli_do_azure_create_nodes creates the pg_autoctl services in an Azure * region that's been created and provisionned before. */ void cli_do_azure_create_nodes(int argc, char **argv) { if (!azure_create_nodes(&azRegion)) { exit(EXIT_CODE_INTERNAL_ERROR); } (void) outputAzureScript(); } /* * cli_do_azure_ls lists Azure resources created in the target region. */ void cli_do_azure_ls(int argc, char **argv) { if (!azure_ls(&azRegion)) { exit(EXIT_CODE_INTERNAL_ERROR); } } /* * cli_do_azure_show_ips lists Azure ip addresses assigned to created VMs in a * specific region. */ void cli_do_azure_show_ips(int argc, char **argv) { if (!azure_show_ips(&azRegion)) { exit(EXIT_CODE_INTERNAL_ERROR); } } /* * cli_do_azure_ssh starts an ssh command to the given Azure VM in a specific * prefix and region name. */ void cli_do_azure_ssh(int argc, char **argv) { if (argc != 1) { (void) commandline_print_usage(&do_azure_ssh, stderr); exit(EXIT_CODE_BAD_ARGS); } if (!azure_ssh(&azRegion, argv[0])) { exit(EXIT_CODE_INTERNAL_ERROR); } } /* * cli_do_azure_rsync uses rsync to upload the current sources to all the * created VMs in the target region. */ void cli_do_azure_rsync(int argc, char **argv) { if (!azure_sync_source_dir(&azRegion)) { exit(EXIT_CODE_INTERNAL_ERROR); } } /* * cli_do_azure_ssh starts an ssh command to the given Azure VM in a specific * prefix and region name. */ void cli_do_azure_show_state(int argc, char **argv) { char *pg_autoctl_command = azOptions.watch ? 
"watch -n 0.2 pg_autoctl show state --pgdata ./monitor" : "pg_autoctl show state --pgdata ./monitor"; if (!azure_ssh_command(&azRegion, "monitor", azOptions.watch, /* tty is needed for watch */ pg_autoctl_command)) { exit(EXIT_CODE_INTERNAL_ERROR); } } /* * cli_do_azure_tmux_session starts or re-attach to a tmux session from where * to control the VMs in the QA environment on Azure. */ void cli_do_azure_tmux_session(int argc, char **argv) { if (!tmux_azure_start_or_attach_session(&azRegion)) { exit(EXIT_CODE_INTERNAL_ERROR); } } /* * cli_do_azure_tmux_session starts or re-attach to a tmux session from where * to control the VMs in the QA environment on Azure. */ void cli_do_azure_tmux_kill(int argc, char **argv) { if (!tmux_azure_kill_session(&azRegion)) { exit(EXIT_CODE_INTERNAL_ERROR); } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_do_demoapp.c000066400000000000000000000324751414244367200236720ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_do_demoapp.c * Implementation of a demo application that shows how to handle automatic * reconnection when a failover happened, and uses a single URI. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include #include "postgres_fe.h" #include "portability/instr_time.h" #include "cli_common.h" #include "cli_do_demoapp.h" #include "cli_do_root.h" #include "commandline.h" #include "defaults.h" #include "demoapp.h" #include "file_utils.h" #include "ipaddr.h" #include "monitor.h" #include "pgctl.h" DemoAppOptions demoAppOptions = { 0 }; static int cli_do_demoapp_getopts(int argc, char **argv); static void cli_demo_run(int argc, char **argv); static void cli_demo_uri(int argc, char **argv); static void cli_demo_ping(int argc, char **argv); static void cli_demo_summary(int argc, char **argv); static CommandLine do_demo_run_command = make_command("run", "Run the pg_auto_failover demo application", "[option ...]", " --monitor Postgres URI of the pg_auto_failover monitor\n" " --formation Formation to use (default)\n" " --group Group Id to failover (0)\n" " --username PostgreSQL's username\n" " --clients How many client processes to use (1)\n" " --duration Duration of the demo app, in seconds (30)\n" " --first-failover Timing of the first failover (10)\n" " --failover-freq Seconds between subsequent failovers (45)\n", cli_do_demoapp_getopts, cli_demo_run); static CommandLine do_demo_uri_command = make_command("uri", "Grab the application connection string from the monitor", "[option ...]", " --monitor Postgres URI of the pg_auto_failover monitor\n" " --formation Formation to use (default)\n" " --group Group Id to failover (0)\n" \ " --username PostgreSQL's username\n" " --clients How many client processes to use (1)\n" " --duration Duration of the demo app, in seconds (30)\n", cli_do_demoapp_getopts, cli_demo_uri); static CommandLine do_demo_ping_command = make_command("ping", "Attempt to connect to the application URI", "[option ...]", " --monitor Postgres URI of the pg_auto_failover monitor\n" " --formation Formation to use (default)\n" " --group Group Id to failover (0)\n" \ " --username PostgreSQL's username\n" " 
--clients How many client processes to use (1)\n" " --duration Duration of the demo app, in seconds (30)\n", cli_do_demoapp_getopts, cli_demo_ping); static CommandLine do_demo_summary_command = make_command("summary", "Display a summary of the previous demo app run", "[option ...]", " --monitor Postgres URI of the pg_auto_failover monitor\n" " --formation Formation to use (default)\n" " --group Group Id to failover (0)\n" \ " --username PostgreSQL's username\n" " --clients How many client processes to use (1)\n" " --duration Duration of the demo app, in seconds (30)\n", cli_do_demoapp_getopts, cli_demo_summary); CommandLine *do_demo_subcommands[] = { &do_demo_run_command, &do_demo_uri_command, &do_demo_ping_command, &do_demo_summary_command, NULL }; CommandLine do_demo_commands = make_command_set("demo", "Use a demo application for pg_auto_failover", NULL, NULL, NULL, do_demo_subcommands); /* * cli_do_demoapp_getopts parses the command line options for the demo * sub-commands. */ static int cli_do_demoapp_getopts(int argc, char **argv) { int c, option_index = 0, errors = 0; int verboseCount = 0; bool printVersion = false; DemoAppOptions options = { 0 }; static struct option long_options[] = { { "monitor", required_argument, NULL, 'm' }, { "formation", required_argument, NULL, 'f' }, { "group", required_argument, NULL, 'g' }, { "username", required_argument, NULL, 'U' }, { "clients", required_argument, NULL, 'c' }, { "duration", required_argument, NULL, 't' }, { "no-failover", no_argument, NULL, 'N' }, { "first-failover", required_argument, NULL, 'F' }, { "failover-freq", required_argument, NULL, 'Q' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; optind = 0; /* set our defaults */ options.groupId = 0; options.clientsCount = 1; options.duration = 30; options.firstFailover = 10; options.failoverFreq = 45; options.doFailover = true; 
strlcpy(options.formation, "default", sizeof(options.formation)); /* * The only command lines that are using cli_do_demoapp_getopts are * terminal ones: they don't accept subcommands. In that case our option * parsing can happen in any order and we don't need getopt_long to behave * in a POSIXLY_CORRECT way. * * The unsetenv() call allows getopt_long() to reorder arguments for us. */ unsetenv("POSIXLY_CORRECT"); while ((c = getopt_long(argc, argv, "D:p:Vvqh", long_options, &option_index)) != -1) { switch (c) { case 'm': { /* { "monitor", required_argument, NULL, 'm' } */ if (!validate_connection_string(optarg)) { log_fatal("Failed to parse --monitor connection string, " "see above for details."); exit(EXIT_CODE_BAD_ARGS); } strlcpy(options.monitor_pguri, optarg, MAXCONNINFO); log_trace("--monitor %s", options.monitor_pguri); break; } case 'f': { /* { "formation", required_argument, NULL, 'f' } */ strlcpy(options.formation, optarg, NAMEDATALEN); log_trace("--formation %s", options.formation); break; } case 'N': { /* { "no-failover", no_argument, NULL, 'N' }, */ options.doFailover = false; log_trace("--no-failover"); break; } case 'g': { /* { "group", required_argument, NULL, 'g' } */ if (!stringToInt(optarg, &options.groupId)) { log_fatal("--group argument is not a valid group ID: \"%s\"", optarg); exit(EXIT_CODE_BAD_ARGS); } log_trace("--group %d", options.groupId); break; } case 'U': { /* { "username", required_argument, NULL, 'U' } */ strlcpy(options.username, optarg, NAMEDATALEN); log_trace("--username %s", options.username); break; } case 'c': { /* { "clients", required_argument, NULL, 'c' }, */ if (!stringToInt(optarg, &options.clientsCount)) { log_error("Failed to parse --clients number \"%s\"", optarg); errors++; } if (options.clientsCount < 1 || options.clientsCount > MAX_CLIENTS_COUNT) { log_error("Unsupported value for --clients: %d must be " "at least 1 and maximum %d", options.clientsCount, MAX_CLIENTS_COUNT); } log_trace("--clients %d", 
options.clientsCount); break; } case 't': { /* { "duration", required_argument, NULL, 't' }, */ if (!stringToInt(optarg, &options.duration)) { log_error("Failed to parse --duration number \"%s\"", optarg); errors++; } log_trace("--duration %d", options.duration); break; } case 'F': { /* { "first-failover", required_argument, NULL, 'F' }, */ if (!stringToInt(optarg, &options.firstFailover)) { log_error("Failed to parse --first-failover number \"%s\"", optarg); errors++; } log_trace("--first-failover %d", options.firstFailover); break; } case 'Q': { /* { "failover-freq", required_argument, NULL, 'Q' }, */ if (!stringToInt(optarg, &options.failoverFreq)) { log_error("Failed to parse --failover-freq number \"%s\"", optarg); errors++; } log_trace("--failover-freq %d", options.failoverFreq); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } case 'V': { /* keeper_cli_print_version prints version and exits. */ printVersion = true; break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } default: { /* getopt_long already wrote an error message */ errors++; break; } } } if (IS_EMPTY_STRING_BUFFER(options.monitor_pguri)) { if (env_exists(PG_AUTOCTL_MONITOR) && get_env_copy(PG_AUTOCTL_MONITOR, options.monitor_pguri, sizeof(options.monitor_pguri))) { log_debug("Using environment PG_AUTOCTL_MONITOR \"%s\"", options.monitor_pguri); } else { log_fatal("Please provide --monitor"); errors++; } } if (IS_EMPTY_STRING_BUFFER(options.username)) { if (!get_env_copy_with_fallback("PGUSER", options.username, NAMEDATALEN, "")) { PostgresSetup pgSetup = { 0 }; char *username = pg_setup_get_username(&pgSetup); strlcpy(options.username, username, sizeof(options.username)); } } /* set our Postgres username as the PGUSER environment variable now */ setenv("PGUSER", 
options.username, 1); if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } if (printVersion) { keeper_cli_print_version(argc, argv); } /* publish parsed options */ demoAppOptions = options; return optind; } /* * cli_demo_run runs a demo application. */ static void cli_demo_run(int argc, char **argv) { char pguri[MAXCONNINFO] = { 0 }; ConnectionRetryPolicy retryPolicy = { 0 }; /* retry connecting to the monitor when it's not available */ (void) pgsql_set_monitor_interactive_retry_policy(&retryPolicy); while (!pgsql_retry_policy_expired(&retryPolicy)) { bool mayRetry = false; if (demoapp_grab_formation_uri(&demoAppOptions, pguri, sizeof(pguri), &mayRetry)) { /* success: break out of the retry loop */ break; } /* errors have already been logged */ if (!mayRetry) { exit(EXIT_CODE_INTERNAL_ERROR); } int sleepTimeMs = pgsql_compute_connection_retry_sleep_time(&retryPolicy); /* we have milliseconds, pg_usleep() wants microseconds */ log_info("Retrying to grab formation \"%s\" URI in %dms", demoAppOptions.formation, sleepTimeMs); (void) pg_usleep(sleepTimeMs * 1000); } log_info("Using application connection string \"%s\"", pguri); log_info("Using Postgres user PGUSER \"%s\"", demoAppOptions.username); if (!demoapp_prepare_schema(pguri)) { log_fatal("Failed to install the demo application schema"); exit(EXIT_CODE_INTERNAL_ERROR); } if (!demoapp_run(pguri, &demoAppOptions)) { log_fatal("Failed to run the demo application"); exit(EXIT_CODE_INTERNAL_ERROR); } /* show the historgram now, avoid the fully detailed summary */ (void) demoapp_print_histogram(pguri, &demoAppOptions); } /* * cli_demo_uri returns the Postgres connection string (URI) to use in the demo * application, grabbed from a running monitor node by using the SQL API. 
*/ static void cli_demo_uri(int argc, char **argv) { bool mayRetry = false; char pguri[MAXCONNINFO] = { 0 }; if (!demoapp_grab_formation_uri(&demoAppOptions, pguri, sizeof(pguri), &mayRetry)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } fformat(stdout, "%s\n", pguri); } /* * cli_demo_ping connects to the application connection string retrieved from * the monitor, and outputs some statistics about the connection attempt(s) and * its success or failure. */ static void cli_demo_ping(int argc, char **argv) { PGSQL pgsql = { 0 }; bool mayRetry = false; char pguri[MAXCONNINFO] = { 0 }; bool is_in_recovery = false; if (!demoapp_grab_formation_uri(&demoAppOptions, pguri, sizeof(pguri), &mayRetry)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } log_info("Using application connection string \"%s\"", pguri); log_info("Using Postgres user PGUSER \"%s\"", demoAppOptions.username); pgsql_init(&pgsql, pguri, PGSQL_CONN_LOCAL); if (!pgsql_is_in_recovery(&pgsql, &is_in_recovery)) { /* errors have already been logged */ exit(EXIT_CODE_PGSQL); } instr_time duration; INSTR_TIME_SET_CURRENT(duration); INSTR_TIME_SUBTRACT(duration, pgsql.retryPolicy.startTime); log_info("Connected after %d attempt(s) in %g ms", pgsql.retryPolicy.attempts + 1, INSTR_TIME_GET_MILLISEC(duration)); if (is_in_recovery) { log_error("Failed to connect to a primary node: " "Postgres is in recovery"); exit(EXIT_CODE_PGSQL); } log_info("Target Postgres is not in recovery, " "as expected from a primary node"); } /* * cli_demo_summary prints the summary of the previous demo app run. 
*/ static void cli_demo_summary(int argc, char **argv) { bool mayRetry = false; char pguri[MAXCONNINFO] = { 0 }; if (!demoapp_grab_formation_uri(&demoAppOptions, pguri, sizeof(pguri), &mayRetry)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } log_info("Using application connection string \"%s\"", pguri); log_info("Using Postgres user PGUSER \"%s\"", demoAppOptions.username); (void) demoapp_print_summary(pguri, &demoAppOptions); (void) demoapp_print_histogram(pguri, &demoAppOptions); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_do_demoapp.h000066400000000000000000000017541414244367200236730ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_do_demoapp.h * Implementation of a demo application that shows how to handle automatic * reconnection when a failover happened, and uses a single URI. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef CLI_DO_DEMOAPP_H #define CLI_DO_DEMOAPP_H #include "postgres_fe.h" #include "pqexpbuffer.h" #include "snprintf.h" #include "cli_common.h" #include "cli_do_root.h" #include "cli_root.h" #include "commandline.h" #include "config.h" #include "env_utils.h" #include "log.h" #include "pidfile.h" #include "signals.h" #include "string_utils.h" #define MAX_CLIENTS_COUNT 128 typedef struct DemoAppOptions { char monitor_pguri[MAXCONNINFO]; char formation[NAMEDATALEN]; char username[NAMEDATALEN]; int groupId; int clientsCount; int duration; int firstFailover; int failoverFreq; bool doFailover; } DemoAppOptions; extern DemoAppOptions demoAppOptions; #endif /* CLI_DO_DEMOAPP_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_do_fsm.c000066400000000000000000000310451414244367200230220ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_fsm.c * Implementation of a CLI which lets you run individual keeper Finite * State Machine routines directly * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include "postgres_fe.h" #include "cli_common.h" #include "commandline.h" #include "defaults.h" #include "fsm.h" #include "keeper_config.h" #include "keeper.h" #include "parsing.h" #include "pgctl.h" #include "state.h" #include "string_utils.h" static void cli_do_fsm_init(int argc, char **argv); static void cli_do_fsm_state(int argc, char **argv); static void cli_do_fsm_list(int argc, char **argv); static void cli_do_fsm_gv(int argc, char **argv); static void cli_do_fsm_assign(int argc, char **argv); static void cli_do_fsm_step(int argc, char **argv); static void cli_do_fsm_get_nodes(int argc, char **argv); static void cli_do_fsm_set_nodes(int argc, char **argv); static CommandLine fsm_init = make_command("init", "Initialize the keeper's state on-disk", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_fsm_init); static CommandLine fsm_state = make_command("state", "Read the keeper's state from disk and display it", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_fsm_state); static CommandLine fsm_list = make_command("list", "List reachable FSM states from current state", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_fsm_list); static CommandLine fsm_gv = make_command("gv", "Output the FSM as a .gv program suitable for graphviz/dot", "", NULL, NULL, cli_do_fsm_gv); static CommandLine fsm_assign = make_command("assign", "Assign a new goal state to the keeper", CLI_PGDATA_USAGE "", CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_fsm_assign); static CommandLine fsm_step = make_command("step", "Make a state transition if instructed by the monitor", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_fsm_step); static CommandLine fsm_nodes_get = make_command("get", "Get the list of nodes from file (see --disable-monitor)", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_fsm_get_nodes); static CommandLine fsm_nodes_set = make_command("set", "Set the list of nodes 
to file (see --disable-monitor)", CLI_PGDATA_USAGE "", CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_fsm_set_nodes); static CommandLine *fsm_nodes_[] = { &fsm_nodes_get, &fsm_nodes_set, NULL }; CommandLine fsm_nodes = make_command_set("nodes", "Manually manage the keeper's nodes list", NULL, NULL, NULL, fsm_nodes_); static CommandLine *fsm[] = { &fsm_init, &fsm_state, &fsm_list, &fsm_gv, &fsm_assign, &fsm_step, &fsm_nodes, NULL }; CommandLine do_fsm_commands = make_command_set("fsm", "Manually manage the keeper's state", NULL, NULL, NULL, fsm); /* * cli_do_fsm_init initializes the internal Keeper state, and writes it to * disk. */ static void cli_do_fsm_init(int argc, char **argv) { Keeper keeper = { 0 }; KeeperConfig config = keeperOptions; char keeperStateJSON[BUFSIZE]; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; if (!keeper_config_read_file(&config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } log_info("Initializing an FSM state in \"%s\"", config.pathnames.state); if (!keeper_state_create_file(config.pathnames.state)) { /* errors are logged in keeper_state_write */ exit(EXIT_CODE_BAD_STATE); } if (!keeper_init(&keeper, &config)) { /* errors are logged in keeper_state_read */ exit(EXIT_CODE_BAD_STATE); } if (!keeper_update_pg_state(&keeper, LOG_ERROR)) { log_fatal("Failed to update the keeper's state from the local " "PostgreSQL instance, see above."); exit(EXIT_CODE_BAD_STATE); } if (!keeper_store_state(&keeper)) { /* errors logged in keeper_state_write */ exit(EXIT_CODE_BAD_STATE); } if (!keeper_state_as_json(&keeper, keeperStateJSON, BUFSIZE)) { log_error("Failed to serialize internal keeper state to JSON"); exit(EXIT_CODE_INTERNAL_ERROR); } fformat(stdout, "%s\n", keeperStateJSON); } /* * cli_do_fsm_init initializes the internal Keeper state, and writes it to * disk. 
*/ static void cli_do_fsm_state(int argc, char **argv) { Keeper keeper = { 0 }; KeeperConfig config = keeperOptions; char keeperStateJSON[BUFSIZE]; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; if (!keeper_config_read_file(&config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (!keeper_init(&keeper, &config)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (!keeper_state_as_json(&keeper, keeperStateJSON, BUFSIZE)) { log_error("Failed to serialize internal keeper state to JSON"); exit(EXIT_CODE_INTERNAL_ERROR); } fformat(stdout, "%s\n", keeperStateJSON); } /* * cli_do_fsm_list lists reachable states from the current one. */ static void cli_do_fsm_list(int argc, char **argv) { KeeperStateData keeperState = { 0 }; KeeperConfig config = keeperOptions; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; if (!keeper_config_read_file(&config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } /* now read keeper's state */ if (!keeper_state_read(&keeperState, config.pathnames.state)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_STATE); } if (outputJSON) { log_warn("This command does not support JSON output at the moment"); } print_reachable_states(&keeperState); fformat(stdout, "\n"); } /* * cli_do_fsm_gv outputs the FSM as a .gv program. */ static void cli_do_fsm_gv(int argc, char **argv) { print_fsm_for_graphviz(); } /* * cli_do_fsm_assigns a reachable state from the current one. 
*/ static void cli_do_fsm_assign(int argc, char **argv) { Keeper keeper = { 0 }; KeeperConfig config = keeperOptions; char keeperStateJSON[BUFSIZE]; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; int timeout = 30; int attempts = 0; uint64_t startTime = time(NULL); if (!keeper_config_read_file(&config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (argc != 1) { log_error("USAGE: do fsm state "); commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } NodeState goalState = NodeStateFromString(argv[0]); if (goalState == NO_STATE) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } /* now read keeper's state */ if (!keeper_init(&keeper, &config)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } /* assign the new state */ keeper.state.assigned_role = goalState; if (!keeper_store_state(&keeper)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_STATE); } /* loop over reading the state until assigned state has been reached */ for (attempts = 0; keeper.state.current_role != goalState; attempts++) { uint64_t now = time(NULL); if (!keeper_load_state(&keeper)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_STATE); } /* we're done if we reach the timeout */ if ((now - startTime) >= timeout) { break; } /* sleep 100 ms in between state file probes */ pg_usleep(100 * 1000); } if (keeper.state.current_role != goalState) { uint64_t now = time(NULL); log_warn("Failed to reach goal state \"%s\" in %d attempts and %ds", NodeStateToString(goalState), attempts, (int) (now - startTime)); exit(EXIT_CODE_BAD_STATE); } if (!keeper_state_as_json(&keeper, keeperStateJSON, BUFSIZE)) { log_error("Failed to serialize internal keeper state to JSON"); exit(EXIT_CODE_INTERNAL_ERROR); } fformat(stdout, "%s\n", keeperStateJSON); } /* * cli_do_fsm_step gets the goal state from the monitor, makes * the necessary 
transition, and then reports the current state to * the monitor. */ static void cli_do_fsm_step(int argc, char **argv) { Keeper keeper = { 0 }; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; keeper.config = keeperOptions; if (!keeper_config_read_file(&(keeper.config), missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (keeper.config.monitorDisabled) { log_fatal("The command `pg_autoctl do fsm step` is meant to step as " "instructed by the monitor, and the monitor is disabled."); log_info("HINT: see `pg_autoctl do fsm assign` instead"); exit(EXIT_CODE_BAD_CONFIG); } if (!keeper_init(&keeper, &keeper.config)) { log_fatal("Failed to initialize keeper, see above for details"); exit(EXIT_CODE_PGCTL); } const char *oldRole = NodeStateToString(keeper.state.current_role); if (!keeper_fsm_step(&keeper)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_STATE); } const char *newRole = NodeStateToString(keeper.state.assigned_role); if (outputJSON) { log_warn("This command does not support JSON output at the moment"); } fformat(stdout, "%s ➜ %s\n", oldRole, newRole); } /* * cli_do_fsm_get_nodes displays the list of nodes parsed from the nodes file * on-disk. A nodes file is only used when running with --disable-monitor. 
*/ static void cli_do_fsm_get_nodes(int argc, char **argv) { Keeper keeper = { 0 }; KeeperConfig *config = &(keeper.config); bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; *config = keeperOptions; if (!keeper_config_read_file(config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (!config->monitorDisabled) { log_fatal("The monitor is not disabled, there's no nodes file"); exit(EXIT_CODE_BAD_CONFIG); } if (!keeper_read_nodes_from_file(&keeper, &(keeper.otherNodes))) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } (void) printNodeArray(&(keeper.otherNodes)); } /* * cli_do_fsm_set_nodes parses the list of nodes parsed from the nodes file * on-disk. A JSON array of nodes objects is expected. A nodes file is only * used when running with --disable-monitor. */ static void cli_do_fsm_set_nodes(int argc, char **argv) { Keeper keeper = { 0 }; KeeperConfig *config = &(keeper.config); char nodesArrayInputFile[MAXPGPATH] = { 0 }; char *contents = NULL; long size = 0L; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; *config = keeperOptions; if (!keeper_config_read_file(config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (!config->monitorDisabled) { log_fatal("The monitor is not disabled, there's no nodes file"); exit(EXIT_CODE_BAD_CONFIG); } if (argc != 1) { commandline_print_usage(&fsm_nodes_set, stderr); exit(EXIT_CODE_BAD_ARGS); } strlcpy(nodesArrayInputFile, argv[0], sizeof(nodesArrayInputFile)); if (!read_file_if_exists(nodesArrayInputFile, &contents, &size)) { log_error("Failed to read nodes array from file \"%s\"", nodesArrayInputFile); exit(EXIT_CODE_BAD_ARGS); } /* now read keeper's state */ if (!keeper_init(&keeper, config)) { /* errors have already been logged */ 
exit(EXIT_CODE_BAD_CONFIG); } /* now parse the nodes JSON file */ if (!parseNodesArray(contents, &(keeper.otherNodes), keeper.state.current_node_id)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } /* parsing is successful, so let's copy that file to the expected path */ if (!write_file(contents, size, config->pathnames.nodes)) { log_error("Failed to write input nodes file \"%s\" to \"%s\"", nodesArrayInputFile, config->pathnames.nodes); exit(EXIT_CODE_INTERNAL_ERROR); } (void) printNodeArray(&(keeper.otherNodes)); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_do_misc.c000066400000000000000000000411761414244367200231760ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_do_misc.c * Implementation of a CLI which lets you run operations on the local * postgres server directly. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include "postgres_fe.h" #include "pqexpbuffer.h" #include "cli_common.h" #include "cli_do_root.h" #include "cli_root.h" #include "commandline.h" #include "config.h" #include "defaults.h" #include "env_utils.h" #include "file_utils.h" #include "fsm.h" #include "keeper_config.h" #include "keeper.h" #include "monitor.h" #include "monitor_config.h" #include "pgctl.h" #include "pgtuning.h" #include "primary_standby.h" #include "string_utils.h" /* * keeper_cli_create_replication_slot implements the CLI to create a replication * slot on the primary. 
*/ void keeper_cli_create_replication_slot(int argc, char **argv) { KeeperConfig config = keeperOptions; LocalPostgresServer postgres = { 0 }; bool missingPgdataOk = false; bool pgNotRunningOk = false; keeper_config_init(&config, missingPgdataOk, pgNotRunningOk); local_postgres_init(&postgres, &(config.pgSetup)); if (!primary_create_replication_slot(&postgres, config.replication_slot_name)) { exit(EXIT_CODE_PGSQL); } } /* * keeper_cli_drop_replication_slot implements the CLI to drop a replication * slot on the primary. */ void keeper_cli_drop_replication_slot(int argc, char **argv) { KeeperConfig config = keeperOptions; LocalPostgresServer postgres = { 0 }; bool missingPgdataOk = false; bool pgNotRunningOk = false; keeper_config_init(&config, missingPgdataOk, pgNotRunningOk); local_postgres_init(&postgres, &(config.pgSetup)); if (!primary_drop_replication_slot(&postgres, config.replication_slot_name)) { exit(EXIT_CODE_PGSQL); } } /* * keeper_cli_add_defaults implements the CLI to add pg_auto_failover default * settings to postgresql.conf */ void keeper_cli_add_default_settings(int argc, char **argv) { KeeperConfig config = keeperOptions; LocalPostgresServer postgres = { 0 }; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; if (!keeper_config_read_file(&config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { exit(EXIT_CODE_BAD_CONFIG); } local_postgres_init(&postgres, &(config.pgSetup)); if (!postgres_add_default_settings(&postgres, config.hostname)) { log_fatal("Failed to add the default settings for streaming replication " "used by pg_auto_failover to postgresql.conf, " "see above for details"); exit(EXIT_CODE_PGSQL); } } /* * keeper_create_monitor_user implements the CLI to add a user for the * pg_auto_failover monitor. 
*/ void keeper_cli_create_monitor_user(int argc, char **argv) { KeeperConfig config = keeperOptions; LocalPostgresServer postgres = { 0 }; bool missingPgdataOk = false; bool postgresNotRunningOk = false; char monitorHostname[_POSIX_HOST_NAME_MAX]; int monitorPort = 0; int connlimit = 1; keeper_config_init(&config, missingPgdataOk, postgresNotRunningOk); local_postgres_init(&postgres, &(config.pgSetup)); int urlLength = strlcpy(config.monitor_pguri, argv[0], MAXCONNINFO); if (urlLength >= MAXCONNINFO) { log_fatal("Monitor URL \"%s\" given in command line is %d characters, " "the maximum supported by pg_autoctl is %d", argv[0], urlLength, MAXCONNINFO - 1); exit(EXIT_CODE_BAD_ARGS); } if (!hostname_from_uri(config.monitor_pguri, monitorHostname, _POSIX_HOST_NAME_MAX, &monitorPort)) { log_fatal("Failed to determine monitor hostname"); exit(EXIT_CODE_BAD_ARGS); } if (!primary_create_user_with_hba(&postgres, PG_AUTOCTL_HEALTH_USERNAME, PG_AUTOCTL_HEALTH_PASSWORD, monitorHostname, "trust", HBA_EDIT_MINIMAL, connlimit)) { log_fatal("Failed to create the database user that the pg_auto_failover " " monitor uses for health checks, see above for details"); exit(EXIT_CODE_PGSQL); } } /* * keeper_create_replication_user implements the CLI to add a user for the * secondary. 
*/ void keeper_cli_create_replication_user(int argc, char **argv) { KeeperConfig config = keeperOptions; LocalPostgresServer postgres = { 0 }; bool missingPgdataOk = false; bool postgresNotRunningOk = false; keeper_config_init(&config, missingPgdataOk, postgresNotRunningOk); local_postgres_init(&postgres, &(config.pgSetup)); if (!primary_create_replication_user(&postgres, PG_AUTOCTL_REPLICA_USERNAME, config.replication_password)) { log_fatal("Failed to create the database user that a pg_auto_failover " " standby uses for replication, see above for details"); exit(EXIT_CODE_PGSQL); } } /* * keeper_cli_pgsetup_pg_ctl implements the CLI to find a suitable pg_ctl entry * from either the PG_CONFIG environment variable, or the PATH, then either * finding a single pg_ctl entry or falling back to a single pg_config entry * that we then use with pg_config --bindir. */ void keeper_cli_pgsetup_pg_ctl(int argc, char **argv) { bool success = true; PostgresSetup pgSetupMonitor = { 0 }; /* find first entry */ PostgresSetup pgSetupKeeper = { 0 }; /* find non ambiguous entry */ char PG_CONFIG[MAXPGPATH] = { 0 }; if (env_exists("PG_CONFIG") && get_env_copy("PG_CONFIG", PG_CONFIG, sizeof(PG_CONFIG))) { log_info("Environment variable PG_CONFIG is set to \"%s\"", PG_CONFIG); } if (config_find_pg_ctl(&pgSetupKeeper)) { log_info("`pg_autoctl create postgres` would use \"%s\" for Postgres %s", pgSetupKeeper.pg_ctl, pgSetupKeeper.pg_version); } else { log_fatal("pg_autoctl create postgres would fail to find pg_ctl"); success = false; } /* * This function EXITs when it's not happy, so we do it last: */ (void) set_first_pgctl(&pgSetupMonitor); log_info("`pg_autoctl create monitor` would use \"%s\" for Postgres %s", pgSetupMonitor.pg_ctl, pgSetupMonitor.pg_version); /* * Now check that find_extension_control_file would be happy. 
*/ if (find_extension_control_file(pgSetupMonitor.pg_ctl, PG_AUTOCTL_MONITOR_EXTENSION_NAME)) { log_info("Found the control file for extension \"%s\"", PG_AUTOCTL_MONITOR_EXTENSION_NAME); } else { log_fatal("pg_autoctl on the monitor would fail " "to find extension \"%s\"", PG_AUTOCTL_MONITOR_EXTENSION_NAME); success = false; } if (!success) { exit(EXIT_CODE_INTERNAL_ERROR); } } /* * keeper_cli_pgsetup_discover implements the CLI to discover a PostgreSQL * setup thanks to PGDATA and other environment variables. */ void keeper_cli_pgsetup_discover(int argc, char **argv) { ConfigFilePaths pathnames = { 0 }; LocalPostgresServer postgres = { 0 }; PostgresSetup *pgSetup = &(postgres.postgresSetup); if (!cli_common_pgsetup_init(&pathnames, pgSetup)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } bool missingPgdataOk = true; if (!pg_controldata(pgSetup, missingPgdataOk)) { exit(EXIT_CODE_PGCTL); } if (!IS_EMPTY_STRING_BUFFER(keeperOptions.hostname)) { fformat(stdout, "Node Name: %s\n", keeperOptions.hostname); } fprintf_pg_setup(stdout, pgSetup); } /* * keeper_cli_pgsetup_is_ready returns success when the local PostgreSQL setup * belongs to a server that is "ready". */ void keeper_cli_pgsetup_is_ready(int argc, char **argv) { ConfigFilePaths pathnames = { 0 }; LocalPostgresServer postgres = { 0 }; PostgresSetup *pgSetup = &(postgres.postgresSetup); if (!cli_common_pgsetup_init(&pathnames, pgSetup)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } log_debug("Initialized pgSetup, now calling pg_setup_is_ready()"); bool pgIsNotRunningIsOk = false; bool pgIsReady = pg_setup_is_ready(pgSetup, pgIsNotRunningIsOk); log_info("Postgres status is: \"%s\"", pmStatusToString(pgSetup->pm_status)); if (pgIsReady) { exit(EXIT_CODE_QUIT); } exit(EXIT_CODE_PGSQL); } /* * keeper_cli_discover_pg_setup implements the CLI to discover a PostgreSQL * setup thanks to PGDATA and other environment variables. 
*/ void keeper_cli_pgsetup_wait_until_ready(int argc, char **argv) { int timeout = 30; ConfigFilePaths pathnames = { 0 }; LocalPostgresServer postgres = { 0 }; PostgresSetup *pgSetup = &(postgres.postgresSetup); if (!cli_common_pgsetup_init(&pathnames, pgSetup)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } log_debug("Initialized pgSetup, now calling pg_setup_wait_until_is_ready()"); bool pgIsReady = pg_setup_wait_until_is_ready(pgSetup, timeout, LOG_INFO); log_info("Postgres status is: \"%s\"", pmStatusToString(pgSetup->pm_status)); if (pgIsReady) { exit(EXIT_CODE_QUIT); } exit(EXIT_CODE_PGSQL); } /* * keeper_cli_pgsetup_startup_logs logs the Postgres startup logs. */ void keeper_cli_pgsetup_startup_logs(int argc, char **argv) { ConfigFilePaths pathnames = { 0 }; LocalPostgresServer postgres = { 0 }; PostgresSetup *pgSetup = &(postgres.postgresSetup); if (!cli_common_pgsetup_init(&pathnames, pgSetup)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } log_debug("Initialized pgSetup, now calling pg_log_startup()"); if (!pg_log_startup(pgSetup->pgdata, LOG_INFO)) { exit(EXIT_CODE_PGCTL); } } /* * keeper_cli_pgsetup_tune compute some Postgres tuning for the local system. 
*/ void keeper_cli_pgsetup_tune(int argc, char **argv) { char config[BUFSIZE] = { 0 }; if (!pgtuning_prepare_guc_settings(postgres_tuning, config, BUFSIZE)) { exit(EXIT_CODE_INTERNAL_ERROR); } fformat(stdout, "%s\n", config); } /* * keeper_cli_init_standby initializes a standby */ void keeper_cli_init_standby(int argc, char **argv) { const bool missing_pgdata_is_ok = true; const bool pg_not_running_is_ok = true; const bool skipBaseBackup = false; KeeperConfig config = keeperOptions; LocalPostgresServer postgres = { 0 }; if (argc != 2) { commandline_print_usage(&do_standby_init, stderr); exit(EXIT_CODE_BAD_ARGS); } keeper_config_init(&config, missing_pgdata_is_ok, pg_not_running_is_ok); local_postgres_init(&postgres, &(config.pgSetup)); int hostLength = strlcpy(postgres.replicationSource.primaryNode.host, argv[0], _POSIX_HOST_NAME_MAX); if (hostLength >= _POSIX_HOST_NAME_MAX) { log_fatal("Hostname \"%s\" given in command line is %d characters, " "the maximum supported by pg_autoctl is %d", argv[0], hostLength, MAXCONNINFO - 1); exit(EXIT_CODE_BAD_ARGS); } if (!stringToInt(argv[1], &(postgres.replicationSource.primaryNode.port))) { log_fatal("Argument is not a valid port number: \"%s\"", argv[1]); exit(EXIT_CODE_BAD_ARGS); } if (!standby_init_replication_source(&postgres, NULL, /* primaryNode is done */ PG_AUTOCTL_REPLICA_USERNAME, config.replication_password, config.replication_slot_name, config.maximum_backup_rate, config.backupDirectory, NULL, /* no targetLSN */ config.pgSetup.ssl, 0)) { /* can't happen at the moment */ exit(EXIT_CODE_INTERNAL_ERROR); } if (!standby_init_database(&postgres, config.hostname, skipBaseBackup)) { log_fatal("Failed to grant access to the standby by adding " "relevant lines to pg_hba.conf for the " "standby hostname and user, see above for details"); exit(EXIT_CODE_PGSQL); } } void keeper_cli_rewind_old_primary(int argc, char **argv) { const bool missing_pgdata_is_ok = false; const bool pg_not_running_is_ok = true; KeeperConfig config = 
keeperOptions; LocalPostgresServer postgres = { 0 }; if (argc < 1 || argc > 2) { commandline_print_usage(&do_standby_rewind, stderr); exit(EXIT_CODE_BAD_ARGS); } keeper_config_init(&config, missing_pgdata_is_ok, pg_not_running_is_ok); local_postgres_init(&postgres, &(config.pgSetup)); int hostLength = strlcpy(postgres.replicationSource.primaryNode.host, argv[0], _POSIX_HOST_NAME_MAX); if (hostLength >= _POSIX_HOST_NAME_MAX) { log_fatal("Hostname \"%s\" given in command line is %d characters, " "the maximum supported by pg_autoctl is %d", argv[0], hostLength, MAXCONNINFO - 1); exit(EXIT_CODE_BAD_ARGS); } if (!stringToInt(argv[1], &(postgres.replicationSource.primaryNode.port))) { log_fatal("Argument is not a valid port number: \"%s\"", argv[1]); exit(EXIT_CODE_BAD_ARGS); } if (!standby_init_replication_source(&postgres, NULL, /* primaryNode is done */ PG_AUTOCTL_REPLICA_USERNAME, config.replication_password, config.replication_slot_name, config.maximum_backup_rate, config.backupDirectory, NULL, /* no targetLSN */ config.pgSetup.ssl, 0)) { /* can't happen at the moment */ exit(EXIT_CODE_INTERNAL_ERROR); } if (!primary_rewind_to_standby(&postgres)) { log_fatal("Failed to rewind a demoted primary to standby, " "see above for details"); exit(EXIT_CODE_PGSQL); } } void keeper_cli_maybe_do_crash_recovery(int argc, char **argv) { const bool missing_pgdata_is_ok = false; const bool pg_not_running_is_ok = true; KeeperConfig config = keeperOptions; LocalPostgresServer postgres = { 0 }; keeper_config_init(&config, missing_pgdata_is_ok, pg_not_running_is_ok); local_postgres_init(&postgres, &(config.pgSetup)); if (!standby_init_replication_source(&postgres, NULL, /* primaryNode is done */ PG_AUTOCTL_REPLICA_USERNAME, config.replication_password, config.replication_slot_name, config.maximum_backup_rate, config.backupDirectory, NULL, /* no targetLSN */ config.pgSetup.ssl, 0)) { /* can't happen at the moment */ exit(EXIT_CODE_INTERNAL_ERROR); } if 
(!postgres_maybe_do_crash_recovery(&postgres)) { log_fatal("Failed to implement postgres crash recovery, " "see above for details"); exit(EXIT_CODE_PGSQL); } } void keeper_cli_promote_standby(int argc, char **argv) { const bool missing_pgdata_is_ok = false; const bool pg_not_running_is_ok = false; KeeperConfig config = keeperOptions; LocalPostgresServer postgres = { 0 }; keeper_config_init(&config, missing_pgdata_is_ok, pg_not_running_is_ok); local_postgres_init(&postgres, &(config.pgSetup)); if (!standby_promote(&postgres)) { log_fatal("Failed to promote a standby to primary, see above for details"); exit(EXIT_CODE_PGSQL); } } /* * keeper_cli_identify_system connects to a Postgres server using the * replication protocol to run the IDENTIFY_SYSTEM command. * * The IDENTIFY_SYSTEM replication command requests the server to identify * itself. We use this command mostly to ensure that we can establish a * replication connection to the upstream/primary server, which means that the * HBA setup is good to go. * * See https://www.postgresql.org/docs/12/protocol-replication.html for more * information about the replication protocol and commands. 
*/ void keeper_cli_identify_system(int argc, char **argv) { const bool missing_pgdata_is_ok = true; const bool pg_not_running_is_ok = true; KeeperConfig config = keeperOptions; ReplicationSource replicationSource = { 0 }; if (argc != 2) { commandline_print_usage(&do_primary_identify_system, stderr); exit(EXIT_CODE_BAD_ARGS); } keeper_config_init(&config, missing_pgdata_is_ok, pg_not_running_is_ok); int hostLength = strlcpy(replicationSource.primaryNode.host, argv[0], _POSIX_HOST_NAME_MAX); if (hostLength >= _POSIX_HOST_NAME_MAX) { log_fatal("Hostname \"%s\" given in command line is %d characters, " "the maximum supported by pg_autoctl is %d", argv[0], hostLength, _POSIX_HOST_NAME_MAX - 1); exit(EXIT_CODE_BAD_ARGS); } if (!stringToInt(argv[1], &(replicationSource.primaryNode.port))) { log_fatal("Argument is not a valid port number: \"%s\"", argv[1]); exit(EXIT_CODE_BAD_ARGS); } strlcpy(replicationSource.applicationName, "pg_autoctl", MAXCONNINFO); strlcpy(replicationSource.userName, PG_AUTOCTL_REPLICA_USERNAME, NAMEDATALEN); if (!pgctl_identify_system(&replicationSource)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } IdentifySystem *system = &(replicationSource.system); fformat(stdout, "Current timeline: %d\n", system->timeline); fformat(stdout, "Current WAL LSN: %s\n", system->xlogpos); for (int index = 0; index < system->timelines.count; index++) { TimeLineHistoryEntry *entry = &(system->timelines.history[index]); char startLSN[PG_LSN_MAXLENGTH] = { 0 }; sformat(startLSN, sizeof(startLSN), "%X/%X", (uint32_t) (entry->begin >> 32), (uint32_t) entry->begin); fformat(stdout, "Timeline %d: %18s .. 
%X/%X\n", entry->tli, startLSN, (uint32_t) (entry->end >> 32), (uint32_t) entry->end); } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_do_monitor.c000066400000000000000000000456571414244367200237420ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_do_monitor.c * Implementation of a CLI which lets you interact with a pg_auto_failover * monitor. * * The monitor API only makes sense given a local pg_auto_failover keeper * setup: we need the formation and group, or the hostname and port, and at * registration time we want to create a state file, then at node_active time * we need many information obtained in both the configuration and the current * state. * * The `pg_autctl do monitor ...` commands are meant for testing the keeper use * of the monitor's API, not just the monitor API itself, so to make use of * those commands you need both a running monitor instance and a valid * configuration for a local keeper. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include "parson.h" #include "postgres_fe.h" #include "cli_common.h" #include "commandline.h" #include "defaults.h" #include "keeper_config.h" #include "keeper.h" #include "monitor.h" #include "nodestate_utils.h" #include "parsing.h" #include "pgctl.h" #include "pgsetup.h" #include "pgsql.h" #include "state.h" static void cli_do_monitor_get_primary_node(int argc, char **argv); static void cli_do_monitor_get_other_nodes(int argc, char **argv); static void cli_do_monitor_get_candidate_count(int argc, char **argv); static void cli_do_monitor_get_coordinator(int argc, char **argv); static void cli_do_monitor_register_node(int argc, char **argv); static void cli_do_monitor_node_active(int argc, char **argv); static void cli_do_monitor_version(int argc, char **argv); static void cli_do_monitor_parse_notification(int argc, char **argv); static CommandLine monitor_get_primary_command = make_command("primary", "Get the primary node from pg_auto_failover in given formation/group", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_monitor_get_primary_node); static CommandLine monitor_get_other_nodes_command = make_command("others", "Get the other nodes from the pg_auto_failover group of hostname/port", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_monitor_get_other_nodes); static CommandLine monitor_get_candidate_count_command = make_command("candidate-count", "Get the failover candidate count in the group", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_monitor_get_candidate_count); static CommandLine monitor_get_coordinator_command = make_command("coordinator", "Get the coordinator node from the pg_auto_failover formation", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_monitor_get_coordinator); static CommandLine *monitor_get_commands[] = { &monitor_get_primary_command, &monitor_get_other_nodes_command, &monitor_get_candidate_count_command, &monitor_get_coordinator_command, NULL 
}; static CommandLine monitor_get_command = make_command_set("get", "Get information from the monitor", NULL, NULL, NULL, monitor_get_commands); static CommandLine monitor_register_command = make_command("register", "Register the current node with the monitor", CLI_PGDATA_USAGE "", CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_monitor_register_node); static CommandLine monitor_node_active_command = make_command("active", "Call in the pg_auto_failover Node Active protocol", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_monitor_node_active); static CommandLine monitor_version_command = make_command("version", "Check that monitor version is " PG_AUTOCTL_EXTENSION_VERSION "; alter extension update if not", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_monitor_version); static CommandLine monitor_parse_notification_command = make_command("parse-notification", "parse a raw notification message", " ", "", NULL, cli_do_monitor_parse_notification); static CommandLine *monitor_subcommands[] = { &monitor_get_command, &monitor_register_command, &monitor_node_active_command, &monitor_version_command, &monitor_parse_notification_command, NULL }; CommandLine do_monitor_commands = make_command_set("monitor", "Query a pg_auto_failover monitor", NULL, NULL, NULL, monitor_subcommands); /* * cli_do_monitor_get_primary_node contacts the pg_auto_failover monitor and * retrieves the primary node information for given formation and group. */ static void cli_do_monitor_get_primary_node(int argc, char **argv) { KeeperConfig config = keeperOptions; Monitor monitor = { 0 }; NodeAddress primaryNode; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = false; if (!keeper_config_read_file(&config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged. 
*/ exit(EXIT_CODE_BAD_CONFIG); } if (!monitor_init(&monitor, config.monitor_pguri)) { log_fatal("Failed to contact the monitor because its URL is invalid, " "see above for details"); exit(EXIT_CODE_BAD_CONFIG); } if (!monitor_get_primary(&monitor, config.formation, config.groupId, &primaryNode)) { log_fatal("Failed to get the primary node from the monitor, " "see above for details"); exit(EXIT_CODE_MONITOR); } /* output something easy to parse by another program */ if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Object *root = json_value_get_object(js); json_object_set_string(root, "formation", config.formation); json_object_set_number(root, "groupId", (double) config.groupId); json_object_set_number(root, "nodeId", (double) primaryNode.nodeId); json_object_set_string(root, "name", primaryNode.name); json_object_set_string(root, "host", primaryNode.host); json_object_set_number(root, "port", (double) primaryNode.port); (void) cli_pprint_json(js); } else { fformat(stdout, "%s/%d %s:%d\n", config.formation, config.groupId, primaryNode.host, primaryNode.port); } } /* * cli_do_monitor_get_other_nodes contacts the pg_auto_failover monitor and * retrieves the "other node" information for given hostname and port. */ static void cli_do_monitor_get_other_nodes(int argc, char **argv) { Keeper keeper = { 0 }; KeeperConfig *config = &(keeper.config); Monitor *monitor = &(keeper.monitor); bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = false; keeper.config = keeperOptions; if (!keeper_config_read_file(config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged. 
*/ exit(EXIT_CODE_BAD_CONFIG); } /* load the state file to get the node id */ if (!keeper_init(&keeper, config)) { /* errors are logged in keeper_state_read */ exit(EXIT_CODE_BAD_STATE); } if (!monitor_init(monitor, config->monitor_pguri)) { log_fatal("Failed to contact the monitor because its URL is invalid, " "see above for details"); exit(EXIT_CODE_BAD_CONFIG); } if (outputJSON) { if (!monitor_print_other_nodes_as_json(monitor, keeper.state.current_node_id, ANY_STATE)) { log_fatal("Failed to get the other nodes from the monitor, " "see above for details"); exit(EXIT_CODE_MONITOR); } } else { if (!monitor_print_other_nodes(monitor, keeper.state.current_node_id, ANY_STATE)) { log_fatal("Failed to get the other nodes from the monitor, " "see above for details"); exit(EXIT_CODE_MONITOR); } } } /* * cli_do_monitor_get_candidate_count contacts the pg_auto_failover monitor and * retrieves the current count of failover candidate nodes. */ static void cli_do_monitor_get_candidate_count(int argc, char **argv) { Keeper keeper = { 0 }; KeeperConfig *config = &(keeper.config); Monitor *monitor = &(keeper.monitor); bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = false; keeper.config = keeperOptions; if (!keeper_config_read_file(config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged. 
*/ exit(EXIT_CODE_BAD_CONFIG); } /* load the state file to get the node id */ if (!keeper_init(&keeper, config)) { /* errors are logged in keeper_state_read */ exit(EXIT_CODE_BAD_STATE); } if (!monitor_init(monitor, config->monitor_pguri)) { log_fatal("Failed to contact the monitor because its URL is invalid, " "see above for details"); exit(EXIT_CODE_BAD_CONFIG); } int failoverCandidateCount = 0; if (!monitor_count_failover_candidates(monitor, config->formation, config->groupId, &failoverCandidateCount)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Object *root = json_value_get_object(js); json_object_set_string(root, "formation", config->formation); json_object_set_number(root, "groupId", (double) config->groupId); json_object_set_number(root, "failoverCandidateCount", (double) failoverCandidateCount); (void) cli_pprint_json(js); } else { fformat(stdout, "%d\n", failoverCandidateCount); } } /* * cli_do_monitor_get_coordinator contacts the pg_auto_failover monitor and * retrieves the "coordinator" information for given formation. */ static void cli_do_monitor_get_coordinator(int argc, char **argv) { KeeperConfig config = keeperOptions; Monitor monitor = { 0 }; CoordinatorNodeAddress coordinatorNode = { 0 }; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = false; if (!keeper_config_read_file(&config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged. 
*/ exit(EXIT_CODE_BAD_CONFIG); } if (!monitor_init(&monitor, config.monitor_pguri)) { log_fatal("Failed to contact the monitor because its URL is invalid, " "see above for details"); exit(EXIT_CODE_BAD_CONFIG); } if (!monitor_get_coordinator(&monitor, config.formation, &coordinatorNode)) { log_fatal("Failed to get the coordinator node from the monitor, " "see above for details"); exit(EXIT_CODE_MONITOR); } if (IS_EMPTY_STRING_BUFFER(coordinatorNode.node.host)) { fformat(stdout, "%s has no coordinator ready yet\n", config.formation); exit(EXIT_CODE_QUIT); } /* output something easy to parse by another program */ if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Object *root = json_value_get_object(js); json_object_set_string(root, "formation", config.formation); json_object_set_number(root, "groupId", (double) config.groupId); json_object_set_string(root, "host", coordinatorNode.node.host); json_object_set_number(root, "port", (double) coordinatorNode.node.port); (void) cli_pprint_json(js); } else { fformat(stdout, "%s %s:%d\n", config.formation, coordinatorNode.node.host, coordinatorNode.node.port); } } /* * keeper_cli_monitor_register_node registers the current node to the monitor. */ static void cli_do_monitor_register_node(int argc, char **argv) { Keeper keeper = { 0 }; KeeperConfig config = keeperOptions; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = false; if (argc != 1) { log_error("Missing argument: "); exit(EXIT_CODE_BAD_ARGS); } NodeState initialState = NodeStateFromString(argv[0]); /* * On the keeper's side we should only accept to register a local node to * the monitor in a state that matches what we have found. A SINGLE node * shoud certainly have a PostgreSQL running already, for instance. * * Then again, we are not overly protective here because we also need this * command to test the monitor's side of handling different kinds of * situations. 
*/ switch (initialState) { case NO_STATE: { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } case INIT_STATE: { missingPgdataIsOk = true; pgIsNotRunningIsOk = true; break; } case SINGLE_STATE: { missingPgdataIsOk = false; pgIsNotRunningIsOk = true; break; } case WAIT_STANDBY_STATE: { missingPgdataIsOk = false; pgIsNotRunningIsOk = false; break; } default: { /* let the monitor decide if the situation is supported or not */ missingPgdataIsOk = true; pgIsNotRunningIsOk = true; break; } } /* The processing of the --pgdata option has set keeperConfigFilePath. */ if (!keeper_config_read_file(&config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged. */ exit(EXIT_CODE_BAD_CONFIG); } if (!keeper_register_and_init(&keeper, initialState)) { exit(EXIT_CODE_BAD_STATE); } /* output something easy to parse by another program */ if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Object *root = json_value_get_object(js); json_object_set_string(root, "formation", config.formation); json_object_set_string(root, "host", config.hostname); json_object_set_number(root, "port", (double) config.pgSetup.pgport); json_object_set_number(root, "nodeId", (double) keeper.state.current_node_id); json_object_set_number(root, "groupId", (double) keeper.state.current_group); json_object_set_string(root, "assigned_role", NodeStateToString(keeper.state.assigned_role)); (void) cli_pprint_json(js); } else { fformat(stdout, "%s/%d %s:%d %d:%d %s\n", config.formation, config.groupId, config.hostname, config.pgSetup.pgport, keeper.state.current_node_id, keeper.state.current_group, NodeStateToString(keeper.state.assigned_role)); } } /* * keeper_cli_monitor_node_active contacts the monitor with the current state * of the keeper and get an assigned state from there. 
*/ static void cli_do_monitor_node_active(int argc, char **argv) { Keeper keeper = { 0 }; KeeperConfig config = keeperOptions; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = false; MonitorAssignedState assignedState = { 0 }; /* The processing of the --pgdata option has set keeperConfigFilePath. */ if (!keeper_config_read_file(&config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged. */ exit(EXIT_CODE_BAD_CONFIG); } if (!keeper_init(&keeper, &config)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } /* * Update our in-memory representation of PostgreSQL state, ignore errors * as in the main loop: we continue with default WAL lag of -1 and an empty * string for pgsrSyncState. */ (void) keeper_update_pg_state(&keeper, LOG_WARN); if (!monitor_node_active(&keeper.monitor, config.formation, keeper.state.current_node_id, keeper.state.current_group, keeper.state.current_role, keeper.postgres.pgIsRunning, keeper.postgres.postgresSetup.control.timeline_id, keeper.postgres.currentLSN, keeper.postgres.pgsrSyncState, &assignedState)) { log_fatal("Failed to get the goal state from the node with the monitor, " "see above for details"); exit(EXIT_CODE_PGSQL); } if (!keeper_update_state(&keeper, assignedState.nodeId, assignedState.groupId, assignedState.state, true)) { /* log and error but continue, giving more information to the user */ log_error("Failed to update keepers's state"); } /* output something easy to parse by another program */ if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Object *root = json_value_get_object(js); json_object_set_string(root, "formation", config.formation); json_object_set_string(root, "host", config.hostname); json_object_set_number(root, "port", (double) config.pgSetup.pgport); json_object_set_number(root, "nodeId", (double) assignedState.nodeId); json_object_set_number(root, "groupId", (double) assignedState.groupId); 
json_object_set_string(root, "assigned_role", NodeStateToString(assignedState.state)); (void) cli_pprint_json(js); } else { fformat(stdout, "%s/%d %s:%d %" PRId64 ":%d %s\n", config.formation, config.groupId, config.hostname, config.pgSetup.pgport, assignedState.nodeId, assignedState.groupId, NodeStateToString(assignedState.state)); } } /* * cli_monitor_version ensures that the version of the monitor is the one that * is expected by pg_autoctl too. When that's not the case, the command issues * an ALTER EXTENSION ... UPDATE TO ... to ensure that the monitor is now * running the expected version number. */ static void cli_do_monitor_version(int argc, char **argv) { KeeperConfig config = keeperOptions; Monitor monitor = { 0 }; MonitorExtensionVersion version = { 0 }; LocalPostgresServer postgres = { 0 }; if (!monitor_init_from_pgsetup(&monitor, &config.pgSetup)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } (void) local_postgres_init(&postgres, &(monitor.config.pgSetup)); /* Check version compatibility */ if (!monitor_ensure_extension_version(&monitor, &postgres, &version)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } if (outputJSON) { log_warn("This command does not support JSON output at the moment"); } fformat(stdout, "%s\n", version.installedVersion); } /* * cli_do_monitor_parse_notification parses a raw notification message as given * by the monitor LISTEN/NOTIFY protocol on the state channel, such as: * * { * "type": "state", "formation": "default", "groupId": 0, "nodeId": 1, * "name": "node_1", "host": "localhost", "port": 5001, * "reportedState": "maintenance", "goalState": "maintenance" * } */ static void cli_do_monitor_parse_notification(int argc, char **argv) { CurrentNodeState nodeState = { 0 }; JSON_Value *js = json_value_init_object(); JSON_Object *root = json_value_get_object(js); if (argc != 1) { commandline_print_usage(&monitor_parse_notification_command, stderr); exit(EXIT_CODE_BAD_ARGS); } /* errors are 
logged by parse_state_notification_message */ if (!parse_state_notification_message(&nodeState, argv[0])) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } /* log the notification just parsed */ (void) nodestate_log(&nodeState, LOG_INFO, 0); json_object_set_string(root, "name", nodeState.node.name); json_object_set_string(root, "hostname", nodeState.node.host); json_object_set_number(root, "port", (double) nodeState.node.port); json_object_set_string(root, "formationid", nodeState.formation); json_object_set_string(root, "reportedState", NodeStateToString(nodeState.reportedState)); json_object_set_string(root, "goalState", NodeStateToString(nodeState.goalState)); (void) cli_pprint_json(js); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_do_root.c000066400000000000000000000450211414244367200232170ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_do_root.c * Implementation of a CLI which lets you run operations on the local * postgres server directly. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include "postgres_fe.h" #include "cli_common.h" #include "cli_do_root.h" #include "commandline.h" #include "config.h" #include "defaults.h" #include "file_utils.h" #include "fsm.h" #include "keeper_config.h" #include "keeper.h" #include "monitor.h" #include "monitor_config.h" #include "pgctl.h" #include "primary_standby.h" CommandLine do_primary_adduser_monitor = make_command("monitor", "add a local user for queries from the monitor", "", KEEPER_CLI_WORKER_SETUP_OPTIONS, keeper_cli_keeper_setup_getopts, keeper_cli_create_monitor_user); CommandLine do_primary_adduser_replica = make_command("replica", "add a local user with replication privileges", "", KEEPER_CLI_WORKER_SETUP_OPTIONS, keeper_cli_keeper_setup_getopts, keeper_cli_create_replication_user); CommandLine *do_primary_adduser_subcommands[] = { &do_primary_adduser_monitor, &do_primary_adduser_replica, NULL }; CommandLine do_primary_adduser = make_command_set("adduser", "Create users on primary", NULL, NULL, NULL, do_primary_adduser_subcommands); CommandLine do_primary_slot_create = make_command("create", "Create a replication slot on the primary server", "", KEEPER_CLI_WORKER_SETUP_OPTIONS, keeper_cli_keeper_setup_getopts, keeper_cli_create_replication_slot); CommandLine do_primary_slot_drop = make_command("drop", "Drop a replication slot on the primary server", "", KEEPER_CLI_WORKER_SETUP_OPTIONS, keeper_cli_keeper_setup_getopts, keeper_cli_drop_replication_slot); CommandLine *do_primary_slot[] = { &do_primary_slot_create, &do_primary_slot_drop, NULL }; CommandLine do_primary_slot_ = make_command_set("slot", "Manage replication slot on the primary server", NULL, NULL, NULL, do_primary_slot); CommandLine do_primary_defaults = make_command("defaults", "Add default settings to postgresql.conf", "", KEEPER_CLI_WORKER_SETUP_OPTIONS, keeper_cli_keeper_setup_getopts, keeper_cli_add_default_settings); CommandLine do_primary_identify_system = make_command("identify", "Run the 
IDENTIFY_SYSTEM replication command on given host", " host port", KEEPER_CLI_WORKER_SETUP_OPTIONS, keeper_cli_keeper_setup_getopts, keeper_cli_identify_system); CommandLine *do_primary[] = { &do_primary_slot_, &do_primary_adduser, &do_primary_defaults, &do_primary_identify_system, NULL }; CommandLine do_primary_ = make_command_set("primary", "Manage a PostgreSQL primary server", NULL, NULL, NULL, do_primary); CommandLine do_standby_init = make_command("init", "Initialize the standby server using pg_basebackup", "[option ...] \n", KEEPER_CLI_WORKER_SETUP_OPTIONS, keeper_cli_keeper_setup_getopts, keeper_cli_init_standby); CommandLine do_standby_rewind = make_command("rewind", "Rewind a demoted primary server using pg_rewind", " ", KEEPER_CLI_WORKER_SETUP_OPTIONS, keeper_cli_keeper_setup_getopts, keeper_cli_rewind_old_primary); CommandLine do_standby_crash_recovery = make_command("crash-recovery", "Setup postgres for crash-recovery and start postgres", " [ --pgdata ... ]", KEEPER_CLI_WORKER_SETUP_OPTIONS, keeper_cli_keeper_setup_getopts, keeper_cli_maybe_do_crash_recovery); CommandLine do_standby_promote = make_command("promote", "Promote a standby server to become writable", "", KEEPER_CLI_WORKER_SETUP_OPTIONS, keeper_cli_keeper_setup_getopts, keeper_cli_promote_standby); CommandLine *do_standby[] = { &do_standby_init, &do_standby_rewind, &do_standby_crash_recovery, &do_standby_promote, NULL }; CommandLine do_standby_ = make_command_set("standby", "Manage a PostgreSQL standby server", NULL, NULL, NULL, do_standby); CommandLine do_pgsetup_pg_ctl = make_command("pg_ctl", "Find a non-ambiguous pg_ctl program and Postgres version", "[option ...]", KEEPER_CLI_WORKER_SETUP_OPTIONS, keeper_cli_keeper_setup_getopts, keeper_cli_pgsetup_pg_ctl); CommandLine do_pgsetup_discover = make_command("discover", "Discover local PostgreSQL instance, if any", "[option ...]", KEEPER_CLI_WORKER_SETUP_OPTIONS, keeper_cli_keeper_setup_getopts, keeper_cli_pgsetup_discover); CommandLine 
do_pgsetup_is_ready = make_command("ready", "Return true is the local Postgres server is ready", "[option ...]", KEEPER_CLI_WORKER_SETUP_OPTIONS, keeper_cli_keeper_setup_getopts, keeper_cli_pgsetup_is_ready); CommandLine do_pgsetup_wait_until_ready = make_command("wait", "Wait until the local Postgres server is ready", "[option ...]", KEEPER_CLI_WORKER_SETUP_OPTIONS, keeper_cli_keeper_setup_getopts, keeper_cli_pgsetup_wait_until_ready); CommandLine do_pgsetup_startup_logs = make_command("logs", "Outputs the Postgres startup logs", "[option ...]", KEEPER_CLI_WORKER_SETUP_OPTIONS, keeper_cli_keeper_setup_getopts, keeper_cli_pgsetup_startup_logs); CommandLine do_pgsetup_tune = make_command("tune", "Compute and log some Postgres tuning options", "[option ...]", KEEPER_CLI_WORKER_SETUP_OPTIONS, keeper_cli_keeper_setup_getopts, keeper_cli_pgsetup_tune); CommandLine *do_pgsetup[] = { &do_pgsetup_pg_ctl, &do_pgsetup_discover, &do_pgsetup_is_ready, &do_pgsetup_wait_until_ready, &do_pgsetup_startup_logs, &do_pgsetup_tune, NULL }; CommandLine do_pgsetup_commands = make_command_set("pgsetup", "Manage a local Postgres setup", NULL, NULL, NULL, do_pgsetup); CommandLine do_tmux_script = make_command("script", "Produce a tmux script for a demo or a test case (debug only)", "[option ...]", " --root path where to create a cluster\n" " --first-pgport first Postgres port to use (5500)\n" " --nodes number of Postgres nodes to create (2)\n" " --async-nodes number of async nodes within nodes (0)\n" " --node-priorities list of nodes priorities (50)\n" " --sync-standbys number-sync-standbys to set (0 or 1)\n" " --skip-pg-hba use --skip-pg-hba when creating nodes\n" " --layout tmux layout to use (even-vertical)\n" " --binpath path to the pg_autoctl binary (current binary path)", cli_do_tmux_script_getopts, cli_do_tmux_script); CommandLine do_tmux_session = make_command("session", "Run a tmux session for a demo or a test case", "[option ...]", " --root path where to create a cluster\n" " 
--first-pgport first Postgres port to use (5500)\n" " --nodes number of Postgres nodes to create (2)\n" " --async-nodes number of async nodes within nodes (0)\n" " --node-priorities list of nodes priorities (50)\n" " --sync-standbys number-sync-standbys to set (0 or 1)\n" " --skip-pg-hba use --skip-pg-hba when creating nodes\n" " --layout tmux layout to use (even-vertical)\n" " --binpath path to the pg_autoctl binary (current binary path)", cli_do_tmux_script_getopts, cli_do_tmux_session); CommandLine do_tmux_stop = make_command("stop", "Stop pg_autoctl processes that belong to a tmux session ", "[option ...]", " --root path where to create a cluster\n" " --first-pgport first Postgres port to use (5500)\n" " --nodes number of Postgres nodes to create (2)", cli_do_tmux_script_getopts, cli_do_tmux_stop); CommandLine do_tmux_clean = make_command("clean", "Clean-up a tmux session processes and root dir", "[option ...]", " --root path where to create a cluster\n" " --first-pgport first Postgres port to use (5500)\n" " --nodes number of Postgres nodes to create (2)", cli_do_tmux_script_getopts, cli_do_tmux_clean); CommandLine do_tmux_wait = make_command("wait", "Wait until a given node has been registered on the monitor", "[option ...] 
nodename [ targetState ]", " --root path where to create a cluster\n" " --first-pgport first Postgres port to use (5500)\n" " --nodes number of Postgres nodes to create (2)\n" " --async-nodes number of async nodes within nodes (0)\n" " --node-priorities list of nodes priorities (50)\n" " --sync-standbys number-sync-standbys to set (0 or 1)\n" " --skip-pg-hba use --skip-pg-hba when creating nodes\n" " --layout tmux layout to use (even-vertical)", cli_do_tmux_script_getopts, cli_do_tmux_wait); CommandLine *do_tmux[] = { &do_tmux_script, &do_tmux_session, &do_tmux_stop, &do_tmux_wait, &do_tmux_clean, NULL }; CommandLine do_tmux_commands = make_command_set("tmux", "Set of facilities to handle tmux interactive sessions", NULL, NULL, NULL, do_tmux); /* * pg_autoctl do azure ... * * Set of commands to prepare and control a full QA environment running in * Azure VMs, provisionned either from our packages or from local source code. */ CommandLine do_azure_provision_region = make_command("region", "Provision an azure region: resource group, network, VMs", "[option ...]", " --prefix azure group name prefix (ha-demo)\n" " --region name to use for referencing the region\n" " --location azure location where to create a resource group\n" " --monitor should we create a monitor in the region (false)\n" " --nodes number of Postgres nodes to create (2)\n" " --script output a shell script instead of creating resources\n", cli_do_azure_getopts, cli_do_azure_create_region); CommandLine do_azure_provision_nodes = make_command("nodes", "Provision our pre-created VM with pg_autoctl Postgres nodes", "[option ...]", " --prefix azure group name prefix (ha-demo)\n" " --region name to use for referencing the region\n" " --monitor should we create a monitor in the region (false)\n" " --nodes number of Postgres nodes to create (2)\n" " --script output a shell script instead of creating resources\n", cli_do_azure_getopts, cli_do_azure_create_nodes); CommandLine *do_azure_provision[] = { 
&do_azure_provision_region, &do_azure_provision_nodes, NULL }; CommandLine do_azure_provision_commands = make_command_set("provision", "provision azure resources for a pg_auto_failover demo", NULL, NULL, NULL, do_azure_provision); CommandLine do_azure_create = make_command("create", "Create an azure QA environment", "[option ...]", " --prefix azure group name prefix (ha-demo)\n" " --region name to use for referencing the region\n" " --location azure location to use for the resources\n" " --nodes number of Postgres nodes to create (2)\n" " --script output a script instead of creating resources\n" " --no-monitor do not create the pg_autoctl monitor node\n" " --no-app do not create the application node\n" " --cidr use the 10.CIDR.CIDR.0/24 subnet (11)\n" " --from-source provision pg_auto_failover from sources\n", cli_do_azure_getopts, cli_do_azure_create_environment); CommandLine do_azure_drop = make_command("drop", "Drop an azure QA environment: resource group, network, VMs", "[option ...]", " --prefix azure group name prefix (ha-demo)\n" " --region name to use for referencing the region\n" " --location azure location where to create a resource group\n" " --monitor should we create a monitor in the region (false)\n" " --nodes number of Postgres nodes to create (2)\n" " --script output a shell script instead of creating resources\n", cli_do_azure_getopts, cli_do_azure_drop_region); CommandLine do_azure_deploy = make_command("deploy", "Deploy a pg_autoctl VMs, given by name", "[option ...] 
vmName", "", cli_do_azure_getopts, cli_do_azure_deploy); CommandLine do_azure_show_ips = make_command("ips", "Show public and private IP addresses for selected VMs", "[option ...]", " --prefix azure group name prefix (ha-demo)\n" " --region name to use for referencing the region\n", cli_do_azure_getopts, cli_do_azure_show_ips); CommandLine do_azure_show_state = make_command("state", "Connect to the monitor node to show the current state", "[option ...]", " --prefix azure group name prefix (ha-demo)\n" " --region name to use for referencing the region\n" " --watch run the command again every 0.2s\n", cli_do_azure_getopts, cli_do_azure_show_state); CommandLine *do_azure_show[] = { &do_azure_show_ips, &do_azure_show_state, NULL }; CommandLine do_azure_show_commands = make_command_set("show", "show azure resources for a pg_auto_failover demo", NULL, NULL, NULL, do_azure_show); CommandLine do_azure_ls = make_command("ls", "List resources in a given azure region", "[option ...]", " --prefix azure group name prefix (ha-demo)\n" " --region name to use for referencing the region\n", cli_do_azure_getopts, cli_do_azure_ls); CommandLine do_azure_ssh = make_command("ssh", "Runs ssh -l ha-admin for a given VM name", "[option ...]", " --prefix azure group name prefix (ha-demo)\n" " --region name to use for referencing the region\n", cli_do_azure_getopts, cli_do_azure_ssh); CommandLine do_azure_sync = make_command("sync", "Rsync pg_auto_failover sources on all the target region VMs", "[option ...]", " --prefix azure group name prefix (ha-demo)\n" " --region region to use for referencing the region\n" " --monitor should we create a monitor in the region (false)\n" " --nodes number of Postgres nodes to create (2)\n", cli_do_azure_getopts, cli_do_azure_rsync); CommandLine do_azure_tmux_session = make_command("session", "Create or attach a tmux session for the created Azure VMs", "[option ...]", " --prefix azure group name prefix (ha-demo)\n" " --region region to use for referencing 
the region\n" " --monitor should we create a monitor in the region (false)\n" " --nodes number of Postgres nodes to create (2)\n", cli_do_azure_getopts, cli_do_azure_tmux_session); CommandLine do_azure_tmux_kill = make_command("kill", "Kill an existing tmux session for Azure VMs", "[option ...]", " --prefix azure group name prefix (ha-demo)\n" " --region region to use for referencing the region\n" " --monitor should we create a monitor in the region (false)\n" " --nodes number of Postgres nodes to create (2)\n", cli_do_azure_getopts, cli_do_azure_tmux_kill); CommandLine *do_azure_tmux[] = { &do_azure_tmux_session, &do_azure_tmux_kill, NULL }; CommandLine do_azure_tmux_commands = make_command_set("tmux", "Run a tmux session with an Azure setup for QA/testing", NULL, NULL, NULL, do_azure_tmux); CommandLine *do_azure[] = { &do_azure_provision_commands, &do_azure_tmux_commands, &do_azure_show_commands, &do_azure_deploy, &do_azure_create, &do_azure_drop, &do_azure_ls, &do_azure_ssh, &do_azure_sync, NULL }; CommandLine do_azure_commands = make_command_set("azure", "Manage a set of Azure resources for a pg_auto_failover demo", NULL, NULL, NULL, do_azure); CommandLine *do_subcommands[] = { &do_monitor_commands, &do_fsm_commands, &do_primary_, &do_standby_, &do_show_commands, &do_pgsetup_commands, &do_service_postgres_ctl_commands, &do_service_commands, &do_tmux_commands, &do_azure_commands, &do_demo_commands, NULL }; CommandLine do_commands = make_command_set("do", "Internal commands and internal QA tooling", NULL, NULL, NULL, do_subcommands); /* * keeper_cli_keeper_setup_getopts parses command line options and set the * global variable keeperOptions from them, without doing any check. 
*/
int
keeper_cli_keeper_setup_getopts(int argc, char **argv)
{
	KeeperConfig options = { 0 };

	/* track which --ssl-* flavour of options the user picked, if any */
	SSLCommandLineOptions sslCommandLineOptions = SSL_CLI_UNKNOWN;

	/*
	 * Long options table; entries with a non-NULL flag pointer (&ssl_flag)
	 * store their SSL_*_FLAG value into ssl_flag instead of returning a
	 * short-option character.
	 *
	 * Note: a duplicate { "help", no_argument, NULL, 0 } entry used to follow
	 * the "run" entry; getopt_long(3) always matches the first "help" entry
	 * (the one mapped to 'h'), so the duplicate was dead code and has been
	 * removed.
	 */
	static struct option long_options[] = {
		{ "pgctl", required_argument, NULL, 'C' },
		{ "pgdata", required_argument, NULL, 'D' },
		{ "pghost", required_argument, NULL, 'H' },
		{ "pgport", required_argument, NULL, 'p' },
		{ "listen", required_argument, NULL, 'l' },
		{ "username", required_argument, NULL, 'U' },
		{ "auth", required_argument, NULL, 'A' },
		{ "skip-pg-hba", no_argument, NULL, 'S' },
		{ "dbname", required_argument, NULL, 'd' },
		{ "hostname", required_argument, NULL, 'n' },
		{ "formation", required_argument, NULL, 'f' },
		{ "monitor", required_argument, NULL, 'm' },
		{ "disable-monitor", no_argument, NULL, 'M' },
		{ "version", no_argument, NULL, 'V' },
		{ "verbose", no_argument, NULL, 'v' },
		{ "quiet", no_argument, NULL, 'q' },
		{ "help", no_argument, NULL, 'h' },
		{ "candidate-priority", required_argument, NULL, 'P' },
		{ "replication-quorum", required_argument, NULL, 'r' },
		{ "run", no_argument, NULL, 'x' },
		{ "no-ssl", no_argument, NULL, 'N' },
		{ "ssl-self-signed", no_argument, NULL, 's' },
		{ "ssl-mode", required_argument, &ssl_flag, SSL_MODE_FLAG },
		{ "ssl-ca-file", required_argument, &ssl_flag, SSL_CA_FILE_FLAG },
		{ "ssl-crl-file", required_argument, &ssl_flag, SSL_CRL_FILE_FLAG },
		{ "server-cert", required_argument, &ssl_flag, SSL_SERVER_CRT_FLAG },
		{ "server-key", required_argument, &ssl_flag, SSL_SERVER_KEY_FLAG },
		{ NULL, 0, NULL, 0 }
	};

	/*
	 * The only command lines that are using keeper_cli_getopt_pgdata are
	 * terminal ones: they don't accept subcommands. In that case our option
	 * parsing can happen in any order and we don't need getopt_long to behave
	 * in a POSIXLY_CORRECT way.
	 *
	 * The unsetenv() call allows getopt_long() to reorder arguments for us.
*/ unsetenv("POSIXLY_CORRECT"); int optind = cli_common_keeper_getopts(argc, argv, long_options, "C:D:H:p:l:U:A:SLd:n:f:m:MRVvqhP:r:xsN", &options, &sslCommandLineOptions); /* publish our option parsing in the global variable */ keeperOptions = options; return optind; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_do_root.h000066400000000000000000000106411414244367200232240ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_do_root.h * Implementation of a CLI which lets you run individual keeper routines * directly * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef CLI_DO_ROOT_H #define CLI_DO_ROOT_H #include "commandline.h" /* src/bin/pg_autoctl/cli_do_fsm.c */ extern CommandLine do_fsm_commands; /* src/bin/pg_autoctl/cli_do_monitor.c */ extern CommandLine do_monitor_commands; /* src/bin/pg_autoctl/cli_do_service.c */ extern CommandLine do_service_commands; extern CommandLine do_service_postgres_ctl_commands; /* src/bin/pg_autoctl/cli_do_show.c */ extern CommandLine do_show_commands; extern CommandLine do_pgsetup_commands; extern CommandLine do_service_postgres_ctl_commands; extern CommandLine do_service_commands; /* src/bin/pg_autoctl/cli_do_demo.c */ extern CommandLine do_demo_commands; /* src/bin/pg_autoctl/cli_do_root.c */ extern CommandLine do_primary_adduser; extern CommandLine *do_primary_adduser_subcommands[]; extern CommandLine do_primary_adduser_monitor; extern CommandLine do_primary_adduser_replica; extern CommandLine do_primary_syncrep_; extern CommandLine *do_primary_syncrep[]; extern CommandLine do_primary_syncrep_enable; extern CommandLine do_primary_syncrep_disable; extern CommandLine do_primary_slot_; extern CommandLine *do_primary_slot[]; extern CommandLine do_primary_slot_create; extern CommandLine do_primary_slot_drop; extern CommandLine do_primary_hba; extern CommandLine *do_primary_hba_commands[]; extern CommandLine do_primary_hba_setup; extern CommandLine 
do_primary_defaults; extern CommandLine do_primary_identify_system; extern CommandLine do_primary_; extern CommandLine *do_primary[]; extern CommandLine do_standby_; extern CommandLine *do_standby[]; extern CommandLine do_standby_init; extern CommandLine do_standby_rewind; extern CommandLine do_standby_promote; extern CommandLine do_discover; extern CommandLine do_tmux_commands; /* src/bin/pg_autoctl/cli_do_azure.c */ extern CommandLine do_azure_ssh; extern CommandLine do_commands; extern CommandLine *do_subcommands[]; int keeper_cli_keeper_setup_getopts(int argc, char **argv); /* src/bin/pg_autoctl/cli_do_misc.c */ void keeper_cli_create_replication_slot(int argc, char **argv); void keeper_cli_drop_replication_slot(int argc, char **argv); void keeper_cli_enable_synchronous_replication(int argc, char **argv); void keeper_cli_disable_synchronous_replication(int argc, char **argv); void keeper_cli_pgsetup_pg_ctl(int argc, char **argv); void keeper_cli_pgsetup_discover(int argc, char **argv); void keeper_cli_pgsetup_is_ready(int argc, char **argv); void keeper_cli_pgsetup_wait_until_ready(int argc, char **argv); void keeper_cli_pgsetup_startup_logs(int argc, char **argv); void keeper_cli_pgsetup_tune(int argc, char **argv); void keeper_cli_add_default_settings(int argc, char **argv); void keeper_cli_create_monitor_user(int argc, char **argv); void keeper_cli_create_replication_user(int argc, char **argv); void keeper_cli_add_standby_to_hba(int argc, char **argv); void keeper_cli_init_standby(int argc, char **argv); void keeper_cli_rewind_old_primary(int argc, char **argv); void keeper_cli_maybe_do_crash_recovery(int argc, char **argv); void keeper_cli_promote_standby(int argc, char **argv); void keeper_cli_receiwal(int argc, char **argv); void keeper_cli_identify_system(int argc, char **argv); /* src/bin/pg_autoctl/cli_do_tmux.c */ int cli_do_tmux_script_getopts(int argc, char **argv); void cli_do_tmux_script(int argc, char **argv); void cli_do_tmux_session(int argc, 
char **argv); void cli_do_tmux_stop(int argc, char **argv); void cli_do_tmux_clean(int argc, char **argv); void cli_do_tmux_wait(int argc, char **argv); /* src/bin/pg_autoctl/cli_do_azure.c */ int cli_do_azure_getopts(int argc, char **argv); void cli_do_azure_create_environment(int argc, char **argv); void cli_do_azure_create_region(int argc, char **argv); void cli_do_azure_drop_region(int argc, char **argv); void cli_do_azure_deploy(int argc, char **argv); void cli_do_azure_create_nodes(int argc, char **argv); void cli_do_azure_ls(int argc, char **argv); void cli_do_azure_show_ips(int argc, char **argv); void cli_do_azure_ssh(int argc, char **argv); void cli_do_azure_rsync(int argc, char **argv); void cli_do_azure_show_state(int argc, char **argv); void cli_do_azure_tmux_session(int argc, char **argv); void cli_do_azure_tmux_kill(int argc, char **argv); #endif /* CLI_DO_ROOT_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_do_service.c000066400000000000000000000357571414244367200237130ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_do_service.c * Implementation of a CLI for controlling the pg_autoctl service. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include #include "postgres_fe.h" #include "cli_common.h" #include "commandline.h" #include "defaults.h" #include "keeper_config.h" #include "keeper.h" #include "monitor.h" #include "monitor_config.h" #include "pidfile.h" #include "service_keeper.h" #include "service_monitor.h" #include "service_postgres_ctl.h" #include "signals.h" #include "supervisor.h" static void cli_do_service_postgres(int argc, char **argv); static void cli_do_service_pgcontroller(int argc, char **argv); static void cli_do_service_postgresctl_on(int argc, char **argv); static void cli_do_service_postgresctl_off(int argc, char **argv); static void cli_do_service_getpid(const char *serviceName); static void cli_do_service_getpid_postgres(int argc, char **argv); static void cli_do_service_getpid_listener(int argc, char **argv); static void cli_do_service_getpid_node_active(int argc, char **argv); static void cli_do_service_restart(const char *serviceName); static void cli_do_service_restart_postgres(int argc, char **argv); static void cli_do_service_restart_listener(int argc, char **argv); static void cli_do_service_restart_node_active(int argc, char **argv); static void cli_do_service_monitor_listener(int argc, char **argv); static void cli_do_service_node_active(int argc, char **argv); CommandLine service_pgcontroller = make_command("pgcontroller", "pg_autoctl supervised postgres controller", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_service_pgcontroller); CommandLine service_postgres = make_command("postgres", "pg_autoctl service that start/stop postgres when asked", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_service_postgres); CommandLine service_monitor_listener = make_command("listener", "pg_autoctl service that listens to the monitor notifications", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_service_monitor_listener); CommandLine service_node_active = make_command("node-active", 
"pg_autoctl service that implements the node active protocol", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_service_node_active); CommandLine service_getpid_postgres = make_command("postgres", "Get the pid of the pg_autoctl postgres controller service", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_service_getpid_postgres); CommandLine service_getpid_listener = make_command("listener", "Get the pid of the pg_autoctl monitor listener service", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_service_getpid_listener); CommandLine service_getpid_node_active = make_command("node-active", "Get the pid of the pg_autoctl keeper node-active service", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_service_getpid_node_active); static CommandLine *service_getpid[] = { &service_getpid_postgres, &service_getpid_listener, &service_getpid_node_active, NULL }; CommandLine do_service_getpid_commands = make_command_set("getpid", "Get the pid of pg_autoctl sub-processes (services)", NULL, NULL, NULL, service_getpid); CommandLine service_restart_postgres = make_command("postgres", "Restart the pg_autoctl postgres controller service", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_service_restart_postgres); CommandLine service_restart_listener = make_command("listener", "Restart the pg_autoctl monitor listener service", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_service_restart_listener); CommandLine service_restart_node_active = make_command("node-active", "Restart the pg_autoctl keeper node-active service", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_do_service_restart_node_active); static CommandLine *service_restart[] = { &service_restart_postgres, &service_restart_listener, &service_restart_node_active, NULL }; CommandLine do_service_restart_commands = make_command_set("restart", "Restart pg_autoctl sub-processes (services)", NULL, NULL, NULL, service_restart); 
/* all the "pg_autoctl do service" sub-commands, NULL terminated */
static CommandLine *service[] = {
	&do_service_getpid_commands,
	&do_service_restart_commands,
	&service_pgcontroller,
	&service_postgres,
	&service_monitor_listener,
	&service_node_active,
	NULL
};

CommandLine do_service_commands =
	make_command_set("service",
					 "Run pg_autoctl sub-processes (services)",
					 NULL, NULL, NULL, service);

CommandLine service_postgres_ctl_on =
	make_command("on",
				 "Signal pg_autoctl postgres service to ensure Postgres is running",
				 CLI_PGDATA_USAGE,
				 CLI_PGDATA_OPTION,
				 cli_getopt_pgdata,
				 cli_do_service_postgresctl_on);

CommandLine service_postgres_ctl_off =
	make_command("off",
				 "Signal pg_autoctl postgres service to ensure Postgres is stopped",
				 CLI_PGDATA_USAGE,
				 CLI_PGDATA_OPTION,
				 cli_getopt_pgdata,
				 cli_do_service_postgresctl_off);

/* "pg_autoctl do service pgctl" sub-commands, NULL terminated */
static CommandLine *pgctl[] = {
	&service_postgres_ctl_on,
	&service_postgres_ctl_off,
	NULL
};

CommandLine do_service_postgres_ctl_commands =
	make_command_set("pgctl",
					 "Signal the pg_autoctl postgres service",
					 NULL, NULL, NULL, pgctl);


/*
 * cli_do_service_getpid retrieves the PID of a service running within the
 * pg_autoctl supervision, by reading the supervisor's pidfile, and prints it
 * on stdout.
 */
static void
cli_do_service_getpid(const char *serviceName)
{
	ConfigFilePaths pathnames = { 0 };
	LocalPostgresServer postgres = { 0 };
	pid_t pid = -1;

	if (!cli_common_pgsetup_init(&pathnames, &(postgres.postgresSetup)))
	{
		/* errors have already been logged */
		exit(EXIT_CODE_BAD_CONFIG);
	}

	if (!supervisor_find_service_pid(pathnames.pid, serviceName, &pid))
	{
		log_fatal("Failed to find pid for service name \"%s\"", serviceName);
		exit(EXIT_CODE_INTERNAL_ERROR);
	}

	fformat(stdout, "%d\n", pid);
}


/*
 * cli_do_service_getpid_postgres gets the postgres service pid.
 */
static void
cli_do_service_getpid_postgres(int argc, char **argv)
{
	(void) cli_do_service_getpid(SERVICE_NAME_POSTGRES);
}


/*
 * cli_do_service_getpid_listener gets the monitor listener service pid.
*/ static void cli_do_service_getpid_listener(int argc, char **argv) { (void) cli_do_service_getpid(SERVICE_NAME_MONITOR); } /* * cli_do_service_getpid_node_active gets the postgres service pid. */ static void cli_do_service_getpid_node_active(int argc, char **argv) { (void) cli_do_service_getpid(SERVICE_NAME_KEEPER); } /* * cli_do_service_restart sends the TERM signal to the given serviceName, which * is known to have the restart policy RP_PERMANENT (that's hard-coded). As a * consequence the supervisor will restart the service. */ static void cli_do_service_restart(const char *serviceName) { ConfigFilePaths pathnames = { 0 }; LocalPostgresServer postgres = { 0 }; pid_t pid = -1; pid_t newPid = -1; if (!cli_common_pgsetup_init(&pathnames, &(postgres.postgresSetup))) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (!supervisor_find_service_pid(pathnames.pid, serviceName, &pid)) { log_fatal("Failed to find pid for service name \"%s\"", serviceName); exit(EXIT_CODE_INTERNAL_ERROR); } log_info("Sending the TERM signal to service \"%s\" with pid %d", serviceName, pid); if (kill(pid, SIGTERM) != 0) { log_error("Failed to send SIGHUP to the pg_autoctl pid %d: %m", pid); exit(EXIT_CODE_INTERNAL_ERROR); } /* loop until we have a new pid */ do { if (!supervisor_find_service_pid(pathnames.pid, serviceName, &newPid)) { log_fatal("Failed to find pid for service name \"%s\"", serviceName); exit(EXIT_CODE_INTERNAL_ERROR); } if (newPid == pid) { log_trace("pidfile \"%s\" still contains pid %d for service \"%s\"", pathnames.pid, newPid, serviceName); } pg_usleep(100 * 1000); /* retry in 100 ms */ } while (newPid == pid); log_info("Service \"%s\" has been restarted with pid %d", serviceName, newPid); fformat(stdout, "%d\n", newPid); } /* * cli_do_service_restart_postgres sends the TERM signal to the postgres * service, which is known to have the restart policy RP_PERMANENT (that's * hard-coded). As a consequence the supervisor will restart the service. 
*/ static void cli_do_service_restart_postgres(int argc, char **argv) { (void) cli_do_service_restart(SERVICE_NAME_POSTGRES); } /* * cli_do_service_restart_listener sends the TERM signal to the monitor * listener service, which is known to have the restart policy RP_PERMANENT * (that's hard-coded). As a consequence the supervisor will restart the * service. */ static void cli_do_service_restart_listener(int argc, char **argv) { (void) cli_do_service_restart(SERVICE_NAME_MONITOR); } /* * cli_do_service_restart_node_active sends the TERM signal to the keeper node * active service, which is known to have the restart policy RP_PERMANENT * (that's hard-coded). As a consequence the supervisor will restart the * service. */ static void cli_do_service_restart_node_active(int argc, char **argv) { (void) cli_do_service_restart(SERVICE_NAME_KEEPER); } /* * cli_do_pgcontroller starts the process controller service within a supervision * tree. It is used for debug purposes only. When using this entry point we * have a supervisor process that is responsible for only one service: * * pg_autoctl do service pgcontroller * - pg_autoctl do service postgres * - postgres */ static void cli_do_service_pgcontroller(int argc, char **argv) { ConfigFilePaths pathnames = { 0 }; LocalPostgresServer postgres = { 0 }; Service subprocesses[] = { "postgres", RP_PERMANENT, -1, &service_postgres_ctl_start }; int subprocessesCount = sizeof(subprocesses) / sizeof(subprocesses[0]); bool exitOnQuit = false; /* Establish a handler for signals. */ (void) set_signal_handlers(exitOnQuit); if (!cli_common_pgsetup_init(&pathnames, &(postgres.postgresSetup))) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (!supervisor_start(subprocesses, subprocessesCount, pathnames.pid)) { log_fatal("Failed to start the supervisor, see above for details"); exit(EXIT_CODE_INTERNAL_ERROR); } } /* * cli_do_service_postgres starts the process service. 
This is intended to be
 * used from the supervisor process tree itself. Then we have a main process
 * that supervises two sub-processes, one of them is cli_do_service_postgres:
 *
 *  pg_autoctl
 *   - pg_autoctl do service postgres
 *      - postgres
 *   - pg_autoctl do service keeper|monitor
 */
static void
cli_do_service_postgres(int argc, char **argv)
{
	ConfigFilePaths pathnames = { 0 };
	LocalPostgresServer postgres = { 0 };
	bool exitOnQuit = false;

	/* Establish a handler for signals. */
	(void) set_signal_handlers(exitOnQuit);

	if (!cli_common_pgsetup_init(&pathnames, &(postgres.postgresSetup)))
	{
		/* errors have already been logged */
		exit(EXIT_CODE_BAD_CONFIG);
	}

	/* display a user-friendly process name */
	(void) set_ps_title("pg_autoctl: start/stop postgres");

	/* create the service pidfile */
	if (!create_service_pidfile(pathnames.pid, SERVICE_NAME_POSTGRES))
	{
		/* errors have already been logged */
		exit(EXIT_CODE_INTERNAL_ERROR);
	}

	/* run the start/stop postgres control loop until asked to quit */
	(void) service_postgres_ctl_loop(&postgres);
}


/*
 * cli_do_service_postgresctl_on asks the pg_autoctl Postgres controller service
 * to ensure that Postgres is running.
 */
static void
cli_do_service_postgresctl_on(int argc, char **argv)
{
	ConfigFilePaths pathnames = { 0 };
	LocalPostgresServer postgres = { 0 };
	PostgresSetup *pgSetup = &(postgres.postgresSetup);

	if (!cli_common_pgsetup_init(&pathnames, pgSetup))
	{
		/* errors have already been logged */
		exit(EXIT_CODE_BAD_CONFIG);
	}

	(void) local_postgres_init(&postgres, pgSetup);

	if (!ensure_postgres_service_is_running(&postgres))
	{
		exit(EXIT_CODE_PGCTL);
	}

	log_info("Postgres is serving PGDATA \"%s\" on port %d with pid %d",
			 pgSetup->pgdata, pgSetup->pgport, pgSetup->pidFile.pid);

	/* with --json, also print the Postgres setup as a JSON document */
	if (outputJSON)
	{
		JSON_Value *js = json_value_init_object();

		if (!pg_setup_as_json(pgSetup, js))
		{
			/* can't happen */
			exit(EXIT_CODE_INTERNAL_ERROR);
		}

		(void) cli_pprint_json(js);
	}
}


/*
 * cli_do_service_postgresctl_off asks the pg_autoctl Postgres controller
 * service to ensure that Postgres is stopped.
*/ static void cli_do_service_postgresctl_off(int argc, char **argv) { ConfigFilePaths pathnames = { 0 }; LocalPostgresServer postgres = { 0 }; PostgresSetup *pgSetup = &(postgres.postgresSetup); if (!cli_common_pgsetup_init(&pathnames, pgSetup)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } (void) local_postgres_init(&postgres, pgSetup); if (!ensure_postgres_service_is_stopped(&postgres)) { exit(EXIT_CODE_PGCTL); } log_info("Postgres has been stopped for PGDATA \"%s\"", pgSetup->pgdata); } /* * cli_do_service_monitor_listener starts the monitor listener service. */ static void cli_do_service_monitor_listener(int argc, char **argv) { KeeperConfig options = keeperOptions; Monitor monitor = { 0 }; bool missingPgdataIsOk = false; bool pgIsNotRunningIsOk = true; bool exitOnQuit = true; /* Establish a handler for signals. */ (void) set_signal_handlers(exitOnQuit); /* Prepare MonitorConfig from the CLI options fed in options */ if (!monitor_config_init_from_pgsetup(&(monitor.config), &options.pgSetup, missingPgdataIsOk, pgIsNotRunningIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_PGCTL); } /* display a user-friendly process name */ (void) set_ps_title("pg_autoctl: monitor listener"); /* create the service pidfile */ if (!create_service_pidfile(monitor.config.pathnames.pid, SERVICE_NAME_MONITOR)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } /* Start the monitor service */ (void) monitor_service_run(&monitor); } /* * cli_do_service_node_active starts the node active service. */ static void cli_do_service_node_active(int argc, char **argv) { Keeper keeper = { 0 }; pid_t ppid = getppid(); bool exitOnQuit = true; keeper.config = keeperOptions; /* Establish a handler for signals. 
*/ (void) set_signal_handlers(exitOnQuit); /* Prepare our Keeper and KeeperConfig from the CLI options */ if (!service_keeper_node_active_init(&keeper)) { log_fatal("Failed to initialize the node active service, " "see above for details"); exit(EXIT_CODE_INTERNAL_ERROR); } /* display a user-friendly process name */ (void) set_ps_title("pg_autoctl: node active"); /* create the service pidfile */ if (!create_service_pidfile(keeper.config.pathnames.pid, SERVICE_NAME_KEEPER)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } /* Start the node_active() protocol client */ (void) keeper_node_active_loop(&keeper, ppid); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_do_show.c000066400000000000000000000230311414244367200232110ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_do_show.c * Implementation of a CLI which lets you run operations on the local * postgres server directly. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include "postgres_fe.h" #include "cli_common.h" #include "cli_do_root.h" #include "commandline.h" #include "config.h" #include "defaults.h" #include "file_utils.h" #include "fsm.h" #include "ipaddr.h" #include "keeper_config.h" #include "keeper.h" #include "monitor.h" #include "monitor_config.h" #include "pgctl.h" #include "pgsetup.h" #include "primary_standby.h" static void cli_show_ipaddr(int argc, char **argv); static void cli_show_cidr(int argc, char **argv); static void cli_show_lookup(int argc, char **argv); static void cli_show_hostname(int argc, char **argv); static void cli_show_reverse(int argc, char **argv); static void cli_show_version(int arg, char **argv); static CommandLine do_show_ipaddr_command = make_command("ipaddr", "Print this node's IP address information", "", "", NULL, cli_show_ipaddr); static CommandLine do_show_cidr_command = make_command("cidr", "Print this node's CIDR information", "", "", NULL, cli_show_cidr); static CommandLine do_show_lookup_command = make_command("lookup", "Print this node's DNS lookup information", "", "", NULL, cli_show_lookup); static CommandLine do_show_hostname_command = make_command("hostname", "Print this node's default hostname", "[postgres://monitor/uri]", "", NULL, cli_show_hostname); static CommandLine do_show_reverse_command = make_command("reverse", "Lookup given hostname and check reverse DNS setup", "", "", NULL, cli_show_reverse); static CommandLine do_show_version_command = make_command("version", "Run pg_autoctl version --json and parses the output", "", "", NULL, cli_show_version); CommandLine *do_show_subcommands[] = { &do_show_ipaddr_command, &do_show_cidr_command, &do_show_lookup_command, &do_show_hostname_command, &do_show_reverse_command, &do_show_version_command, NULL }; CommandLine do_show_commands = make_command_set("show", "Show some debug level information", NULL, NULL, NULL, do_show_subcommands); /* * cli_show_ipaddr displays the LAN IP address 
of the current node, as used * when computing the CIDR address range to open in the HBA file. */ static void cli_show_ipaddr(int argc, char **argv) { char ipAddr[BUFSIZE]; bool mayRetry = false; if (!fetchLocalIPAddress(ipAddr, BUFSIZE, DEFAULT_INTERFACE_LOOKUP_SERVICE_NAME, DEFAULT_INTERFACE_LOOKUP_SERVICE_PORT, LOG_WARN, &mayRetry)) { log_warn("Failed to determine network configuration."); exit(EXIT_CODE_INTERNAL_ERROR); } fformat(stdout, "%s\n", ipAddr); } /* * cli_show_cidr displays the LAN CIDR that pg_autoctl grants connections to in * the HBA file for setting up Postgres streaming replication and connections * to the monitor. */ static void cli_show_cidr(int argc, char **argv) { char ipAddr[BUFSIZE]; char cidr[BUFSIZE]; bool mayRetry = false; if (!fetchLocalIPAddress(ipAddr, BUFSIZE, DEFAULT_INTERFACE_LOOKUP_SERVICE_NAME, DEFAULT_INTERFACE_LOOKUP_SERVICE_PORT, LOG_WARN, &mayRetry)) { log_warn("Failed to determine network configuration."); exit(EXIT_CODE_INTERNAL_ERROR); } if (!fetchLocalCIDR(ipAddr, cidr, BUFSIZE)) { log_warn("Failed to determine network configuration."); exit(EXIT_CODE_INTERNAL_ERROR); } fformat(stdout, "%s\n", cidr); } /* * cli_check_hostname checks that the --hostname argument is either an IP * address that exists on the local list of interfaces, or a hostname that a * DNS lookup solves to an IP address we have on the local machine. 
* */ static void cli_show_lookup(int argc, char **argv) { if (argc != 1) { commandline_print_usage(&do_show_lookup_command, stderr); exit(EXIT_CODE_BAD_ARGS); } char *hostname = argv[0]; IPType ipType = ip_address_type(hostname); if (ipType == IPTYPE_NONE) { char localIpAddress[BUFSIZE]; if (!findHostnameLocalAddress(hostname, localIpAddress, BUFSIZE)) { log_fatal("Failed to check hostname \"%s\", see above for details", hostname); exit(EXIT_CODE_INTERNAL_ERROR); } fformat(stdout, "%s: %s\n", hostname, localIpAddress); } else { /* an IP address has been given, we do a reverse lookup */ char *ipAddr = hostname; char hostname[_POSIX_HOST_NAME_MAX]; char localIpAddress[BUFSIZE]; /* reverse DNS lookup to fetch the hostname */ if (!findHostnameFromLocalIpAddress(ipAddr, hostname, _POSIX_HOST_NAME_MAX)) { /* errors already logged, keep the ipAddr, show exit failure */ fformat(stdout, "%s\n", ipAddr); exit(EXIT_CODE_INTERNAL_ERROR); } /* DNS lookup of the given hostname to make sure we get back here */ if (!findHostnameLocalAddress(hostname, localIpAddress, BUFSIZE)) { log_fatal("Failed to check hostname \"%s\", see above for details", hostname); /* keep ipAddr and show exit failure */ fformat(stdout, "%s\n", ipAddr); exit(EXIT_CODE_INTERNAL_ERROR); } fformat(stdout, "%s: %s\n", localIpAddress, hostname); } } /* * cli_show_hostname shows the default --hostname we would use. It's the * reverse DNS entry for the local IP address we probe. */ static void cli_show_hostname(int argc, char **argv) { char ipAddr[BUFSIZE]; char localIpAddress[BUFSIZE]; char hostname[_POSIX_HOST_NAME_MAX]; char monitorHostname[_POSIX_HOST_NAME_MAX]; int monitorPort = pgsetup_get_pgport(); bool mayRetry = false; /* * When no argument is used, use hostname(3) and 5432, as we would for a * monitor (pg_autoctl create monitor). 
*/ if (argc == 0) { if (ipaddrGetLocalHostname(monitorHostname, sizeof(hostname))) { /* we found our hostname(3), use the default pg port */ fformat(stdout, "%s\n", monitorHostname); exit(EXIT_CODE_QUIT); } else { /* use the default host/port to find the default local IP address */ strlcpy(monitorHostname, DEFAULT_INTERFACE_LOOKUP_SERVICE_NAME, _POSIX_HOST_NAME_MAX); monitorPort = DEFAULT_INTERFACE_LOOKUP_SERVICE_PORT; } } /* * When one argument is given, it is expected to be the monitor Postgres * connection string, and we then act as a keeper node. */ else if (argc == 1) { if (!hostname_from_uri(argv[0], monitorHostname, _POSIX_HOST_NAME_MAX, &monitorPort)) { log_fatal("Failed to determine monitor hostname when parsing " "Postgres URI \"%s\"", argv[0]); exit(EXIT_CODE_BAD_ARGS); } log_info("Using monitor hostname \"%s\" and port %d", monitorHostname, monitorPort); } else { commandline_print_usage(&do_show_hostname_command, stderr); exit(EXIT_CODE_BAD_ARGS); } /* fetch the default local address used when connecting remotely */ if (!fetchLocalIPAddress(ipAddr, BUFSIZE, monitorHostname, monitorPort, LOG_WARN, &mayRetry)) { log_warn("Failed to determine network configuration."); exit(EXIT_CODE_INTERNAL_ERROR); } log_debug("cli_show_hostname: ip %s", ipAddr); /* do a reverse DNS lookup from this local address to an hostname */ if (!findHostnameFromLocalIpAddress(ipAddr, hostname, _POSIX_HOST_NAME_MAX)) { /* the hostname is going to be the ipAddr in that case */ fformat(stdout, "%s\n", ipAddr); /* still indicate it was a failure */ exit(EXIT_CODE_INTERNAL_ERROR); } log_debug("cli_show_hostname: host %s", hostname); /* do a lookup of the host name and see that we get a local address back */ if (!findHostnameLocalAddress(hostname, localIpAddress, BUFSIZE)) { /* the hostname is going to be the ipAddr in that case */ fformat(stdout, "%s\n", ipAddr); /* still indicate it was a failure */ exit(EXIT_CODE_INTERNAL_ERROR); } log_debug("cli_show_hostname: ip %s", 
localIpAddress);

	fformat(stdout, "%s\n", hostname);
}


/*
 * cli_show_reverse does a forward DNS lookup of the given hostname, and then a
 * reverse DNS lookup for every one of the forward DNS results. Success is
 * reached when at least one of the IP addresses from the forward lookup
 * resolves back to the given hostname.
 */
static void
cli_show_reverse(int argc, char **argv)
{
	char ipaddr[BUFSIZE] = { 0 };
	bool foundHostnameFromAddress = false;

	if (argc != 1)
	{
		commandline_print_usage(&do_show_reverse_command, stderr);
		exit(EXIT_CODE_BAD_ARGS);
	}

	char *hostname = argv[0];
	IPType ipType = ip_address_type(hostname);

	/* this command checks DNS round-trips, an IP address makes no sense here */
	if (ipType != IPTYPE_NONE)
	{
		log_error("Hostname must not be an IP address");
		exit(EXIT_CODE_BAD_ARGS);
	}

	if (!resolveHostnameForwardAndReverse(hostname, ipaddr, sizeof(ipaddr),
										  &foundHostnameFromAddress) ||
		!foundHostnameFromAddress)
	{
		log_fatal("Failed to find an IP address for hostname \"%s\" that "
				  "matches hostname again in a reverse-DNS lookup.", hostname);
		log_info("Continuing with IP address \"%s\"", ipaddr);
		exit(EXIT_CODE_INTERNAL_ERROR);
	}

	log_info("Hostname \"%s\" resolves to IP address %s and back",
			 hostname, ipaddr);
}


/*
 * cli_show_version runs pg_autoctl version --json and parses the version
 * string.
 */
static void
cli_show_version(int arg, char **argv)
{
	Keeper keeper = { 0 };
	KeeperVersion version = { 0 };

	log_debug("cli_show_version");

	if (!keeper_pg_autoctl_get_version_from_disk(&keeper, &version))
	{
		/* errors have already been logged */
		exit(EXIT_CODE_INTERNAL_ERROR);
	}

	log_info("pg_autoctl \"%s\"", version.pg_autoctl_version);
	log_info("pgautofailover \"%s\"", version.required_extension_version);
}
pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_do_tmux.c000066400000000000000000001132201414244367200232260ustar00rootroot00000000000000/*
 * src/bin/pg_autoctl/cli_do_tmux.c
 * Implementation of the "pg_autoctl do tmux" commands, which drive tmux
 * sessions used for QA and interactive testing. (The previous header comment
 * was copy-pasted from cli_do_misc.c and named the wrong file.)
 *
 * Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the PostgreSQL License. * */ #include #include #include #include #include #include #include #include #if defined(__linux__) #include #endif #include "postgres_fe.h" #include "pqexpbuffer.h" #include "snprintf.h" #include "cli_common.h" #include "cli_do_root.h" #include "cli_do_tmux.h" #include "cli_root.h" #include "commandline.h" #include "config.h" #include "env_utils.h" #include "log.h" #include "pidfile.h" #include "signals.h" #include "string_utils.h" #include "runprogram.h" char *tmux_banner[] = { "# to quit tmux: type either `Ctrl+b d` or `tmux detach`", "# to test failover: pg_autoctl perform failover", NULL }; TmuxOptions tmuxOptions = { 0 }; TmuxNodeArray tmuxNodeArray = { 0 }; char *xdg[][3] = { { "XDG_DATA_HOME", "share" }, { "XDG_CONFIG_HOME", "config" }, { "XDG_RUNTIME_DIR", "run" }, { NULL, NULL } }; static void prepare_tmux_script(TmuxOptions *options, PQExpBuffer script); static bool tmux_stop_pg_autoctl(TmuxOptions *options); static bool parseCandidatePriority(char *priorityString, int pIndex, int *priorities); static bool prepareTmuxNodeArray(TmuxOptions *options, TmuxNodeArray *nodeArray); /* * parseCandidatePriority parses a single candidate priority item into given * index in the priorities integer array of MAX_NODES capacity */ static bool parseCandidatePriority(char *priorityString, int pIndex, int *priorities) { if (MAX_NODES <= pIndex) { log_error("Failed to parse --node-priorities: " "pg_autoctl do tmux session supports up to %d nodes", MAX_NODES); return false; } if (!stringToInt(priorityString, &(priorities[pIndex]))) { log_error("Failed to parse --node-priorities \"%s\"", priorityString); return false; } log_trace("parseCandidatePriorities[%d] = %d", pIndex, priorities[pIndex]); return true; } /* * parseCandidatePriorities parses the --node-priorities options on the command * line and fills-in an array of nodes. 
* * --node-priorities 50: all node have 50 * --node-priorities 50,50,0: 3+ nodes, first two have 50, then 0 */ bool parseCandidatePriorities(char *prioritiesString, int *priorities) { char sep = ','; char *ptr = prioritiesString; char *previous = prioritiesString; int pIndex = 0; if (strcmp(prioritiesString, "") == 0) { /* fill-in the priorities array with default values (50) */ for (int i = 0; i < MAX_NODES; i++) { priorities[i] = FAILOVER_NODE_CANDIDATE_PRIORITY; } return true; } while ((ptr = strchr(ptr, sep)) != NULL) { *ptr = '\0'; if (!parseCandidatePriority(previous, pIndex++, priorities)) { /* errors have already been logged */ return false; } previous = ++ptr; } /* there is no separator left, parse the end of the option string */ if (!parseCandidatePriority(previous, pIndex++, priorities)) { /* errors have already been logged */ return false; } /* mark final entry in the array; remember that pIndex > 0 here */ for (int i = pIndex; i < MAX_NODES; i++) { priorities[i] = priorities[i - 1]; } return true; } /* * prepareTmuxNodeArray expands the command line options into an array of * nodes, where each node name and properties have been computed. 
*/ bool prepareTmuxNodeArray(TmuxOptions *options, TmuxNodeArray *nodeArray) { /* first pgport is for the monitor */ int pgport = options->firstPort + 1; /* make sure we initialize our nodes array */ nodeArray->count = 0; nodeArray->numSync = options->numSync; for (int i = 0; i < options->nodes; i++) { TmuxNode *node = &(nodeArray->nodes[i]); sformat(node->name, sizeof(node->name), "node%d", i + 1); node->pgport = pgport++; /* the first nodes are sync, then async, threshold is asyncNodes */ node->replicationQuorum = i < (options->nodes - options->asyncNodes); /* node priorities have been expanded correctly in the options */ node->candidatePriority = options->priorities[i]; ++(nodeArray->count); } /* some useful debug information */ for (int i = 0; i < options->nodes; i++) { TmuxNode *node = &(nodeArray->nodes[i]); log_debug("prepareTmuxNodeArray[%d]: %s %d %s %d", i, node->name, node->pgport, node->replicationQuorum ? "true" : "false", node->candidatePriority); } return true; } /* * cli_print_version_getopts parses the CLI options for the pg_autoctl version * command, which are the usual suspects. 
*/ int cli_do_tmux_script_getopts(int argc, char **argv) { int c, option_index = 0, errors = 0; int verboseCount = 0; bool printVersion = false; TmuxOptions options = { 0 }; static struct option long_options[] = { { "root", required_argument, NULL, 'D' }, { "first-pgport", required_argument, NULL, 'p' }, { "nodes", required_argument, NULL, 'n' }, { "async-nodes", required_argument, NULL, 'a' }, { "node-priorities", required_argument, NULL, 'P' }, { "sync-standbys", required_argument, NULL, 's' }, { "skip-pg-hba", required_argument, NULL, 'S' }, { "layout", required_argument, NULL, 'l' }, { "binpath", required_argument, NULL, 'b' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; optind = 0; /* set our defaults */ options.firstPort = 5500; options.nodes = 2; options.asyncNodes = 0; options.numSync = -1; /* use pg_autoctl defaults */ options.skipHBA = false; strlcpy(options.root, "/tmp/pgaf/tmux", sizeof(options.root)); strlcpy(options.layout, "even-vertical", sizeof(options.layout)); strlcpy(options.binpath, pg_autoctl_argv0, sizeof(options.binpath)); if (!parseCandidatePriorities("", options.priorities)) { log_error("BUG: failed to initialize candidate priorities"); exit(EXIT_CODE_INTERNAL_ERROR); } /* * The only command lines that are using keeper_cli_getopt_pgdata are * terminal ones: they don't accept subcommands. In that case our option * parsing can happen in any order and we don't need getopt_long to behave * in a POSIXLY_CORRECT way. * * The unsetenv() call allows getopt_long() to reorder arguments for us. 
*/ unsetenv("POSIXLY_CORRECT"); while ((c = getopt_long(argc, argv, "D:p:Vvqh", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.root, optarg, MAXPGPATH); log_trace("--root %s", options.root); break; } case 'p': { if (!stringToInt(optarg, &options.firstPort)) { log_error("Failed to parse --first-port number \"%s\"", optarg); errors++; } log_trace("--first-port %d", options.firstPort); break; } case 'n': { if (!stringToInt(optarg, &options.nodes)) { log_error("Failed to parse --nodes number \"%s\"", optarg); errors++; } if (MAX_NODES < options.nodes) { log_error("pg_autoctl do tmux session supports up to %d " "nodes, and --nodes %d has been asked for", MAX_NODES, options.nodes); errors++; } log_trace("--nodes %d", options.nodes); break; } case 'a': { if (!stringToInt(optarg, &options.asyncNodes)) { log_error("Failed to parse --async-nodes number \"%s\"", optarg); errors++; } log_trace("--async-nodes %d", options.asyncNodes); break; } case 'P': { char priorities[BUFSIZE] = { 0 }; /* parsing mangles the string, keep a copy */ strlcpy(priorities, optarg, sizeof(priorities)); if (!parseCandidatePriorities(priorities, options.priorities)) { log_error("Failed to parse --node-priorities \"%s\"", optarg); errors++; } break; } case 's': { if (!stringToInt(optarg, &options.numSync)) { log_error("Failed to parse --sync-standbys number \"%s\"", optarg); errors++; } log_trace("--sync-standbys %d", options.numSync); break; } case 'S': { options.skipHBA = true; log_trace("--skip-pg-hba"); break; } case 'l': { strlcpy(options.layout, optarg, MAXPGPATH); log_trace("--layout %s", options.layout); break; } case 'b': { strlcpy(options.binpath, optarg, MAXPGPATH); log_trace("--binpath %s", options.binpath); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } case 'V': { /* keeper_cli_print_version prints version and exits. 
*/ printVersion = true; break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } default: { /* getopt_long already wrote an error message */ errors++; break; } } } if (!prepareTmuxNodeArray(&options, &tmuxNodeArray)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } if (printVersion) { keeper_cli_print_version(argc, argv); } /* publish parsed options */ tmuxOptions = options; return optind; } /* * tmux_add_command appends a tmux command to the given script buffer. */ void tmux_add_command(PQExpBuffer script, const char *fmt, ...) { char buffer[ARG_MAX] = { 0 }; va_list args; va_start(args, fmt); pg_vsprintf(buffer, fmt, args); va_end(args); appendPQExpBuffer(script, "%s\n", buffer); } /* * tmux_add_send_keys_command appends a tmux send-keys command to the given * script buffer, with an additional Enter command. */ void tmux_add_send_keys_command(PQExpBuffer script, const char *fmt, ...) { char buffer[BUFSIZE] = { 0 }; va_list args; va_start(args, fmt); pg_vsprintf(buffer, fmt, args); va_end(args); appendPQExpBuffer(script, "send-keys '%s' Enter\n", buffer); } /* * tmux_add_xdg_environment sets the environment variables that we need for the * whole session to be self-contained in the given root directory. The * implementation of this function relies on the fact that the tmux script has * been prepared with tmux set-environment commands, per tmux_setenv. */ void tmux_add_xdg_environment(PQExpBuffer script) { tmux_add_send_keys_command(script, "eval $(tmux show-environment -s)"); } /* * tmux_setenv adds setenv commands to the tmux script. 
*/ void tmux_setenv(PQExpBuffer script, const char *sessionName, const char *root, int firstPort) { char PATH[ARG_MAX] = { 0 }; char PG_CONFIG[MAXPGPATH] = { 0 }; char monitor_pguri[MAXCONNINFO] = { 0 }; if (env_exists("PG_CONFIG")) { if (!get_env_copy("PG_CONFIG", PG_CONFIG, sizeof(PG_CONFIG))) { log_fatal("Failed to get PG_CONFIG from the environment"); exit(EXIT_CODE_INTERNAL_ERROR); } tmux_add_command(script, "set-environment -t %s PG_CONFIG \"%s\"", sessionName, PG_CONFIG); } if (!get_env_copy("PATH", PATH, sizeof(PATH))) { log_fatal("Failed to get PATH from the environment"); exit(EXIT_CODE_INTERNAL_ERROR); } tmux_add_command(script, "set-environment -t %s PATH \"%s\"", sessionName, PATH); sformat(monitor_pguri, sizeof(monitor_pguri), "postgres://autoctl_node@localhost:%d/pg_auto_failover?sslmode=prefer", firstPort); tmux_add_command(script, "set-environment -t %s PG_AUTOCTL_MONITOR \"%s\"", sessionName, monitor_pguri); for (int i = 0; xdg[i][0] != NULL; i++) { char *var = xdg[i][0]; char *dir = xdg[i][1]; tmux_add_command(script, "set-environment -t %s %s \"%s/%s\"", sessionName, var, root, dir); } } /* * tmux_prepare_XDG_environment set XDG environment variables in the current * process tree. 
*/ bool tmux_prepare_XDG_environment(const char *root, bool createDirectories) { log_info("Preparing XDG setting for self-contained session in \"%s\"", root); for (int i = 0; xdg[i][0] != NULL; i++) { char *var = xdg[i][0]; char *dir = xdg[i][1]; char *env = (char *) malloc(MAXPGPATH * sizeof(char)); if (env == NULL) { log_fatal(ALLOCATION_FAILED_ERROR); return false; } sformat(env, MAXPGPATH, "%s/%s", root, dir); if (createDirectories) { log_debug("mkdir -p \"%s\"", env); if (pg_mkdir_p(env, 0700) == -1) { log_error("mkdir -p \"%s\": %m", env); free(env); return false; } } if (!normalize_filename(env, env, MAXPGPATH)) { /* errors have already been logged */ free(env); return false; } log_info("export %s=\"%s\"", var, env); if (setenv(var, env, 1) != 0) { log_error("Failed to set environment variable %s to \"%s\": %m", var, env); } /* also create our actual target directory for our files */ if (createDirectories) { char targetPath[MAXPGPATH] = { 0 }; sformat(targetPath, sizeof(targetPath), "%s/pg_config/%s", env, /* skip first / in the root directory */ root[0] == '/' ? root + 1 : root); log_debug("mkdir -p \"%s\"", targetPath); if (pg_mkdir_p(targetPath, 0700) == -1) { log_error("mkdir -p \"%s\": %m", targetPath); free(env); return false; } } free(env); } return true; } /* * tmux_add_new_session appends a new-session command with the * update-environment options for our XDG settings, as a series of tmux * send-keys commands, to the given script buffer. */ void tmux_add_new_session(PQExpBuffer script, const char *root, int pgport) { char sessionName[BUFSIZE] = { 0 }; sformat(sessionName, BUFSIZE, "pgautofailover-%d", pgport); /* * For demo/tests purposes, arrange a self-contained setup where everything * is to be found in the given options.root directory. 
*/ /* for (int i = 0; xdg[i][0] != NULL; i++) */ /* { */ /* char *var = xdg[i][0]; */ /* tmux_add_command(script, "set-option update-environment %s", var); */ /* } */ tmux_add_command(script, "new-session -s %s", sessionName); } /* * tmux_pg_autoctl_create_monitor appends a pg_autoctl create monitor command * to the given script buffer, and also the commands to set PGDATA and PGPORT. */ void tmux_pg_autoctl_create_monitor(PQExpBuffer script, const char *root, const char *binpath, int pgport, bool skipHBA) { char *pg_ctl_opts = skipHBA ? "--hostname localhost --ssl-self-signed --skip-pg-hba" : "--hostname localhost --ssl-self-signed --auth trust"; tmux_add_send_keys_command(script, "export PGPORT=%d", pgport); /* the monitor is always named monitor, and does not need --monitor */ tmux_add_send_keys_command(script, "export PGDATA=\"%s/monitor\"", root); tmux_add_send_keys_command(script, "%s create monitor %s --run", binpath, pg_ctl_opts); } /* * tmux_pg_autoctl_create_postgres appends a pg_autoctl create postgres command * to the given script buffer, and also the commands to set PGDATA and PGPORT. */ void tmux_pg_autoctl_create_postgres(PQExpBuffer script, const char *root, const char *binpath, int pgport, const char *name, bool replicationQuorum, int candidatePriority, bool skipHBA) { char monitor[BUFSIZE] = { 0 }; char *pg_ctl_opts = skipHBA ? "--hostname localhost --ssl-self-signed --skip-pg-hba" : "--hostname localhost --ssl-self-signed --auth trust --pg-hba-lan"; tmux_add_send_keys_command(script, "export PGPORT=%d", pgport); sformat(monitor, sizeof(monitor), "$(%s show uri --pgdata %s/monitor --formation monitor)", binpath, root); tmux_add_send_keys_command(script, "export PGDATA=\"%s/%s\"", root, name); tmux_add_send_keys_command(script, "%s create postgres %s " "--monitor %s " "--name %s " "--dbname demo " "--replication-quorum %s " "--candidate-priority %d " "--run", binpath, pg_ctl_opts, monitor, name, replicationQuorum ? 
"true" : "false", candidatePriority); } /* * prepare_tmux_script prepares a script for a tmux session with the given * nodes, root directory, first pgPort, and layout. */ static void prepare_tmux_script(TmuxOptions *options, PQExpBuffer script) { char *root = options->root; int pgport = options->firstPort; char sessionName[BUFSIZE] = { 0 }; char previousName[NAMEDATALEN] = { 0 }; sformat(sessionName, BUFSIZE, "pgautofailover-%d", options->firstPort); tmux_add_command(script, "set-option -g default-shell /bin/bash"); (void) tmux_add_new_session(script, root, pgport); (void) tmux_setenv(script, sessionName, root, options->firstPort); /* start a monitor */ (void) tmux_add_xdg_environment(script); tmux_pg_autoctl_create_monitor(script, root, options->binpath, pgport++, options->skipHBA); /* start the Postgres nodes, using the monitor URI */ sformat(previousName, sizeof(previousName), "monitor"); for (int i = 0; i < tmuxNodeArray.count; i++) { TmuxNode *node = &(tmuxNodeArray.nodes[i]); tmux_add_command(script, "split-window -v"); tmux_add_command(script, "select-layout even-vertical"); (void) tmux_add_xdg_environment(script); /* * Force node ordering to easy debugging of interactive sessions: each * node waits until the previous one has been started or registered. 
*/ tmux_add_send_keys_command(script, "PG_AUTOCTL_DEBUG=1 " "%s do tmux wait --root %s %s", options->binpath, options->root, previousName); tmux_pg_autoctl_create_postgres(script, root, options->binpath, node->pgport, node->name, node->replicationQuorum, node->candidatePriority, options->skipHBA); strlcpy(previousName, node->name, sizeof(previousName)); } /* add a window for pg_autoctl show state */ tmux_add_command(script, "split-window -v"); tmux_add_command(script, "select-layout even-vertical"); (void) tmux_add_xdg_environment(script); tmux_add_send_keys_command(script, "export PGDATA=\"%s/monitor\"", root); tmux_add_send_keys_command(script, "PG_AUTOCTL_DEBUG=1 " "%s do tmux wait --root %s %s", options->binpath, options->root, "monitor"); tmux_add_send_keys_command(script, "%s watch", options->binpath); /* add a window for interactive pg_autoctl commands */ tmux_add_command(script, "split-window -v"); tmux_add_command(script, "select-layout even-vertical"); (void) tmux_add_xdg_environment(script); if (options->numSync != -1) { /* * We need to wait until the first node is either WAIT_PRIMARY or * PRIMARY before we can go on and change formation settings with * pg_autoctl set formation ... */ char firstNode[NAMEDATALEN] = { 0 }; NodeState targetPrimaryState = options->numSync == 0 ? 
WAIT_PRIMARY_STATE : PRIMARY_STATE; sformat(firstNode, sizeof(firstNode), "node%d", 1); tmux_add_send_keys_command(script, "PG_AUTOCTL_DEBUG=1 " "%s do tmux wait --root %s %s %s", options->binpath, options->root, firstNode, NodeStateToString(targetPrimaryState)); /* PGDATA has just been exported, rely on it */ tmux_add_send_keys_command(script, "%s set formation number-sync-standbys %d", options->binpath, options->numSync); } /* now change to the user given options->root directory */ tmux_add_send_keys_command(script, "cd \"%s\"", options->root); /* now select our target layout */ tmux_add_command(script, "select-layout %s", options->layout); if (env_exists("TMUX_EXTRA_COMMANDS")) { char extra_commands[BUFSIZE] = { 0 }; char *extraLines[BUFSIZE]; int lineNumber = 0; if (!get_env_copy("TMUX_EXTRA_COMMANDS", extra_commands, BUFSIZE)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } int lineCount = splitLines(extra_commands, extraLines, BUFSIZE); for (lineNumber = 0; lineNumber < lineCount; lineNumber++) { appendPQExpBuffer(script, "%s\n", extraLines[lineNumber]); } } for (int i = 0; tmux_banner[i] != NULL; i++) { tmux_add_send_keys_command(script, "%s", tmux_banner[i]); } } /* * tmux_start_server starts a tmux session with the given script. */ bool tmux_start_server(const char *scriptName, const char *binpath) { char *args[8]; int argsIndex = 0; char tmux[MAXPGPATH] = { 0 }; char command[BUFSIZE] = { 0 }; /* * Here we are going to remain the parent process of multiple pg_autoctl * top-level processes, one per node (and the monitor). We don't want all * those processes to use the same semaphore for logging, so make sure we * remove ourselves from the environment before we start all the * sub-processes. 
*/ unsetenv(PG_AUTOCTL_LOG_SEMAPHORE); if (setenv("PG_AUTOCTL_DEBUG", "1", 1) != 0) { log_error("Failed to set environment PG_AUTOCTL_DEBUG: %m"); return false; } if (binpath && setenv("PG_AUTOCTL_DEBUG_BIN_PATH", binpath, 1) != 0) { log_error("Failed to set environment PG_AUTOCTL_DEBUG_BIN_PATH: %m"); return false; } if (!search_path_first("tmux", tmux, LOG_ERROR)) { log_fatal("Failed to find program tmux in PATH"); return false; } /* * Run the tmux command with our script: * tmux start-server \; source-file ${scriptName} */ args[argsIndex++] = (char *) tmux; args[argsIndex++] = "-u"; args[argsIndex++] = "start-server"; args[argsIndex++] = ";"; args[argsIndex++] = "source-file"; args[argsIndex++] = (char *) scriptName; args[argsIndex] = NULL; /* we do not want to call setsid() when running this program. */ Program program = { 0 }; (void) initialize_program(&program, args, false); program.capture = false; /* don't capture output */ program.tty = true; /* allow sharing the parent's tty */ /* log the exact command line we're using */ (void) snprintf_program_command_line(&program, command, BUFSIZE); log_info("%s", command); (void) execute_subprogram(&program); /* we only get there when the tmux session is done */ free_program(&program); return true; } /* * tmux_attach_session_by_name runs the command: * tmux attach-session -t sessionName */ bool tmux_attach_session(const char *tmux_path, const char *sessionName) { Program program = { 0 }; char *args[8]; int argsIndex = 0; char command[BUFSIZE] = { 0 }; /* * Run the tmux command with our script: * tmux start-server \; source-file ${scriptName} */ args[argsIndex++] = (char *) tmux_path; args[argsIndex++] = "attach-session"; args[argsIndex++] = "-t"; args[argsIndex++] = (char *) sessionName; args[argsIndex] = NULL; /* we do not want to call setsid() when running this program. 
*/ (void) initialize_program(&program, args, false); program.capture = false; /* don't capture output */ program.tty = true; /* allow sharing the parent's tty */ /* log the exact command line we're using */ (void) snprintf_program_command_line(&program, command, BUFSIZE); log_info("%s", command); (void) execute_subprogram(&program); /* we only get there when the tmux session is done */ free_program(&program); return true; } /* * pg_autoctl_getpid gets the pid of the pg_autoctl process that is running for * the given PGDATA location. */ bool pg_autoctl_getpid(const char *pgdata, pid_t *pid) { ConfigFilePaths pathnames = { 0 }; if (!keeper_config_set_pathnames_from_pgdata(&pathnames, pgdata)) { exit(EXIT_CODE_INTERNAL_ERROR); } return read_pidfile(pathnames.pid, pid); } /* * tmux_stop_pg_autoctl stops all started pg_autoctl programs in a tmux * sessions. */ static bool tmux_stop_pg_autoctl(TmuxOptions *options) { bool success = true; int signals[] = { SIGTERM, SIGINT, SIGQUIT }; int signalsCount = sizeof(signals) / sizeof(signals[0]); /* signal processes using increasing levels of urge to quit now */ for (int s = 0; s < signalsCount; s++) { int countRunning = options->nodes + 1; for (int i = 0; i <= options->nodes; i++) { pid_t pid = 0; char name[MAXPGPATH] = { 0 }; char pgdata[MAXPGPATH] = { 0 }; if (i == options->nodes) { sformat(name, sizeof(name), "monitor"); } else { sformat(name, sizeof(name), "node%d", i + 1); } sformat(pgdata, sizeof(pgdata), "%s/%s", options->root, name); if (!pg_autoctl_getpid(pgdata, &pid)) { /* we don't have a pid */ log_info("No pidfile for pg_autoctl for node \"%s\"", name); --countRunning; continue; } if (kill(pid, 0) == -1 && errno == ESRCH) { log_info("Pid %d for node \"%s\" is not running anymore", pid, name); --countRunning; } else { log_info("Sending signal %s to pid %d for node \"%s\"", signal_to_string(signals[s]), pid, name); if (kill(pid, signals[s]) != 0) { log_info("Failed to send %s to pid %d", signal_to_string(signals[s]), 
pid); } } } if (countRunning == 0) { break; } /* sleep enough time that the processes might already be dead */ sleep(1); } return success; } /* * tmux_has_session runs the command `tmux has-session -f sessionName`. */ bool tmux_has_session(const char *tmux_path, const char *sessionName) { int returnCode; char command[BUFSIZE] = { 0 }; Program program = run_program(tmux_path, "has-session", "-t", sessionName, NULL); returnCode = program.returnCode; (void) snprintf_program_command_line(&program, command, BUFSIZE); log_debug("%s", command); if (program.stdOut) { char *outLines[BUFSIZE] = { 0 }; int lineCount = splitLines(program.stdOut, outLines, BUFSIZE); int lineNumber = 0; for (lineNumber = 0; lineNumber < lineCount; lineNumber++) { log_info("tmux has-session: %s", outLines[lineNumber]); } } if (program.stdErr) { char *errLines[BUFSIZE] = { 0 }; int lineCount = splitLines(program.stdOut, errLines, BUFSIZE); int lineNumber = 0; for (lineNumber = 0; lineNumber < lineCount; lineNumber++) { log_error("tmux has-session: %s", errLines[lineNumber]); } } free_program(&program); /* * From tmux has-session manual page: * * Report an error and exit with 1 if the specified session does not exist. * If it does exist, exit with 0. */ return returnCode == 0; } /* * tmux_kill_session runs the command: * tmux kill-session -t pgautofailover-${first-pgport} */ bool tmux_kill_session(TmuxOptions *options) { char sessionName[BUFSIZE] = { 0 }; sformat(sessionName, BUFSIZE, "pgautofailover-%d", options->firstPort); return tmux_kill_session_by_name(sessionName); } /* * tmux_kill_session_by_name kills a tmux session of the given name. 
*/ bool tmux_kill_session_by_name(const char *sessionName) { char tmux[MAXPGPATH] = { 0 }; char command[BUFSIZE] = { 0 }; bool success = true; if (!search_path_first("tmux", tmux, LOG_ERROR)) { log_fatal("Failed to find program tmux in PATH"); return false; } if (!tmux_has_session(tmux, sessionName)) { log_info("Tmux session \"%s\" does not exist", sessionName); return true; } Program program = run_program(tmux, "kill-session", "-t", sessionName, NULL); (void) snprintf_program_command_line(&program, command, BUFSIZE); log_info("%s", command); if (program.stdOut) { char *outLines[BUFSIZE] = { 0 }; int lineCount = splitLines(program.stdOut, outLines, BUFSIZE); int lineNumber = 0; for (lineNumber = 0; lineNumber < lineCount; lineNumber++) { log_info("tmux kill-session: %s", outLines[lineNumber]); } } if (program.stdErr) { char *errLines[BUFSIZE] = { 0 }; int lineCount = splitLines(program.stdErr, errLines, BUFSIZE); int lineNumber = 0; for (lineNumber = 0; lineNumber < lineCount; lineNumber++) { log_error("tmux kill-session: %s", errLines[lineNumber]); } } if (program.returnCode != 0) { success = false; log_warn("Failed to kill tmux sessions \"%s\"", sessionName); } free_program(&program); return success; } /* * tmux_process_options processes the tmux commands options. The main activity * here is to ensure that the "root" directory exists and normalize its * internal pathname in the options structure. */ void tmux_process_options(TmuxOptions *options) { log_debug("tmux_process_options"); log_debug("mkdir -p \"%s\"", options->root); if (pg_mkdir_p(options->root, 0700) == -1) { log_fatal("mkdir -p \"%s\": %m", options->root); exit(EXIT_CODE_INTERNAL_ERROR); } log_debug("normalize_filename \"%s\"", options->root); if (!normalize_filename(options->root, options->root, MAXPGPATH)) { /* errors have already been logged. 
*/ exit(EXIT_CODE_INTERNAL_ERROR); } log_debug("Real path for root directory: \"%s\"", options->root); } /* * tmux_cleanup_stale_directory cleans-up the pg_autoctl processes and then the * root directory of a tmux session, and then kills the tmux session. */ void tmux_cleanup_stale_directory(TmuxOptions *options) { if (!directory_exists(options->root)) { log_info("Directory \"%s\" does not exist, nothing to clean-up", options->root); return; } if (!normalize_filename(options->root, options->root, MAXPGPATH)) { /* errors have already been logged. */ exit(EXIT_CODE_INTERNAL_ERROR); } /* prepare the XDG environment */ if (!tmux_prepare_XDG_environment(options->root, false)) { exit(EXIT_CODE_INTERNAL_ERROR); } log_info("Checking for stale pg_autoctl process in \"%s\"", options->root); (void) tmux_stop_pg_autoctl(options); log_info("Removing stale directory: rm -rf \"%s\"", options->root); if (!rmtree(options->root, true)) { log_error("Failed to remove directory \"%s\": %m", options->root); exit(EXIT_CODE_INTERNAL_ERROR); } log_info("Checking for stale tmux session \"pgautofailover-%d\"", options->firstPort); if (!tmux_kill_session(options)) { exit(EXIT_CODE_INTERNAL_ERROR); } } /* * keeper_cli_tmux_script generates a tmux script to run a test case or a demo * for pg_auto_failover easily. 
*/ void cli_do_tmux_script(int argc, char **argv) { TmuxOptions options = tmuxOptions; PQExpBuffer script = createPQExpBuffer(); (void) tmux_process_options(&options); /* prepare the XDG environment */ if (!tmux_prepare_XDG_environment(options.root, true)) { exit(EXIT_CODE_INTERNAL_ERROR); } if (script == NULL) { log_error("Failed to allocate memory"); exit(EXIT_CODE_INTERNAL_ERROR); } /* prepare the tmux script */ (void) prepare_tmux_script(&options, script); /* memory allocation could have failed while building string */ if (PQExpBufferBroken(script)) { log_error("Failed to allocate memory"); destroyPQExpBuffer(script); exit(EXIT_CODE_INTERNAL_ERROR); } fformat(stdout, "%s", script->data); destroyPQExpBuffer(script); } /* * cli_do_tmux_session starts an interactive tmux session with the given * specifications for a cluster. When the session is detached, the pg_autoctl * processes are stopped. */ void cli_do_tmux_session(int argc, char **argv) { TmuxOptions options = tmuxOptions; PQExpBuffer script = createPQExpBuffer(); char scriptName[MAXPGPATH] = { 0 }; bool success = true; /* * We need to make sure we start from a clean slate. */ (void) tmux_cleanup_stale_directory(&options); /* * Write the script to "script-${first-pgport}.tmux" file in the root * directory. */ (void) tmux_process_options(&options); /* prepare the XDG environment */ if (!tmux_prepare_XDG_environment(options.root, true)) { exit(EXIT_CODE_INTERNAL_ERROR); } /* * Prepare the tmux script. */ if (script == NULL) { log_error("Failed to allocate memory"); exit(EXIT_CODE_INTERNAL_ERROR); } (void) prepare_tmux_script(&options, script); /* memory allocation could have failed while building string */ if (PQExpBufferBroken(script)) { log_error("Failed to allocate memory"); destroyPQExpBuffer(script); exit(EXIT_CODE_INTERNAL_ERROR); } /* * Write the script to file. 
*/ sformat(scriptName, sizeof(scriptName), "%s/script-%d.tmux", options.root, options.firstPort); log_info("Writing tmux session script \"%s\"", scriptName); if (!write_file(script->data, script->len, scriptName)) { log_fatal("Failed to write tmux script at \"%s\"", scriptName); exit(EXIT_CODE_INTERNAL_ERROR); } destroyPQExpBuffer(script); /* * Start a tmux session from the script. */ if (!tmux_start_server(scriptName, options.binpath)) { success = false; log_fatal("Failed to start the tmux session, see above for details"); } /* * Stop our pg_autoctl processes and kill the tmux session. */ log_info("tmux session ended: kill pg_autoct processes"); success = success && tmux_stop_pg_autoctl(&options); success = success && tmux_kill_session(&options); if (!success) { exit(EXIT_CODE_INTERNAL_ERROR); } } /* * cli_do_tmux_stop send termination signals on all the pg_autoctl process that * might be running in a tmux session. */ void cli_do_tmux_stop(int argc, char **argv) { TmuxOptions options = tmuxOptions; (void) tmux_process_options(&options); /* prepare the XDG environment */ if (!tmux_prepare_XDG_environment(options.root, false)) { exit(EXIT_CODE_INTERNAL_ERROR); } if (!tmux_stop_pg_autoctl(&options)) { exit(EXIT_CODE_INTERNAL_ERROR); } } /* * cli_do_tmux_clean cleans-up the pg_autoctl processes and then the root * directory of a tmux session, and then kills the tmux session. */ void cli_do_tmux_clean(int argc, char **argv) { TmuxOptions options = tmuxOptions; (void) tmux_cleanup_stale_directory(&options); } /* * cli_do_tmux_wait waits until given node name has been registered. When the * target node name is the "monitor" just wait until Postgres is running. 
*/ void cli_do_tmux_wait(int argc, char **argv) { TmuxOptions options = tmuxOptions; char nodeName[NAMEDATALEN] = { 0 }; NodeState targetState = INIT_STATE; (void) tmux_process_options(&options); /* prepare the XDG environment */ if (!tmux_prepare_XDG_environment(options.root, false)) { exit(EXIT_CODE_INTERNAL_ERROR); } switch (argc) { case 1: { strlcpy(nodeName, argv[0], sizeof(nodeName)); break; } case 2: { strlcpy(nodeName, argv[0], sizeof(nodeName)); targetState = NodeStateFromString(argv[1]); /* when we fail to parse the target state we wait 30s and exit */ if (targetState == NO_STATE) { sleep(30); exit(EXIT_CODE_INTERNAL_ERROR); } break; } default: { commandline_help(stderr); exit(EXIT_CODE_INTERNAL_ERROR); } } if (strcmp(nodeName, "monitor") == 0) { int timeout = 30; bool ready = false; char pgdata[MAXPGPATH] = { 0 }; sformat(pgdata, sizeof(pgdata), "%s/%s", options.root, nodeName); /* leave some time for initdb and stuff */ sleep(2); Program program = run_program(pg_autoctl_program, "do", "pgsetup", "wait", "--pgdata", pgdata, NULL); if (program.returnCode != 0) { char command[BUFSIZE]; (void) snprintf_program_command_line(&program, command, BUFSIZE); log_error("%s [%d]", command, program.returnCode); free_program(&program); exit(EXIT_CODE_INTERNAL_ERROR); } log_info("Postgres is running at \"%s\"", pgdata); /* Postgres is running on the monitor, is it ready though? */ while (!ready && timeout > 0) { char command[BUFSIZE]; Program showUri = run_program(pg_autoctl_program, "show", "uri", "--monitor", "--pgdata", pgdata, NULL); (void) snprintf_program_command_line(&showUri, command, BUFSIZE); log_info("%s [%d]", command, showUri.returnCode); ready = showUri.returnCode == 0; --timeout; if (ready) { log_info("The monitor is ready at: %s", showUri.stdOut); } free_program(&showUri); } free_program(&program); if (!ready) { exit(EXIT_CODE_INTERNAL_ERROR); } } else { /* * Not a monitor node: only wait until the node has been registered to * the monitor. 
We know that the node has been registered when a state * file exists. */ int timeout = 60; char pgdata[MAXPGPATH] = { 0 }; ConfigFilePaths pathnames = { 0 }; sformat(pgdata, sizeof(pgdata), "%s/%s", options.root, nodeName); log_info("Waiting for a node state file for PGDATA \"%s\"", pgdata); /* when waiting for PRIMARY or some other state, raise the timeout */ if (targetState == INIT_STATE) { timeout = 60; } else { timeout = 120; } while (timeout > 0) { if (IS_EMPTY_STRING_BUFFER(pathnames.state)) { if (keeper_config_set_pathnames_from_pgdata(&pathnames, pgdata)) { log_info("Waiting for creation of a state file at \"%s\"", pathnames.state); } } if (file_exists(pathnames.state)) { KeeperStateData keeperState = { 0 }; if (keeper_state_read(&keeperState, pathnames.state)) { if (targetState == INIT_STATE && keeperState.assigned_role > targetState) { log_info("Node \"%s\" is now assigned %s, done waiting", nodeName, NodeStateToString(keeperState.assigned_role)); break; } if (keeperState.assigned_role == targetState && keeperState.current_role == targetState) { log_info("Node \"%s\" is currently %s/%s, done waiting", nodeName, NodeStateToString(keeperState.current_role), NodeStateToString(keeperState.assigned_role)); break; } log_info("Node \"%s\" is currently %s/%s, waiting for %s", nodeName, NodeStateToString(keeperState.current_role), NodeStateToString(keeperState.assigned_role), NodeStateToString(targetState)); } else { log_info("Waiting for node \"%s\" to be registered", nodeName); } } sleep(1); --timeout; } /* we might have reached the timeout */ if (!file_exists(pathnames.state)) { log_fatal("Reached timeout while waiting for state file \"%s\"", pathnames.state); exit(EXIT_CODE_INTERNAL_ERROR); } } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_do_tmux.h000066400000000000000000000060031414244367200232330ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_do_tmux.h * Implementation of a CLI which lets you run operations on the local * postgres server 
directly. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef CLI_DO_TMUX_H #define CLI_DO_TMUX_H #include "postgres_fe.h" #include "pqexpbuffer.h" #include "snprintf.h" #include "cli_common.h" #include "cli_do_root.h" #include "cli_root.h" #include "commandline.h" #include "config.h" #include "env_utils.h" #include "log.h" #include "pidfile.h" #include "signals.h" #include "string_utils.h" #define MAX_NODES 12 typedef struct TmuxOptions { char root[MAXPGPATH]; int firstPort; int nodes; /* number of nodes per groups, total */ int asyncNodes; /* number of async nodes, within the total */ int priorities[MAX_NODES]; /* node priorities */ int numSync; /* number-sync-standbys */ bool skipHBA; /* do we want to use --skip-pg-hba? */ char layout[BUFSIZE]; char binpath[MAXPGPATH]; } TmuxOptions; typedef struct TmuxNode { char name[NAMEDATALEN]; int pgport; bool replicationQuorum; int candidatePriority; } TmuxNode; typedef struct TmuxNodeArray { int count; /* array actual size */ int numSync; /* number-sync-standbys */ TmuxNode nodes[MAX_NODES]; } TmuxNodeArray; extern TmuxOptions tmuxOptions; extern TmuxNodeArray tmuxNodeArray; bool parseCandidatePriorities(char *prioritiesString, int *priorities); void tmux_add_command(PQExpBuffer script, const char *fmt, ...) __attribute__((format(printf, 2, 3))); void tmux_add_send_keys_command(PQExpBuffer script, const char *fmt, ...) 
__attribute__((format(printf, 2, 3))); bool tmux_has_session(const char *tmux_path, const char *sessionName); void tmux_add_new_session(PQExpBuffer script, const char *root, int pgport); void tmux_add_xdg_environment(PQExpBuffer script); void tmux_setenv(PQExpBuffer script, const char *sessionName, const char *root, int firstPort); bool tmux_prepare_XDG_environment(const char *root, bool createDirectories); void tmux_pg_autoctl_create_monitor(PQExpBuffer script, const char *root, const char *binpath, int pgport, bool skipHBA); void tmux_pg_autoctl_create_postgres(PQExpBuffer script, const char *root, const char *binpath, int pgport, const char *name, bool replicationQuorum, int candidatePriority, bool skipHBA); bool tmux_start_server(const char *scriptName, const char *binpath); bool pg_autoctl_getpid(const char *pgdata, pid_t *pid); bool tmux_has_session(const char *tmux_path, const char *sessionName); bool tmux_attach_session(const char *tmux_path, const char *sessionName); bool tmux_kill_session(TmuxOptions *options); bool tmux_kill_session_by_name(const char *sessionName); void tmux_process_options(TmuxOptions *options); void tmux_cleanup_stale_directory(TmuxOptions *options); #endif /* CLI_DO_TMUX_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_do_tmux_azure.c000066400000000000000000000153741414244367200244470ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_do_tmux_azure.c * * Implementation of commands that create a tmux session to connect to a * set of Azure VMs where we run pg_autoctl nodes for QA and testing. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include #include #include #include "postgres_fe.h" #include "pqexpbuffer.h" #include "snprintf.h" #include "azure.h" #include "azure_config.h" #include "cli_common.h" #include "cli_do_root.h" #include "cli_do_tmux.h" #include "cli_root.h" #include "commandline.h" #include "config.h" #include "env_utils.h" #include "log.h" #include "parsing.h" #include "pidfile.h" #include "signals.h" #include "string_utils.h" #include "runprogram.h" static void tmux_azure_new_session(PQExpBuffer script, AzureRegionResources *azRegion); static void tmux_azure_deploy(PQExpBuffer script, AzureRegionResources *azRegion, const char *vmName); static void tmux_azure_ssh(PQExpBuffer script, AzureRegionResources *azRegion, const char *vmName); static void tmux_azure_systemctl_status(PQExpBuffer script, AzureRegionResources *azRegion); static void prepare_tmux_azure_script(AzureRegionResources *azRegion, PQExpBuffer script); /* * tmux_azure_new_session appends a new-session command to the given tmux * script buffer, using the azure group name for the tmux session name. */ static void tmux_azure_new_session(PQExpBuffer script, AzureRegionResources *azRegion) { tmux_add_command(script, "new-session -s %s", azRegion->group); } /* * tmux_azure_deploy_postgres appends a pg_autoctl do azure deploy command for * the given vmName to the given script buffer. */ static void tmux_azure_deploy(PQExpBuffer script, AzureRegionResources *azRegion, const char *vmName) { tmux_add_send_keys_command(script, "%s do azure deploy %s", pg_autoctl_argv0, vmName); } /* * tmux_azure_ssh appends a pg_autoctl do azure ssh command for the given * vmName to the given script buffer. 
*/ static void tmux_azure_ssh(PQExpBuffer script, AzureRegionResources *azRegion, const char *vmName) { tmux_add_send_keys_command(script, "%s do azure ssh %s", pg_autoctl_argv0, vmName); } /* * tmux_azure_ssh appends a pg_autoctl do azure ssh command for the given * vmName to the given script buffer. */ static void tmux_azure_systemctl_status(PQExpBuffer script, AzureRegionResources *azRegion) { tmux_add_send_keys_command(script, "systemctl status pgautofailover"); } /* * tmux_add_environment appends the export VAR=value commands that we need to * set the environment for pg_autoctl do azure deploy in the shell windows. */ static void tmux_azure_add_environment(PQExpBuffer script, KeyVal *env) { for (int i = 0; i < env->count; i++) { tmux_add_send_keys_command(script, "export %s=%s", env->keywords[i], env->values[i]); } } /* * prepare_tmux_script prepares a script for a tmux session with the given * azure region resources. */ static void prepare_tmux_azure_script(AzureRegionResources *azRegion, PQExpBuffer script) { KeyVal env = { 0 }; /* fetch environment and defaults for versions */ if (!azure_prepare_target_versions(&env)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } tmux_add_command(script, "set-option -g default-shell /bin/bash"); tmux_azure_new_session(script, azRegion); /* deploy VMs each in a new tmux window */ for (int vmIndex = 0; vmIndex <= azRegion->nodes; vmIndex++) { const char *vmName = azRegion->vmArray[vmIndex].name; /* after the first VM, create new tmux windows for each VM */ if (vmIndex > 0) { tmux_add_command(script, "split-window -v"); tmux_add_command(script, "select-layout even-vertical"); } tmux_azure_add_environment(script, &env); tmux_azure_deploy(script, azRegion, vmName); tmux_azure_ssh(script, azRegion, vmName); tmux_azure_systemctl_status(script, azRegion); } /* add a window for pg_autoctl show state */ tmux_add_command(script, "split-window -v"); tmux_add_command(script, "select-layout even-vertical"); 
tmux_add_send_keys_command(script, "%s do azure show state --watch", pg_autoctl_argv0); /* add a window for interactive pg_autoctl commands */ tmux_add_command(script, "split-window -v"); tmux_add_command(script, "select-layout even-vertical"); tmux_add_send_keys_command(script, "%s do azure show ips", pg_autoctl_argv0); } /* * cli_do_azure_tmux_session starts a new tmux session for the given azure * region and resources, or attach an existing session that might be running in * the background already. */ bool tmux_azure_start_or_attach_session(AzureRegionResources *azRegion) { char tmux[MAXPGPATH] = { 0 }; PQExpBuffer script; char scriptName[MAXPGPATH] = { 0 }; if (setenv("PG_AUTOCTL_DEBUG", "1", 1) != 0) { log_error("Failed to set environment PG_AUTOCTL_DEBUG: %m"); return false; } if (!search_path_first("tmux", tmux, LOG_ERROR)) { log_fatal("Failed to find program tmux in PATH"); return false; } /* we might just re-use a pre-existing tmux session */ if (!dryRun && tmux_has_session(tmux, azRegion->group)) { return tmux_attach_session(tmux, azRegion->group); } /* * Okay, so we have to create the session now. And for that we need the IP * addresses of the target VMs. */ if (!azure_fetch_ip_addresses(azRegion->group, azRegion->vmArray)) { /* errors have already been logged */ return false; } script = createPQExpBuffer(); if (script == NULL) { log_error("Failed to allocate memory"); return false; } /* prepare the tmux script */ (void) prepare_tmux_azure_script(azRegion, script); /* * Start a tmux session from the script. 
*/ if (dryRun) { fformat(stdout, "%s", script->data); destroyPQExpBuffer(script); } else { /* write the tmux script to file */ sformat(scriptName, sizeof(scriptName), "%s.tmux", azRegion->group); log_info("Writing tmux session script \"%s\"", scriptName); if (!write_file(script->data, script->len, scriptName)) { log_fatal("Failed to write tmux script at \"%s\"", scriptName); destroyPQExpBuffer(script); return false; } if (!tmux_start_server(scriptName, NULL)) { log_fatal("Failed to start the tmux session, see above for details"); destroyPQExpBuffer(script); return false; } } return true; } /* * tmux_azure_kill_session kills a tmux session for the given QA setup, when * the tmux session already exists. */ bool tmux_azure_kill_session(AzureRegionResources *azRegion) { return tmux_kill_session_by_name(azRegion->group); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_drop_node.c000066400000000000000000000630541414244367200235310ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_drop_node.c * Implementation of the pg_autoctl create and pg_autoctl drop CLI for the * pg_auto_failover nodes (monitor, coordinator, worker, postgres). * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include #include "postgres_fe.h" #include "cli_common.h" #include "commandline.h" #include "env_utils.h" #include "defaults.h" #include "fsm.h" #include "ini_file.h" #include "ipaddr.h" #include "keeper_config.h" #include "keeper_pg_init.h" #include "keeper.h" #include "monitor.h" #include "monitor_config.h" #include "monitor_pg_init.h" #include "pgctl.h" #include "pghba.h" #include "pidfile.h" #include "primary_standby.h" #include "service_keeper.h" #include "service_keeper_init.h" #include "service_monitor.h" #include "service_monitor_init.h" #include "signals.h" #include "string_utils.h" /* * Global variables that we're going to use to "communicate" in between getopts * functions and their command implementation. We can't pass parameters around. */ bool dropAndDestroy = false; static bool dropForce = false; static void cli_drop_monitor(int argc, char **argv); static void cli_drop_local_monitor(MonitorConfig *mconfig, bool dropAndDestroy); static void cli_drop_node_with_monitor_disabled(KeeperConfig *config, bool dropAndDestroy); static void cli_drop_node_files_and_directories(KeeperConfig *config); static void stop_postgres_and_remove_pgdata_and_config(ConfigFilePaths *pathnames, PostgresSetup *pgSetup); static void cli_drop_node_from_monitor_and_wait(KeeperConfig *config); CommandLine drop_monitor_command = make_command("monitor", "Drop the pg_auto_failover monitor", "[ --pgdata --destroy ]", " --pgdata path to data directory\n" " --destroy also destroy Postgres database\n", cli_drop_node_getopts, cli_drop_monitor); CommandLine drop_node_command = make_command( "node", "Drop a node from the pg_auto_failover monitor", "[ [ [ --pgdata ] [ --destroy ] ] | " "[ --monitor [ [ --hostname --pgport ] | [ --formation --name ] ] ] ] ", " --pgdata path to data directory\n" " --monitor pg_auto_failover Monitor Postgres URL\n" " --formation pg_auto_failover formation\n" " --name drop the node with the given node name\n" " 
--hostname drop the node with given hostname and pgport\n" " --pgport drop the node with given hostname and pgport\n" " --destroy also destroy Postgres database\n" " --force force dropping the node from the monitor\n" " --wait how many seconds to wait, default to 60 \n", cli_drop_node_getopts, cli_drop_node); /* * cli_drop_node_getopts parses the command line options necessary to drop or * destroy a local pg_autoctl node. */ int cli_drop_node_getopts(int argc, char **argv) { KeeperConfig options = { 0 }; int c, option_index = 0; int verboseCount = 0; static struct option long_options[] = { { "pgdata", required_argument, NULL, 'D' }, { "monitor", required_argument, NULL, 'm' }, { "destroy", no_argument, NULL, 'd' }, { "force", no_argument, NULL, 'F' }, { "hostname", required_argument, NULL, 'n' }, { "pgport", required_argument, NULL, 'p' }, { "formation", required_argument, NULL, 'f' }, { "wait", required_argument, NULL, 'w' }, { "name", required_argument, NULL, 'a' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; optind = 0; options.listen_notifications_timeout = PG_AUTOCTL_LISTEN_NOTIFICATIONS_TIMEOUT; while ((c = getopt_long(argc, argv, "D:dn:p:Vvqh", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'm': { if (!validate_connection_string(optarg)) { log_fatal("Failed to parse --monitor connection string, " "see above for details."); exit(EXIT_CODE_BAD_ARGS); } strlcpy(options.monitor_pguri, optarg, MAXCONNINFO); log_trace("--monitor %s", options.monitor_pguri); break; } case 'd': { dropAndDestroy = true; log_trace("--destroy"); break; } case 'F': { dropForce = true; log_trace("--force"); break; } case 'n': { strlcpy(options.hostname, optarg, _POSIX_HOST_NAME_MAX); log_trace("--hostname %s", options.hostname); 
break; } case 'p': { if (!stringToInt(optarg, &options.pgSetup.pgport)) { log_fatal("--pgport argument is a valid port number: \"%s\"", optarg); exit(EXIT_CODE_BAD_ARGS); } log_trace("--pgport %d", options.pgSetup.pgport); break; } case 'f': { strlcpy(options.formation, optarg, NAMEDATALEN); log_trace("--formation %s", options.formation); break; } case 'a': { /* { "name", required_argument, NULL, 'a' }, */ strlcpy(options.name, optarg, _POSIX_HOST_NAME_MAX); log_trace("--name %s", options.name); break; } case 'w': { /* { "wait", required_argument, NULL, 'w' }, */ if (!stringToInt(optarg, &options.listen_notifications_timeout)) { log_fatal("--wait argument is not a valid timeout: \"%s\"", optarg); exit(EXIT_CODE_BAD_ARGS); } log_trace("--wait %d", options.listen_notifications_timeout); break; } case 'V': { /* keeper_cli_print_version prints version and exits. */ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } default: { /* getopt_long already wrote an error message */ commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); break; } } } if (dropAndDestroy && (!IS_EMPTY_STRING_BUFFER(options.hostname) || options.pgSetup.pgport != 0)) { log_error("Please use either [ --hostname --pgport ] " " or [ --formation --name ] to target a remote node, " " or --destroy to destroy the local node."); log_info("Destroying a node is not supported from a distance"); exit(EXIT_CODE_BAD_ARGS); } /* now that we have the command line parameters, prepare the options */ /* when we have a monitor URI we don't need PGDATA */ if (cli_use_monitor_option(&options)) { if (!IS_EMPTY_STRING_BUFFER(options.pgSetup.pgdata)) { log_warn("Given --monitor URI, the --pgdata option is ignored"); 
log_info("Connecting to monitor at \"%s\"", options.monitor_pguri); /* the rest of the program needs pgdata actually empty */ bzero((void *) options.pgSetup.pgdata, sizeof(options.pgSetup.pgdata)); } } else { (void) prepare_keeper_options(&options); } /* * pg_autoctl drop node can be used with one of those set of arguments: * --pgdata ... # to drop the local node * --pgdata # to drop any node from the monitor * --formation ... --name ... # address a node on the monitor * --hostname ... --pgport ... # address a node on the monitor * * We check about the PGDATA being related to a monitor or a keeper later, * here we focus on the optargs. Remember that --formation can be skipped * to mean "default", and --pgport can be skipped to mean either PGPORT * from the environment or just 5432. */ if (!IS_EMPTY_STRING_BUFFER(options.name) && !IS_EMPTY_STRING_BUFFER(options.hostname)) { log_fatal("pg_autoctl drop node target can either be specified " "using [ --formation --name ], or " "using [ --hostname and --pgport ], but not both."); exit(EXIT_CODE_BAD_ARGS); } /* use the "default" formation when not given */ if (IS_EMPTY_STRING_BUFFER(options.formation)) { strlcpy(options.formation, FORMATION_DEFAULT, NAMEDATALEN); } /* publish our option parsing in the global variable */ keeperOptions = options; return optind; } /* * cli_drop_node removes the local PostgreSQL node from the pg_auto_failover * monitor, and when it's a worker, from the Citus coordinator too. */ void cli_drop_node(int argc, char **argv) { KeeperConfig config = keeperOptions; pgAutoCtlNodeRole localNodeRole = IS_EMPTY_STRING_BUFFER(config.pgSetup.pgdata) ? 
PG_AUTOCTL_ROLE_UNKNOWN : ProbeConfigurationFileRole(config.pathnames.config); bool dropLocalNode = !IS_EMPTY_STRING_BUFFER(config.pgSetup.pgdata) && localNodeRole == PG_AUTOCTL_ROLE_KEEPER; /* * The configuration file is the last bit we remove, so we don't have to * implement "continue from previous failed attempt" when the configuration * file does not exist. */ if (dropLocalNode && !file_exists(config.pathnames.config)) { log_error("Failed to find expected configuration file \"%s\"", config.pathnames.config); exit(EXIT_CODE_BAD_CONFIG); } if (dropLocalNode) { bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; if (!IS_EMPTY_STRING_BUFFER(config.hostname) || config.pgSetup.pgport != 0) { log_fatal("Only dropping the local node is supported, " "[ --hostname --pgport ] are not supported " "when --pgdata is used."); log_info("To drop another node, please use this command " "from the monitor itself."); exit(EXIT_CODE_BAD_ARGS); } if (!IS_EMPTY_STRING_BUFFER(config.name)) { log_fatal("Only dropping the local node is supported, " "[ --formation --name ] are not supported " "when --pgdata is used."); log_info("To drop another node, please use this command " "from the monitor itself."); exit(EXIT_CODE_BAD_ARGS); } /* just read the keeper file in given KeeperConfig */ if (!keeper_config_read_file(&config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { exit(EXIT_CODE_BAD_CONFIG); } /* now drop the local node files, and maybe --destroy PGDATA */ (void) cli_drop_local_node(&config, dropAndDestroy); return; } else { /* pg_autoctl drop node on the monitor drops another node */ if (IS_EMPTY_STRING_BUFFER(config.name) && IS_EMPTY_STRING_BUFFER(config.hostname)) { log_fatal("pg_autoctl drop node target can either be specified " "using [ --formation --name ], or " "using [ --hostname and --pgport ], " "please use either one."); exit(EXIT_CODE_BAD_ARGS); } (void) cli_drop_node_from_monitor_and_wait(&config); } } /* * 
cli_drop_monitor removes the local monitor node. */ static void cli_drop_monitor(int argc, char **argv) { KeeperConfig config = keeperOptions; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; /* * The configuration file is the last bit we remove, so we don't have to * implement "continue from previous failed attempt" when the configuration * file does not exist. */ if (!file_exists(config.pathnames.config)) { log_error("Failed to find expected configuration file \"%s\"", config.pathnames.config); exit(EXIT_CODE_BAD_CONFIG); } /* * We are going to need to use the right pg_ctl binary to control the * Postgres cluster: pg_ctl stop. */ switch (ProbeConfigurationFileRole(config.pathnames.config)) { case PG_AUTOCTL_ROLE_MONITOR: { MonitorConfig mconfig = { 0 }; if (!monitor_config_init_from_pgsetup(&mconfig, &(config.pgSetup), missingPgdataIsOk, pgIsNotRunningIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } /* expose the pgSetup in the given KeeperConfig */ config.pgSetup = mconfig.pgSetup; /* somehow at this point we've lost our pathnames */ if (!keeper_config_set_pathnames_from_pgdata( &(config.pathnames), config.pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } /* drop the node and maybe destroy its PGDATA entirely. */ (void) cli_drop_local_monitor(&mconfig, dropAndDestroy); return; } case PG_AUTOCTL_ROLE_KEEPER: { log_fatal("Local node is not a monitor"); exit(EXIT_CODE_BAD_CONFIG); break; } default: { log_fatal("Unrecognized configuration file \"%s\"", config.pathnames.config); exit(EXIT_CODE_BAD_CONFIG); } } } /* * cli_drop_node_from_monitor calls pgautofailover.remove_node() on the monitor * for the given --hostname and --pgport, or from the given --formation and * --name. 
*/ void cli_drop_node_from_monitor(KeeperConfig *config, int64_t *nodeId, int *groupId) { Monitor monitor = { 0 }; (void) cli_monitor_init_from_option_or_config(&monitor, config); if (!IS_EMPTY_STRING_BUFFER(config->name)) { log_info("Removing node with name \"%s\" in formation \"%s\" " "from the monitor", config->name, config->formation); if (!monitor_remove_by_nodename(&monitor, (char *) config->formation, (char *) config->name, dropForce, nodeId, groupId)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } } else if (!IS_EMPTY_STRING_BUFFER(config->hostname)) { int pgport = config->pgSetup.pgport > 0 ? config->pgSetup.pgport : pgsetup_get_pgport(); log_info("Removing node with hostname \"%s\" and port %d " "in formation \"%s\" from the monitor", config->hostname, pgport, config->formation); if (!monitor_remove_by_hostname(&monitor, (char *) config->hostname, pgport, dropForce, nodeId, groupId)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } } else { log_fatal("BUG: cli_drop_node_from_monitor options contain " " neither --name nor --hostname"); exit(EXIT_CODE_BAD_ARGS); } } /* * cli_drop_local_node drops the local node files, maybe including the PGDATA * directory (when --destroy has been used). */ void cli_drop_local_node(KeeperConfig *config, bool dropAndDestroy) { Keeper keeper = { 0 }; Monitor *monitor = &(keeper.monitor); KeeperStateData *keeperState = &(keeper.state); keeper.config = *config; if (config->monitorDisabled) { (void) cli_drop_node_with_monitor_disabled(config, dropAndDestroy); /* make sure we're done now */ exit(EXIT_CODE_QUIT); } (void) cli_monitor_init_from_option_or_config(monitor, config); /* * First, read the state file and check that it has been assigned the * DROPPED state already. 
*/ if (!keeper_state_read(keeperState, config->pathnames.state)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_STATE); } /* first drop the node from the monitor */ if (keeperState->assigned_role != DROPPED_STATE) { int64_t nodeId = -1; int groupId = -1; (void) cli_drop_node_from_monitor(config, &nodeId, &groupId); } /* * Now, when the pg_autoctl keeper service is still running, wait until * it has reached the DROPPED/DROPPED state on-disk and then exited. */ pid_t pid = 0; /* * Before continuing we need to make sure that a currently running service * has stopped. */ bool stopped; if (dropForce) { /* * If --force is used, we skip the transition to "dropped". So a * currently running process won't realise it's dropped, which means it * will not exit by itself. Thus all we need to know is if it's running * now or not. */ if (!is_process_stopped(config->pathnames.pid, &stopped, &pid)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } } else { /* * If --force isn't used then a running pg_autoctl process will detect * that it is dropped and clean itself up nicely and finally it will * exit. We give the process 30 seconds to exit by itself. */ if (!wait_for_process_to_stop(config->pathnames.pid, 30, &stopped, &pid)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } } /* * If the service is not stopped yet, we just want to process to exit * so we can take over. This can happen either because --force was used * or because 30 seconds was not enough time for the service to exit. 
*/ if (!stopped) { /* if the service isn't terminated, signal it to quit now */ log_info("Sending signal %s to pg_autoctl process %d", signal_to_string(SIGQUIT), pid); if (kill(pid, SIGQUIT) != 0) { log_error("Failed to send SIGQUIT to the keeper's pid %d: %m", pid); exit(EXIT_CODE_INTERNAL_ERROR); } if (!wait_for_process_to_stop(config->pathnames.pid, 30, &stopped, &pid) || !stopped) { log_fatal("Failed to stop the pg_autoctl process with pid %d", pid); exit(EXIT_CODE_INTERNAL_ERROR); } } /* * If the pg_autoctl keeper service was running at the beginning of this * pg_autoctl drop node command, it should have reached the local DROPPED * state already, and reported that to the monitor. But the process could * have failed to communicate with the monitor, too. * * Also, if the pg_autoctl keeper service was not running, then we need to * report that we've reached DROPPED state to the monitor now. */ bool dropped = false; if (keeper_ensure_node_has_been_dropped(&keeper, &dropped) && dropped) { log_info("This node with id %lld in formation \"%s\" and group %d " "has been dropped from the monitor", (long long) keeperState->current_node_id, config->formation, config->groupId); } else { log_fatal("Failed to ensure that the local node with id %lld " "in formation \"%s\" and group %d has been removed " "from the monitor", (long long) keeperState->current_node_id, config->formation, config->groupId); exit(EXIT_CODE_MONITOR); } /* * Either --destroy the whole Postgres cluster and configuration, or leave * enough behind us that it's possible to re-join a formation later. */ if (dropAndDestroy) { (void) cli_drop_node_files_and_directories(config); } else { /* * Now give the whole picture to the user, who might have missed our * --destroy option and might want to use it now to start again with a * fresh environment. 
*/ log_warn("Preserving configuration file: \"%s\"", config->pathnames.config); if (directory_exists(config->pgSetup.pgdata)) { log_warn("Preserving Postgres Data Directory: \"%s\"", config->pgSetup.pgdata); } log_info("pg_autoctl drop node keeps your data and setup safe, " "you can still run Postgres or re-join a pg_auto_failover " "cluster later"); log_info("HINT: to completely remove your local Postgres instance and " "setup, consider `pg_autoctl drop node --destroy`"); } } /* * cli_drop_node_with_monitor_disabled implements pg_autoctl drop node for a * node that runs without a pg_auto_failover monitor. */ static void cli_drop_node_with_monitor_disabled(KeeperConfig *config, bool dropAndDestroy) { log_trace("cli_drop_node_with_monitor_disabled"); if (dropAndDestroy) { pid_t pid = 0; /* first stop the pg_autoctl service if it's running */ if (read_pidfile(config->pathnames.pid, &pid)) { if (kill(pid, SIGQUIT) != 0) { log_error( "Failed to send SIGQUIT to the keeper's pid %d: %m", pid); exit(EXIT_CODE_INTERNAL_ERROR); } bool stopped; if (!wait_for_process_to_stop(config->pathnames.pid, 30, &stopped, &pid) || !stopped) { log_fatal( "Failed to stop the pg_autoctl process with pid %d", pid); exit(EXIT_CODE_INTERNAL_ERROR); } } (void) cli_drop_node_files_and_directories(config); } else { log_fatal("pg_autoctl drop node is not supported when " "the monitor is disabled"); log_info("Consider using the --destroy option"); exit(EXIT_CODE_BAD_ARGS); } exit(EXIT_CODE_QUIT); } /* * cli_drop_node_files_and_directories removes the state files, configuration * files, and the PGDATA directory. 
*/ static void cli_drop_node_files_and_directories(KeeperConfig *config) { /* Now remove the state files */ if (!unlink_file(config->pathnames.init)) { log_error("Failed to remove state init file \"%s\"", config->pathnames.init); } if (!unlink_file(config->pathnames.state)) { log_error("Failed to remove state file \"%s\"", config->pathnames.state); } (void) stop_postgres_and_remove_pgdata_and_config( &config->pathnames, &config->pgSetup); } /* * cli_drop_local_monitor drops the local monitor files, maybe including the * PGDATA directory (when --destroy has been used). */ static void cli_drop_local_monitor(MonitorConfig *mconfig, bool dropAndDestroy) { /* stop the monitor service if it's still running */ pid_t pid = 0; if (read_pidfile(mconfig->pathnames.pid, &pid)) { if (kill(pid, SIGQUIT) != 0) { log_error("Failed to send SIGQUIT to the keeper's pid %d: %m", pid); exit(EXIT_CODE_INTERNAL_ERROR); } bool stopped; if (!wait_for_process_to_stop(mconfig->pathnames.pid, 30, &stopped, &pid) || !stopped) { log_fatal("Failed to stop the pg_autoctl process with pid %d", pid); exit(EXIT_CODE_INTERNAL_ERROR); } } else { /* if we can't read a pidfile that exists on-disk, fail early */ if (file_exists(mconfig->pathnames.pid)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_STATE); } } /* * Either --destroy the whole Postgres cluster and configuration, or leave * enough behind us that it's possible to re-join a formation later. */ if (dropAndDestroy) { if (!unlink_file(mconfig->pathnames.state)) { log_error("Failed to remove state file \"%s\"", mconfig->pathnames.state); } (void) stop_postgres_and_remove_pgdata_and_config( &mconfig->pathnames, &mconfig->pgSetup); } else { /* * Now give the whole picture to the user, who might have missed our * --destroy option and might want to use it now to start again with a * fresh environment. 
*/ log_warn("Preserving configuration file: \"%s\"", mconfig->pathnames.config); if (directory_exists(mconfig->pgSetup.pgdata)) { log_warn("Preserving Postgres Data Directory: \"%s\"", mconfig->pgSetup.pgdata); } log_info("pg_autoctl drop node keeps your data and setup safe, " "you can still run Postgres or re-join a pg_auto_failover " "cluster later"); log_info("HINT: to completely remove your local Postgres instance and " "setup, consider `pg_autoctl drop node --destroy`"); } } /* * stop_postgres_and_remove_pgdata_and_config stops PostgreSQL and then removes * PGDATA, and then config and state files. */ static void stop_postgres_and_remove_pgdata_and_config(ConfigFilePaths *pathnames, PostgresSetup *pgSetup) { log_info("Stopping PostgreSQL at \"%s\"", pgSetup->pgdata); if (!pg_ctl_stop(pgSetup->pg_ctl, pgSetup->pgdata)) { log_error("Failed to stop PostgreSQL at \"%s\"", pgSetup->pgdata); log_fatal("Skipping removal of directory \"%s\"", pgSetup->pgdata); exit(EXIT_CODE_PGCTL); } /* * Only try to rm -rf PGDATA if we managed to stop PostgreSQL. */ if (directory_exists(pgSetup->pgdata)) { log_info("Removing \"%s\"", pgSetup->pgdata); if (!rmtree(pgSetup->pgdata, true)) { log_error("Failed to remove directory \"%s\": %m", pgSetup->pgdata); exit(EXIT_CODE_INTERNAL_ERROR); } } else { log_warn("Skipping removal of \"%s\": directory does not exist", pgSetup->pgdata); } log_info("Removing \"%s\"", pathnames->config); if (!unlink_file(pathnames->config)) { /* errors have already been logged. */ exit(EXIT_CODE_BAD_CONFIG); } } /* * cli_drop_node_from_monitor_and_wait waits until the node doesn't exist * anymore on the monitor, meaning it's been fully dropped now. 
*/ static void cli_drop_node_from_monitor_and_wait(KeeperConfig *config) { bool dropped = false; Monitor monitor = { 0 }; (void) cli_monitor_init_from_option_or_config(&monitor, config); /* call pgautofailover.remove_node() on the monitor */ int64_t nodeId; int groupId; (void) cli_drop_node_from_monitor(config, &nodeId, &groupId); /* if the timeout is zero, just don't wait at all */ if (config->listen_notifications_timeout == 0) { return; } log_info("Waiting until the node with id %lld in group %d has been " "dropped from the monitor, or for %ds, whichever comes first", (long long) nodeId, groupId, config->listen_notifications_timeout); uint64_t start = time(NULL); /* establish a connection for notifications if none present */ (void) pgsql_prepare_to_wait(&(monitor.notificationClient)); while (!dropped) { NodeAddressArray nodesArray = { 0 }; bool groupStateHasChanged = false; int timeoutMs = PG_AUTOCTL_KEEPER_SLEEP_TIME * 1000; uint64_t now = time(NULL); if ((now - start) > config->listen_notifications_timeout) { log_error("Failed to wait until the node has been dropped"); exit(EXIT_CODE_INTERNAL_ERROR); } (void) monitor_wait_for_state_change(&monitor, config->formation, groupId, nodeId, timeoutMs, &groupStateHasChanged); if (!monitor_find_node_by_nodeid(&monitor, config->formation, groupId, nodeId, &nodesArray)) { log_error("Failed to query monitor to see if node id %lld " "has been dropped already", (long long) nodeId); exit(EXIT_CODE_MONITOR); } dropped = nodesArray.count == 0; if (dropped) { log_info("Node with id %lld in group %d has been successfully " "dropped from the monitor", (long long) nodeId, groupId); } } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_enable_disable.c000066400000000000000000001247071414244367200244740ustar00rootroot00000000000000/* * cli_enable_disable.c * Implementation of pg_autoctl enable and disable CLI sub-commands. 
 * Current features that can be enabled and their scope are:
 *  - secondary (scope: formation)
 *
 * Copyright (c) Microsoft Corporation. All rights reserved.
 * Licensed under the PostgreSQL License.
 *
 */

/*
 * NOTE(review): the three system header names below were stripped by the
 * extraction that produced this copy (the <...> part is missing) -- restore
 * them from the upstream file before compiling.
 */
#include
#include
#include

#include "cli_common.h"
#include "commandline.h"
#include "env_utils.h"
#include "fsm.h"
#include "keeper_config.h"
#include "log.h"
#include "monitor.h"
#include "parsing.h"
#include "pgsetup.h"

/* set by --allow-failover; shared by the maintenance and monitor getopts */
static bool allowFailover = false;

/* set by --force in `pg_autoctl disable monitor` */
static bool optForce = false;

static int cli_secondary_getopts(int argc, char **argv);
static void cli_enable_secondary(int argc, char **argv);
static void cli_disable_secondary(int argc, char **argv);

static int cli_maintenance_getopts(int argc, char **argv);
static void cli_enable_maintenance(int argc, char **argv);
static void cli_disable_maintenance(int argc, char **argv);

static int cli_ssl_getopts(int argc, char **argv);
static void cli_enable_ssl(int argc, char **argv);
static void cli_disable_ssl(int argc, char **argv);

static int cli_enable_monitor_getopts(int argc, char **argv);
static int cli_disable_monitor_getopts(int argc, char **argv);
static void cli_enable_monitor(int argc, char **argv);
static void cli_disable_monitor(int argc, char **argv);

static bool update_ssl_configuration(LocalPostgresServer *postgres,
									 const char *hostname);
static bool update_monitor_connection_string(KeeperConfig *config);

/*
 * Command definitions for `pg_autoctl enable ...` and
 * `pg_autoctl disable ...` sub-commands.
 */
static CommandLine enable_secondary_command =
	make_command("secondary",
				 "Enable secondary nodes on a formation",
				 " [ --pgdata --formation ] ",
				 " --pgdata path to data directory\n" \
				 " --formation Formation to enable secondary on\n",
				 cli_secondary_getopts,
				 cli_enable_secondary);

static CommandLine disable_secondary_command =
	make_command("secondary",
				 "Disable secondary nodes on a formation",
				 " [ --pgdata --formation ] ",
				 " --pgdata path to data directory\n" \
				 " --formation Formation to disable secondary on\n",
				 cli_secondary_getopts,
				 cli_disable_secondary);

static CommandLine enable_maintenance_command =
	make_command("maintenance",
				 "Enable Postgres maintenance mode on this node",
				 " [ --pgdata --allow-failover ]",
				 CLI_PGDATA_OPTION,
				 cli_maintenance_getopts,
				 cli_enable_maintenance);

static CommandLine disable_maintenance_command =
	make_command("maintenance",
				 "Disable Postgres maintenance mode on this node",
				 " [ --pgdata ]",
				 CLI_PGDATA_OPTION,
				 cli_maintenance_getopts,
				 cli_disable_maintenance);

static CommandLine enable_ssl_command =
	make_command("ssl",
				 "Enable SSL configuration on this node",
				 CLI_PGDATA_USAGE,
				 CLI_PGDATA_OPTION KEEPER_CLI_SSL_OPTIONS,
				 cli_ssl_getopts,
				 cli_enable_ssl);

static CommandLine disable_ssl_command =
	make_command("ssl",
				 "Disable SSL configuration on this node",
				 CLI_PGDATA_USAGE,
				 CLI_PGDATA_OPTION,
				 cli_getopt_pgdata,
				 cli_disable_ssl);

static CommandLine enable_monitor_command =
	make_command("monitor",
				 "Enable a monitor for this node to be orchestrated from",
				 " [ --pgdata --allow-failover ] "
				 "postgres://autoctl_node@new.monitor.add.ress/pg_auto_failover",
				 " --pgdata path to data directory\n",
				 cli_enable_monitor_getopts,
				 cli_enable_monitor);

static CommandLine disable_monitor_command =
	make_command("monitor",
				 "Disable the monitor for this node",
				 " [ --pgdata --force ] ",
				 " --pgdata path to data directory\n"
				 " --force force unregistering from the monitor\n",
				 cli_disable_monitor_getopts,
				 cli_disable_monitor);

/* NULL-terminated lists consumed by make_command_set below */
static CommandLine *enable_subcommands[] = {
	&enable_secondary_command,
	&enable_maintenance_command,
	&enable_ssl_command,
	&enable_monitor_command,
	NULL
};

static CommandLine *disable_subcommands[] = {
	&disable_secondary_command,
	&disable_maintenance_command,
	&disable_ssl_command,
	&disable_monitor_command,
	NULL
};

CommandLine enable_commands =
	make_command_set("enable",
					 "Enable a feature on a formation", NULL, NULL,
					 NULL, enable_subcommands);

CommandLine disable_commands =
	make_command_set("disable",
					 "Disable a feature on a formation", NULL, NULL,
					 NULL, disable_subcommands);


/*
 * cli_secondary_getopts parses command line options for the
 secondary feature,
 * both during enable and disable. Little verification is performed however the
 * function will error when no --pgdata or --formation are provided, existence
 * of either are not verified.
 */
static int
cli_secondary_getopts(int argc, char **argv)
{
	KeeperConfig options = { 0 };
	int c, option_index, errors = 0;
	int verboseCount = 0;

	static struct option long_options[] = {
		{ "pgdata", required_argument, NULL, 'D' },
		{ "formation", required_argument, NULL, 'f' },
		{ "allow-failover", no_argument, NULL, 'A' },
		{ "version", no_argument, NULL, 'V' },
		{ "verbose", no_argument, NULL, 'v' },
		{ "quiet", no_argument, NULL, 'q' },
		{ "help", no_argument, NULL, 'h' },
		{ NULL, 0, NULL, 0 }
	};

	/* reset getopt_long state: this parser may be called more than once */
	optind = 0;

	/*
	 * NOTE(review): 'A' is not in the optstring, so only the long form
	 * --allow-failover is accepted here, not -A -- confirm that is intended.
	 */
	while ((c = getopt_long(argc, argv, "D:f:Vvqh",
							long_options, &option_index)) != -1)
	{
		switch (c)
		{
			case 'D':
			{
				strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH);
				log_trace("--pgdata %s", options.pgSetup.pgdata);
				break;
			}

			case 'f':
			{
				strlcpy(options.formation, optarg, NAMEDATALEN);
				log_trace("--formation %s", options.formation);
				break;
			}

			case 'A':
			{
				allowFailover = true;
				log_trace("--allow-failover");
				break;
			}

			case 'V':
			{
				/* keeper_cli_print_version prints version and exits. */
				keeper_cli_print_version(argc, argv);
				break;
			}

			case 'v':
			{
				/* each repeated -v increases verbosity one step */
				++verboseCount;
				switch (verboseCount)
				{
					case 1:
					{
						log_set_level(LOG_INFO);
						break;
					}

					case 2:
					{
						log_set_level(LOG_DEBUG);
						break;
					}

					default:
					{
						log_set_level(LOG_TRACE);
						break;
					}
				}
				break;
			}

			case 'q':
			{
				log_set_level(LOG_ERROR);
				break;
			}

			case 'h':
			{
				commandline_help(stderr);
				exit(EXIT_CODE_QUIT);
				break;
			}

			default:
			{
				/* getopt_long already wrote an error message */
				errors++;
				break;
			}
		}
	}

	if (errors > 0)
	{
		commandline_help(stderr);
		exit(EXIT_CODE_BAD_ARGS);
	}

	cli_common_get_set_pgdata_or_exit(&(options.pgSetup));

	if (IS_EMPTY_STRING_BUFFER(options.formation))
	{
		log_error("Option --formation is mandatory");
		exit(EXIT_CODE_BAD_ARGS);
	}

	/* publish our option parsing in the global variable */
	keeperOptions = options;

	return optind;
}


/*
 * cli_enable_secondary enables secondaries on the specified formation.
 */
static void
cli_enable_secondary(int argc, char **argv)
{
	KeeperConfig config = keeperOptions;
	Monitor monitor = { 0 };

	if (!monitor_init_from_pgsetup(&monitor, &config.pgSetup))
	{
		/* errors have already been logged */
		exit(EXIT_CODE_BAD_ARGS);
	}

	if (!monitor_enable_secondary_for_formation(&monitor, config.formation))
	{
		/* errors have already been logged */
		exit(EXIT_CODE_MONITOR);
	}

	log_info("Enabled secondaries for formation \"%s\", make sure to add "
			 "worker nodes to the formation to have secondaries ready "
			 "for failover.",
			 config.formation);
}


/*
 * cli_disable_secondary disables secondaries on the specified formation.
 */
static void
cli_disable_secondary(int argc, char **argv)
{
	KeeperConfig config = keeperOptions;
	Monitor monitor = { 0 };

	if (!monitor_init_from_pgsetup(&monitor, &config.pgSetup))
	{
		/* errors have already been logged */
		exit(EXIT_CODE_BAD_ARGS);
	}

	/*
	 * disabling secondaries on a formation happens on the monitor. When the
	 * formation is still operating with secondaries an error will be logged
	 * and the function will return with a false value. As we will exit the
	 * successful info message below is only printed if secondaries on the
	 * formation have been disabled successfully.
	 */
	if (!monitor_disable_secondary_for_formation(&monitor, config.formation))
	{
		/* errors have already been logged */
		exit(EXIT_CODE_MONITOR);
	}

	log_info("Disabled secondaries for formation \"%s\".", config.formation);
}


/*
 * cli_maintenance_getopts parses command line options for the pg_autoctl
 * enable|disable maintenance feature. We accept the --allow-failover option
 * that is unique to this command and so we have our own version of the getopt
 * parsing.
 */
static int
cli_maintenance_getopts(int argc, char **argv)
{
	KeeperConfig options = { 0 };
	int c, option_index, errors = 0;
	int verboseCount = 0;

	static struct option long_options[] = {
		{ "pgdata", required_argument, NULL, 'D' },
		{ "allow-failover", no_argument, NULL, 'A' },
		{ "version", no_argument, NULL, 'V' },
		{ "verbose", no_argument, NULL, 'v' },
		{ "quiet", no_argument, NULL, 'q' },
		{ "help", no_argument, NULL, 'h' },
		{ NULL, 0, NULL, 0 }
	};

	optind = 0;

	/*
	 * NOTE(review): "f:" remains in the optstring although no formation
	 * option exists in long_options here -- presumably a leftover from
	 * cli_secondary_getopts; confirm before removing.
	 */
	while ((c = getopt_long(argc, argv, "D:f:AVvqh",
							long_options, &option_index)) != -1)
	{
		switch (c)
		{
			case 'D':
			{
				strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH);
				log_trace("--pgdata %s", options.pgSetup.pgdata);
				break;
			}

			case 'A':
			{
				allowFailover = true;
				log_trace("--allow-failover");
				break;
			}

			case 'V':
			{
				/* keeper_cli_print_version prints version and exits. */
				keeper_cli_print_version(argc, argv);
				break;
			}

			case 'v':
			{
				++verboseCount;
				switch (verboseCount)
				{
					case 1:
					{
						log_set_level(LOG_INFO);
						break;
					}

					case 2:
					{
						log_set_level(LOG_DEBUG);
						break;
					}

					default:
					{
						log_set_level(LOG_TRACE);
						break;
					}
				}
				break;
			}

			case 'q':
			{
				log_set_level(LOG_ERROR);
				break;
			}

			case 'h':
			{
				commandline_help(stderr);
				exit(EXIT_CODE_QUIT);
				break;
			}

			default:
			{
				/* getopt_long already wrote an error message */
				errors++;
				break;
			}
		}
	}

	if (errors > 0)
	{
		commandline_help(stderr);
		exit(EXIT_CODE_BAD_ARGS);
	}

	/* now that we have the command line parameters, prepare the options */
	(void) prepare_keeper_options(&options);

	/* publish our option parsing in the global variable */
	keeperOptions = options;

	return optind;
}


/*
 * cli_enable_maintenance calls the pgautofailover.start_maintenance() function
 * on the monitor for the local node.
 */
static void
cli_enable_maintenance(int argc, char **argv)
{
	Keeper keeper = { 0 };

	bool missingPgdataIsOk = true;
	bool pgIsNotRunningIsOk = true;
	bool monitorDisabledIsOk = false;

	char *channels[] = { "state", NULL };

	ConnectionRetryPolicy retryPolicy = { 0 };

	keeper.config = keeperOptions;

	(void) exit_unless_role_is_keeper(&(keeper.config));

	if (!keeper_config_read_file(&(keeper.config),
								 missingPgdataIsOk,
								 pgIsNotRunningIsOk,
								 monitorDisabledIsOk))
	{
		/* errors have already been logged */
		exit(EXIT_CODE_BAD_CONFIG);
	}

	if (!keeper_init(&keeper, &keeper.config))
	{
		log_fatal("Failed to initialize keeper, see above for details");
		exit(EXIT_CODE_KEEPER);
	}

	/* a primary entering maintenance triggers a failover: require opt-in */
	if (keeper.state.current_role == PRIMARY_STATE && !allowFailover)
	{
		log_warn("Enabling maintenance on a primary causes a failover");
		log_fatal("Please use --allow-failover to allow the command proceed");
		exit(EXIT_CODE_BAD_ARGS);
	}

	if (!monitor_init(&(keeper.monitor), keeper.config.monitor_pguri))
	{
		log_fatal("Failed to initialize the monitor connection, "
				  "see above for details.");
		exit(EXIT_CODE_MONITOR);
	}

	/*
	 * If we're already in MAINTENANCE the monitor returns true but we don't
	 * want to listen to changes, we don't expect any
	 */
	if (keeper.state.current_role != MAINTENANCE_STATE)
	{
		if (!pgsql_listen(&(keeper.monitor.notificationClient), channels))
		{
			log_error("Failed to listen to state changes from the monitor");
			exit(EXIT_CODE_MONITOR);
		}
	}

	/*
	 * Set a retry policy for cases when we have a transient error on the
	 * monitor.
	 */
	(void) pgsql_set_monitor_interactive_retry_policy(&retryPolicy);

	while (!pgsql_retry_policy_expired(&retryPolicy))
	{
		int64_t nodeId = keeper.state.current_node_id;
		bool mayRetry = false;

		if (monitor_start_maintenance(&(keeper.monitor), nodeId, &mayRetry))
		{
			/* start_maintenance was successful, break out of the retry loop */
			break;
		}

		if (!mayRetry)
		{
			log_fatal("Failed to enable maintenance of node %" PRId64
					  " on the monitor, see above for details", nodeId);
			exit(EXIT_CODE_MONITOR);
		}

		int sleepTimeMs =
			pgsql_compute_connection_retry_sleep_time(&retryPolicy);

		log_warn("Failed to enable maintenance of node %" PRId64
				 " on the monitor, retrying in %d ms.", nodeId, sleepTimeMs);

		/* we have milliseconds, pg_usleep() wants microseconds */
		(void) pg_usleep(sleepTimeMs * 1000);
	}

	/* the monitor accepts start_maintenance for a node already in it */
	if (keeper.state.current_role == MAINTENANCE_STATE)
	{
		log_info("This node is already in the \"maintenance\" state.");
		exit(EXIT_CODE_QUIT);
	}

	NodeState targetStates[] = { MAINTENANCE_STATE };

	if (!monitor_wait_until_node_reported_state(
			&(keeper.monitor),
			keeper.config.formation,
			keeper.config.groupId,
			keeper.state.current_node_id,
			keeper.config.pgSetup.pgKind,
			targetStates,
			lengthof(targetStates)))
	{
		log_error("Failed to wait until the node reached the maintenance state");
		exit(EXIT_CODE_MONITOR);
	}
}


/*
 * cli_disable_maintenance calls pgautofailover.stop_maintenance(name, port) on
 * the monitor.
 */
static void
cli_disable_maintenance(int argc, char **argv)
{
	Keeper keeper = { 0 };

	bool missingPgdataIsOk = true;
	bool pgIsNotRunningIsOk = true;
	bool monitorDisabledIsOk = false;

	/* notification channel used to follow state changes on the monitor */
	char *channels[] = { "state", NULL };

	ConnectionRetryPolicy retryPolicy = { 0 };

	keeper.config = keeperOptions;

	(void) exit_unless_role_is_keeper(&(keeper.config));

	if (!keeper_config_read_file(&(keeper.config),
								 missingPgdataIsOk,
								 pgIsNotRunningIsOk,
								 monitorDisabledIsOk))
	{
		/* errors have already been logged */
		exit(EXIT_CODE_BAD_CONFIG);
	}

	if (!keeper_init(&keeper, &keeper.config))
	{
		log_fatal("Failed to initialize keeper, see above for details");
		exit(EXIT_CODE_KEEPER);
	}

	if (!monitor_init(&(keeper.monitor), keeper.config.monitor_pguri))
	{
		log_fatal("Failed to initialize the monitor connection, "
				  "see above for details.");
		exit(EXIT_CODE_MONITOR);
	}

	/* subscribe before calling stop_maintenance so no state change is lost */
	if (!pgsql_listen(&(keeper.monitor.notificationClient), channels))
	{
		log_error("Failed to listen to state changes from the monitor");
		exit(EXIT_CODE_MONITOR);
	}

	/*
	 * Set a retry policy for cases when we have a transient error on the
	 * monitor.
	 */
	(void) pgsql_set_monitor_interactive_retry_policy(&retryPolicy);

	while (!pgsql_retry_policy_expired(&retryPolicy))
	{
		int64_t nodeId = keeper.state.current_node_id;
		bool mayRetry = false;

		if (monitor_stop_maintenance(&(keeper.monitor), nodeId, &mayRetry))
		{
			/* stop_maintenance was successful, break out of the retry loop */
			break;
		}

		if (!mayRetry)
		{
			log_fatal("Failed to disable maintenance of node %" PRId64
					  " on the monitor, see above for details", nodeId);
			exit(EXIT_CODE_MONITOR);
		}

		int sleepTimeMs =
			pgsql_compute_connection_retry_sleep_time(&retryPolicy);

		log_warn("Failed to disable maintenance of node %" PRId64
				 " on the monitor, retrying in %d ms.", nodeId, sleepTimeMs);

		/* we have milliseconds, pg_usleep() wants microseconds */
		(void) pg_usleep(sleepTimeMs * 1000);
	}

	/* leaving maintenance, the node becomes either secondary or primary */
	NodeState targetStates[] = { SECONDARY_STATE, PRIMARY_STATE };

	if (!monitor_wait_until_node_reported_state(
			&(keeper.monitor),
			keeper.config.formation,
			keeper.config.groupId,
			keeper.state.current_node_id,
			keeper.config.pgSetup.pgKind,
			targetStates,
			lengthof(targetStates)))
	{
		log_error("Failed to wait until a node reached the secondary or primary state");
		exit(EXIT_CODE_MONITOR);
	}
}


/*
 * cli_ssl_getopts parses the command line options necessary to initialize a
 * PostgreSQL instance as our monitor.
 */
static int
cli_ssl_getopts(int argc, char **argv)
{
	KeeperConfig options = { 0 };
	int c, option_index = 0, errors = 0;
	int verboseCount = 0;

	/* tracks which mutually-exclusive SSL choice the user made on the CLI */
	SSLCommandLineOptions sslCommandLineOptions = SSL_CLI_UNKNOWN;

	static struct option long_options[] = {
		{ "pgdata", required_argument, NULL, 'D' },
		{ "version", no_argument, NULL, 'V' },
		{ "verbose", no_argument, NULL, 'v' },
		{ "quiet", no_argument, NULL, 'q' },
		{ "help", no_argument, NULL, 'h' },
		{ "no-ssl", no_argument, NULL, 'N' },
		{ "ssl-self-signed", no_argument, NULL, 's' },
		{ "ssl-mode", required_argument, &ssl_flag, SSL_MODE_FLAG },
		{ "ssl-ca-file", required_argument, &ssl_flag, SSL_CA_FILE_FLAG },
		{ "ssl-crl-file", required_argument, &ssl_flag, SSL_CRL_FILE_FLAG },
		{ "server-cert", required_argument, &ssl_flag, SSL_SERVER_CRT_FLAG },
		{ "server-key", required_argument, &ssl_flag, SSL_SERVER_KEY_FLAG },
		{ NULL, 0, NULL, 0 }
	};

	/* hard-coded defaults */
	options.pgSetup.pgport = pgsetup_get_pgport();

	optind = 0;

	while ((c = getopt_long(argc, argv, "D:VvqhNs",
							long_options, &option_index)) != -1)
	{
		switch (c)
		{
			case 'D':
			{
				strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH);
				log_trace("--pgdata %s", options.pgSetup.pgdata);
				break;
			}

			case 'V':
			{
				/* keeper_cli_print_version prints version and exits. */
				keeper_cli_print_version(argc, argv);
				break;
			}

			case 'v':
			{
				++verboseCount;
				switch (verboseCount)
				{
					case 1:
					{
						log_set_level(LOG_INFO);
						break;
					}

					case 2:
					{
						log_set_level(LOG_DEBUG);
						break;
					}

					default:
					{
						log_set_level(LOG_TRACE);
						break;
					}
				}
				break;
			}

			case 'q':
			{
				log_set_level(LOG_ERROR);
				break;
			}

			case 'h':
			{
				commandline_help(stderr);
				exit(EXIT_CODE_QUIT);
				break;
			}

			case 's':
			{
				/* { "ssl-self-signed", no_argument, NULL, 's' }, */
				if (!cli_getopt_accept_ssl_options(SSL_CLI_SELF_SIGNED,
												   sslCommandLineOptions))
				{
					errors++;
					break;
				}
				sslCommandLineOptions = SSL_CLI_SELF_SIGNED;

				options.pgSetup.ssl.active = 1;
				options.pgSetup.ssl.createSelfSignedCert = true;
				log_trace("--ssl-self-signed");
				break;
			}

			case 'N':
			{
				/* { "no-ssl", no_argument, NULL, 'N' }, */
				if (!cli_getopt_accept_ssl_options(SSL_CLI_NO_SSL,
												   sslCommandLineOptions))
				{
					errors++;
					break;
				}
				sslCommandLineOptions = SSL_CLI_NO_SSL;

				options.pgSetup.ssl.active = 0;
				options.pgSetup.ssl.createSelfSignedCert = false;
				log_trace("--no-ssl");
				break;
			}

			/*
			 * { "ssl-ca-file", required_argument, &ssl_flag, SSL_CA_FILE_FLAG }
			 * { "ssl-crl-file", required_argument, &ssl_flag, SSL_CA_FILE_FLAG }
			 * { "server-crt", required_argument, &ssl_flag, SSL_SERVER_CRT_FLAG }
			 * { "server-key", required_argument, &ssl_flag, SSL_SERVER_KEY_FLAG }
			 * { "ssl-mode", required_argument, &ssl_flag, SSL_MODE_FLAG },
			 */
			case 0:
			{
				/* any flag except --ssl-mode implies user-provided certs */
				if (ssl_flag != SSL_MODE_FLAG)
				{
					if (!cli_getopt_accept_ssl_options(SSL_CLI_USER_PROVIDED,
													   sslCommandLineOptions))
					{
						errors++;
						break;
					}

					sslCommandLineOptions = SSL_CLI_USER_PROVIDED;
					options.pgSetup.ssl.active = 1;
				}

				if (!cli_getopt_ssl_flags(ssl_flag, optarg, &(options.pgSetup)))
				{
					errors++;
				}
				break;
			}

			default:
			{
				/* getopt_long already wrote an error message */
				commandline_help(stderr);
				exit(EXIT_CODE_BAD_ARGS);
				break;
			}
		}
	}

	if (errors > 0)
	{
		commandline_help(stderr);
		exit(EXIT_CODE_BAD_ARGS);
	}

	/* Initialize with given PGDATA */
	cli_common_get_set_pgdata_or_exit(&(options.pgSetup));

	if (!keeper_config_set_pathnames_from_pgdata(&(options.pathnames),
												 options.pgSetup.pgdata))
	{
		/* errors have already been logged */
		exit(EXIT_CODE_BAD_CONFIG);
	}

	/*
	 * If any --ssl-* option is provided, either we have a root ca file and a
	 * server.key and a server.crt or none of them. Any other combo is a
	 * mistake.
	 */
	if (sslCommandLineOptions == SSL_CLI_UNKNOWN)
	{
		log_fatal("Explicit SSL choice is required: please use either "
				  "--ssl-self-signed or provide your certificates "
				  "using --ssl-ca-file, --ssl-crl-file, "
				  "--server-key, and --server-crt (or use --no-ssl if you "
				  "are very sure that you do not want encrypted traffic)");
		exit(EXIT_CODE_BAD_ARGS);
	}

	if (!pgsetup_validate_ssl_settings(&(options.pgSetup)))
	{
		/* errors have already been logged */
		exit(EXIT_CODE_BAD_ARGS);
	}

	/* publish our option parsing in the global variable */
	keeperOptions = options;

	return optind;
}


/*
 * cli_enable_ssl enables SSL setup on this node.
 *
 * - edit our Postgres configuration with the given SSL files and options
 * - when run on a keeper, edit the monitor connection string to use SSL
 * - edits our configuration at pg_autoctl.conf
 */
static void
cli_enable_ssl(int argc, char **argv)
{
	KeeperConfig options = keeperOptions;

	bool missingPgdataIsOk = true;
	bool pgIsNotRunningIsOk = true;
	bool monitorDisabledIsOk = true;

	/* the steps differ depending on whether this node is monitor or keeper */
	switch (ProbeConfigurationFileRole(options.pathnames.config))
	{
		case PG_AUTOCTL_ROLE_MONITOR:
		{
			MonitorConfig mconfig = { 0 };
			PostgresSetup *pgSetup = &(mconfig.pgSetup);
			LocalPostgresServer postgres = { 0 };
			bool reloadedService = false;

			if (!monitor_config_init_from_pgsetup(&mconfig,
												  &options.pgSetup,
												  missingPgdataIsOk,
												  pgIsNotRunningIsOk))
			{
				/* errors have already been logged */
				exit(EXIT_CODE_BAD_CONFIG);
			}

			/* now override current on-file settings with CLI ssl options */
			pgSetup->ssl = options.pgSetup.ssl;

			local_postgres_init(&postgres, pgSetup);

			/* update the Postgres SSL setup and maybe create the certificate */
			if (!update_ssl_configuration(&postgres, mconfig.hostname))
			{
				/* errors have already been logged */
				exit(EXIT_CODE_INTERNAL_ERROR);
			}

			/* make sure that the new SSL files are part of the setup */
			mconfig.pgSetup.ssl = postgres.postgresSetup.ssl;

			/* update the monitor's configuration to use SSL */
			if (!monitor_config_write_file(&mconfig))
			{
				/* errors have already been logged */
				exit(EXIT_CODE_BAD_CONFIG);
			}

			if (file_exists(mconfig.pathnames.pid))
			{
				reloadedService = cli_pg_autoctl_reload(mconfig.pathnames.pid);

				if (!reloadedService)
				{
					log_warn("Failed to reload the pg_autoctl, consider "
							 "restarting it to implement the SSL changes");
				}
			}

			/* display a nice summary to our users */
			log_info("Successfully enabled new SSL configuration:");
			log_info(" SSL is now %s",
					 pgSetup->ssl.active ? "active" : "disabled");

			if (pgSetup->ssl.createSelfSignedCert)
			{
				log_info(" Self-Signed certificates have been created and "
						 "deployed in Postgres configuration settings "
						 "ssl_key_file and ssl_cert_file");
			}

			if (reloadedService)
			{
				log_info(" pg_autoctl service has been signaled to reload "
						 "its configuration");
			}
			else
			{
				log_warn(" pg_autoctl service is not running, changes "
						 "will only apply at next start of pg_autoctl");
			}

			break;
		}

		case PG_AUTOCTL_ROLE_KEEPER:
		{
			KeeperConfig kconfig = { 0 };
			PostgresSetup *pgSetup = &(kconfig.pgSetup);
			LocalPostgresServer postgres = { 0 };
			bool reloadedService = false;
			bool updatedMonitorString = true;

			kconfig.pgSetup = options.pgSetup;
			kconfig.pathnames = options.pathnames;

			if (!keeper_config_read_file(&kconfig,
										 missingPgdataIsOk,
										 pgIsNotRunningIsOk,
										 monitorDisabledIsOk))
			{
				log_fatal("Failed to read configuration file \"%s\"",
						  kconfig.pathnames.config);
				exit(EXIT_CODE_BAD_CONFIG);
			}

			/* now override current on-file settings with CLI ssl options */
			pgSetup->ssl = options.pgSetup.ssl;

			local_postgres_init(&postgres, pgSetup);

			/* log about the need to edit the monitor connection string */
			if (!update_monitor_connection_string(&kconfig))
			{
				updatedMonitorString = false;

				log_error(
					"Failed to update the monitor URI, rerun this command "
					"again after resolving the issue to update it");
			}

			/* update the Postgres SSL setup and maybe create the certificate */
			if (!update_ssl_configuration(&postgres, kconfig.hostname))
			{
				/* errors have already been logged */
				exit(EXIT_CODE_INTERNAL_ERROR);
			}

			/* make sure that the new SSL files are part of the setup */
			kconfig.pgSetup.ssl = postgres.postgresSetup.ssl;

			/* and write our brand new setup to file */
			if (!keeper_config_write_file(&kconfig))
			{
				log_fatal("Failed to write the pg_autoctl configuration file, "
						  "see above");
				exit(EXIT_CODE_BAD_CONFIG);
			}

			if (file_exists(kconfig.pathnames.pid))
			{
				reloadedService = cli_pg_autoctl_reload(kconfig.pathnames.pid);

				if (!reloadedService)
				{
					log_error("Failed to reload the pg_autoctl, consider "
							  "restarting it to implement the SSL changes");
				}
			}

			/* display a nice summary to our users */
			log_info("Successfully enabled new SSL configuration:");
			log_info(" SSL is now %s",
					 pgSetup->ssl.active ? "active" : "disabled");

			if (pgSetup->ssl.createSelfSignedCert)
			{
				log_info(" Self-Signed certificates have been created and "
						 "deployed in Postgres configuration settings "
						 "ssl_key_file and ssl_cert_file");
			}

			if (updatedMonitorString)
			{
				log_info(" Postgres connection string to the monitor "
						 "has been changed to use sslmode \"%s\"",
						 pgsetup_sslmode_to_string(pgSetup->ssl.sslMode));
			}
			else
			{
				log_error(" Postgres connection string to the monitor "
						  "could not be updated, see above for details");
			}

			log_info(" Replication connection string primary_conninfo "
					 "is going to be updated in the main service loop "
					 "to use ssl mode \"%s\"",
					 pgsetup_sslmode_to_string(pgSetup->ssl.sslMode));

			if (reloadedService)
			{
				log_info(" pg_autoctl service has been signaled to reload "
						 "its configuration");
			}
			else
			{
				log_error(" pg_autoctl service is not running, changes "
						  "will only apply at next start of pg_autoctl");
			}

			break;
		}

		default:
		{
			log_fatal("Unrecognized configuration file \"%s\"",
					  options.pathnames.config);
			exit(EXIT_CODE_INTERNAL_ERROR);
		}
	}
}


/*
 * update_ssl_configuration updates the local SSL configuration.
 */
static bool
update_ssl_configuration(LocalPostgresServer *postgres, const char *hostname)
{
	PostgresSetup *pgSetup = &(postgres->postgresSetup);

	log_trace("update_ssl_configuration: ssl %s",
			  pgSetup->ssl.active ? "on" : "off");

	/*
	 * When --ssl-self-signed is used, create a certificate.
	 *
	 * In the caller function cli_enable_ssl() we then later write the
	 * pg_autoctl.conf file with the new SSL settings, including both the
	 * ssl.cert_file and the ssl.key_file values, and reload the pg_autoctl
	 * service if it's running.
	 *
	 * At reload time, the pg_autoctl service will edit our Postgres settings
	 * in postgresql-auto-failover.conf with the new values and reload
	 * Postgres.
	 */
	if (pgSetup->ssl.createSelfSignedCert &&
		(!file_exists(pgSetup->ssl.serverKey) ||
		 !file_exists(pgSetup->ssl.serverCert)))
	{
		if (!pg_create_self_signed_cert(pgSetup, hostname))
		{
			log_error("Failed to create SSL self-signed certificate, "
					  "see above for details");
			return false;
		}
	}

	/* HBA rules for hostssl are not edited */
	log_warn("HBA rules in \"%s/pg_hba.conf\" have NOT been edited: \"host\" "
			 " records match either SSL or non-SSL connection attempts.",
			 pgSetup->pgdata);

	return true;
}


/*
 * update_monitor_connection_string connects to the monitor to see if ssl is
 * active on the server. When that's the case, the function complains about
 * updating the monitor URI in the given KeeperConfig.
 */
static bool
update_monitor_connection_string(KeeperConfig *config)
{
	Monitor monitor = { 0 };

	URIParams params = { 0 };

	/* the three URI parameters we want to (re)set on the monitor URI */
	KeyVal sslParams = {
		3,
		{ "sslmode", "sslrootcert", "sslcrl" },
		{ 0 }
	};

	bool checkForCompleteURI = true;
	char newPgURI[MAXCONNINFO] = { 0 };

	/* initialize SSL Params values */
	strlcpy(sslParams.values[0],
			pgsetup_sslmode_to_string(config->pgSetup.ssl.sslMode),
			MAXCONNINFO);

	strlcpy(sslParams.values[1], config->pgSetup.ssl.caFile, MAXCONNINFO);
	strlcpy(sslParams.values[2], config->pgSetup.ssl.crlFile, MAXCONNINFO);

	/*
	 * NOTE(review): the extracted copy showed a mojibake "&para;ms" here;
	 * restored to &params per the function's use of the params variable.
	 */
	if (!parse_pguri_info_key_vals(config->monitor_pguri,
								   &sslParams,
								   &params,
								   checkForCompleteURI))
	{
		log_warn(
			"The monitor SSL setup is ready and your current "
			"connection string is \"%s\", you might need to update it",
			config->monitor_pguri);

		log_info(
			"Use pg_autoctl config set pg_autoctl.monitor for updating "
			"your monitor connection string, then restart pg_autoctl ");
	}

	if (!buildPostgresURIfromPieces(&params, newPgURI))
	{
		log_error("Failed to produce the new monitor connection string");
		return false;
	}

	if (!monitor_init(&monitor, newPgURI))
	{
		/* errors have already been logged */
		return false;
	}

	/* never log the raw URI: it may embed a password */
	char scrubbedConnectionString[MAXCONNINFO] = { 0 };

	if (parse_and_scrub_connection_string(newPgURI, scrubbedConnectionString))
	{
		log_info("Trying to connect to monitor using connection string \"%s\"",
				 scrubbedConnectionString);
	}
	else
	{
		log_error(
			"Trying to connect to monitor using unparseable connection string \"%s\"",
			newPgURI);
		return false;
	}

	/*
	 * Try to connect using the new connection string and don't update it if it
	 * does not actually allow connecting.
	 */
	if (!pgsql_execute_with_params(&(monitor.pgsql),
								   "SELECT 1", 0, NULL, NULL,
								   NULL, NULL))
	{
		return false;
	}

	/* we have a new monitor URI with our new SSL parameters */
	strlcpy(config->monitor_pguri, newPgURI, MAXCONNINFO);

	log_info("Updating the monitor URI to \"%s\"", scrubbedConnectionString);

	return true;
}


/*
 * cli_disable_ssl enables SSL setup on this node.
 *
 * The following two commands do the same thing:
 *
 * - pg_autoctl enable ssl --no-ssl
 * - pg_autoctl disable ssl
 */
static void
cli_disable_ssl(int argc, char **argv)
{
	/* prepare the global command line options keeperOptions as if --no-ssl */
	keeperOptions.pgSetup.ssl.active = 0;
	keeperOptions.pgSetup.ssl.createSelfSignedCert = false;

	/* this does some validation and user facing WARNing messages */
	if (!pgsetup_validate_ssl_settings(&(keeperOptions.pgSetup)))
	{
		/* errors have already been logged */
		exit(EXIT_CODE_BAD_ARGS);
	}

	/* delegate the actual work: enable-with-no-ssl IS disable */
	(void) cli_enable_ssl(argc, argv);
}


/*
 * cli_enable_monitor_getopts parses the command line options for the
 * command `pg_autoctl enable monitor`.
 */
static int
cli_enable_monitor_getopts(int argc, char **argv)
{
	KeeperConfig options = { 0 };
	int c, option_index = 0, errors = 0;
	int verboseCount = 0;

	static struct option long_options[] = {
		{ "pgdata", required_argument, NULL, 'D' },
		{ "allow-failover", no_argument, NULL, 'A' },
		{ "version", no_argument, NULL, 'V' },
		{ "verbose", no_argument, NULL, 'v' },
		{ "quiet", no_argument, NULL, 'q' },
		{ "help", no_argument, NULL, 'h' },
		{ NULL, 0, NULL, 0 }
	};

	/* set default values for our options, when we have some */
	options.groupId = -1;
	options.network_partition_timeout = -1;
	options.prepare_promotion_catchup = -1;
	options.prepare_promotion_walreceiver = -1;
	options.postgresql_restart_failure_timeout = -1;
	options.postgresql_restart_failure_max_retries = -1;
	options.pgSetup.settings.candidatePriority =
		FAILOVER_NODE_CANDIDATE_PRIORITY;
	options.pgSetup.settings.replicationQuorum =
		FAILOVER_NODE_REPLICATION_QUORUM;

	optind = 0;

	/*
	 * The only command lines that are using keeper_cli_getopt_pgdata are
	 * terminal ones: they don't accept subcommands. In that case our option
	 * parsing can happen in any order and we don't need getopt_long to behave
	 * in a POSIXLY_CORRECT way.
	 *
	 * The unsetenv() call allows getopt_long() to reorder arguments for us.
	 */
	unsetenv("POSIXLY_CORRECT");

	/*
	 * NOTE(review): "m:" appears in the optstring with no matching case or
	 * long option -- presumably a leftover; confirm before removing.
	 */
	while ((c = getopt_long(argc, argv, "D:m:AVvqh",
							long_options, &option_index)) != -1)
	{
		switch (c)
		{
			case 'D':
			{
				strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH);
				log_trace("--pgdata %s", options.pgSetup.pgdata);
				break;
			}

			case 'A':
			{
				allowFailover = true;
				log_trace("--allow-failover");
				break;
			}

			case 'V':
			{
				/* keeper_cli_print_version prints version and exits. */
				keeper_cli_print_version(argc, argv);
				break;
			}

			case 'v':
			{
				++verboseCount;
				switch (verboseCount)
				{
					case 1:
					{
						log_set_level(LOG_INFO);
						break;
					}

					case 2:
					{
						log_set_level(LOG_DEBUG);
						break;
					}

					default:
					{
						log_set_level(LOG_TRACE);
						break;
					}
				}
				break;
			}

			case 'q':
			{
				log_set_level(LOG_ERROR);
				break;
			}

			case 'h':
			{
				commandline_help(stderr);
				exit(EXIT_CODE_QUIT);
				break;
			}

			default:
			{
				/* getopt_long already wrote an error message */
				errors++;
			}
		}
	}

	if (errors > 0)
	{
		commandline_help(stderr);
		exit(EXIT_CODE_BAD_ARGS);
	}

	cli_common_get_set_pgdata_or_exit(&(options.pgSetup));

	if (!keeper_config_set_pathnames_from_pgdata(&(options.pathnames),
												 options.pgSetup.pgdata))
	{
		/* errors have already been logged */
		exit(EXIT_CODE_BAD_ARGS);
	}

	keeperOptions = options;

	return optind;
}


/*
 * cli_enable_monitor enables a monitor (again?) on a pg_autoctl node where it
 * currently is setup without a monitor.
*/ static void cli_enable_monitor(int argc, char **argv) { Keeper keeper = { 0 }; Monitor *monitor = &(keeper.monitor); KeeperConfig *config = &(keeper.config); bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; keeper.config = keeperOptions; (void) exit_unless_role_is_keeper(&(keeper.config)); if (!keeper_config_read_file(config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (!config->monitorDisabled) { log_fatal("Failed to enable monitor \"%s\": " "there is already a monitor enabled", config->monitor_pguri); exit(EXIT_CODE_BAD_CONFIG); } /* * Parse monitor Postgres URI expected as the first (only) argument. */ if (argc != 1) { log_fatal("Failed to parse new monitor URI as an argument."); commandline_print_usage(&enable_monitor_command, stderr); exit(EXIT_CODE_BAD_ARGS); } strlcpy(config->monitor_pguri, argv[0], MAXCONNINFO); if (!validate_connection_string(config->monitor_pguri)) { log_fatal("Failed to parse the new monitor connection string, " "see above for details."); exit(EXIT_CODE_BAD_ARGS); } config->monitorDisabled = false; if (!keeper_init(&keeper, &keeper.config)) { log_fatal("Failed to initialize keeper, see above for details"); exit(EXIT_CODE_KEEPER); } if (!monitor_init(monitor, config->monitor_pguri)) { log_fatal("Failed to initialize the monitor connection, " "see above for details."); exit(EXIT_CODE_MONITOR); } /* * Now register to the new monitor from this "client-side" process, and * then signal the background pg_autoctl service for this node (if any) to * reload its configuration so that it starts calling node_active() to the * new monitor. */ if (!keeper_register_again(&keeper)) { exit(EXIT_CODE_MONITOR); } /* * Now that we have registered again, reload the background process (if any * is running) so that it connects to the monitor for the node_active * protocol. 
When we reload the background process, we need the * configuration file to have been updated first on-disk: */ if (!keeper_config_write_file(config)) { log_fatal("Failed to write pg_autoctl configuration file \"%s\", " "see above for details", keeper.config.pathnames.config); exit(EXIT_CODE_BAD_CONFIG); } /* time to reload the running pg_autoctl service, when it's running */ if (file_exists(keeper.config.pathnames.pid)) { bool reloadedService = cli_pg_autoctl_reload(keeper.config.pathnames.pid); if (!reloadedService) { log_fatal("Failed to reload the pg_autoctl service"); } } } /* * cli_disable_monitor_getopts parses the command line options for the * command `pg_autoctl disable monitor`. */ static int cli_disable_monitor_getopts(int argc, char **argv) { KeeperConfig options = { 0 }; int c, option_index = 0, errors = 0; int verboseCount = 0; static struct option long_options[] = { { "pgdata", required_argument, NULL, 'D' }, { "force", no_argument, NULL, 'F' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; /* set default values for our options, when we have some */ options.groupId = -1; options.network_partition_timeout = -1; options.prepare_promotion_catchup = -1; options.prepare_promotion_walreceiver = -1; options.postgresql_restart_failure_timeout = -1; options.postgresql_restart_failure_max_retries = -1; /* do not set a default formation, it should be found in the config file */ optind = 0; while ((c = getopt_long(argc, argv, "D:FVvqh", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'F': { optForce = true; log_trace("--force"); break; } case 'V': { /* keeper_cli_print_version prints version and exits. 
*/ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } default: { /* getopt_long already wrote an error message */ errors++; } } } if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } cli_common_get_set_pgdata_or_exit(&(options.pgSetup)); if (!keeper_config_set_pathnames_from_pgdata(&(options.pathnames), options.pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } keeperOptions = options; return optind; } /* * cli_disable_monitor disables the monitor on a running pg_autoctl node. This * is useful when the monitor has been lost and a maintenance operation has to * register the node to a new monitor without stopping Postgres. */ static void cli_disable_monitor(int argc, char **argv) { Keeper keeper = { 0 }; Monitor *monitor = &(keeper.monitor); KeeperConfig *config = &(keeper.config); bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = false; keeper.config = keeperOptions; (void) exit_unless_role_is_keeper(&(keeper.config)); if (!keeper_config_read_file(&(keeper.config), missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (!keeper_init(&keeper, &keeper.config)) { log_fatal("Failed to initialize keeper, see above for details"); exit(EXIT_CODE_KEEPER); } if (!monitor_init(monitor, keeper.config.monitor_pguri)) { log_fatal("Failed to initialize the monitor connection, " "see above for details."); exit(EXIT_CODE_MONITOR); } /* * Unless --force has been used, we only disable the monitor when the * current node has not been registered. 
When --force is used, we remove * our registration from the monitor first. */ NodeAddressArray nodesArray = { 0 }; int nodeIndex = 0; /* * There might be some race conditions here, but it's all to be * user-friendly so in the worst case we're going to be less friendly that * we could have. */ if (!monitor_get_nodes(monitor, config->formation, config->groupId, &nodesArray)) { if (optForce) { /* ignore the error, just don't wait in that case */ log_warn("Failed to get_nodes() on the monitor"); log_info("Failed to contact the monitor, disabling it as requested"); } else { log_warn("Failed to get_nodes() on the monitor"); log_fatal("Failed to contact the monitor, use --force to continue"); exit(EXIT_CODE_MONITOR); } } for (nodeIndex = 0; nodeIndex < nodesArray.count; nodeIndex++) { if (nodesArray.nodes[nodeIndex].nodeId == keeper.state.current_node_id) { /* we found our node, exit */ break; } } /* did we find the local node on the monitor? */ if (nodeIndex < nodesArray.count) { if (optForce) { /* --force, and we found the node */ log_info("Removing node %" PRId64 " \"%s\" (%s:%d) from monitor", nodesArray.nodes[nodeIndex].nodeId, nodesArray.nodes[nodeIndex].name, nodesArray.nodes[nodeIndex].host, nodesArray.nodes[nodeIndex].port); int64_t nodeId = -1; int groupId = -1; if (!monitor_remove_by_hostname( monitor, nodesArray.nodes[nodeIndex].host, nodesArray.nodes[nodeIndex].port, optForce, &nodeId, &groupId)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } } else { /* node was found on the monitor, but --force not provided */ log_info("Found node %" PRId64 " \"%s\" (%s:%d) on the monitor", nodesArray.nodes[nodeIndex].nodeId, nodesArray.nodes[nodeIndex].name, nodesArray.nodes[nodeIndex].host, nodesArray.nodes[nodeIndex].port); log_fatal("Use --force to remove the node from the monitor"); exit(EXIT_CODE_BAD_STATE); } } /* * Now either we didn't find the node on the monitor, or we just removed it * from there. 
In either case, we can proceed with disabling the monitor * from the node setup, and removing the local state file. */ strlcpy(config->monitor_pguri, PG_AUTOCTL_MONITOR_DISABLED, sizeof(config->monitor_pguri)); config->monitorDisabled = true; if (!keeper_config_write_file(config)) { log_fatal("Failed to write pg_autoctl configuration file \"%s\", " "see above for details", keeper.config.pathnames.config); exit(EXIT_CODE_BAD_CONFIG); } /* time to reload the running pg_autoctl service, when it's running */ if (file_exists(keeper.config.pathnames.pid)) { bool reloadedService = cli_pg_autoctl_reload(keeper.config.pathnames.pid); if (!reloadedService) { log_fatal("Failed to reload the pg_autoctl service"); } } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_formation.c000066400000000000000000000312171414244367200235520ustar00rootroot00000000000000/* * cli_formation.c * Implementation of a CLI to manage a pg_auto_failover formation. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include "postgres_fe.h" #include "cli_common.h" #include "commandline.h" #include "defaults.h" #include "env_utils.h" #include "formation_config.h" #include "log.h" #include "pgsetup.h" static FormationConfig formationOptions; static bool cli_formation_use_monitor_option(FormationConfig *options); static int keeper_cli_formation_getopts(int argc, char **argv); static int keeper_cli_formation_create_getopts(int argc, char **argv); static void keeper_cli_formation_create(int argc, char **argv); static void keeper_cli_formation_drop(int argc, char **argv); CommandLine create_formation_command = make_command("formation", "Create a new formation on the pg_auto_failover monitor", " [ --pgdata --monitor --formation --kind --dbname " " --with-secondary --without-secondary ] ", " --pgdata path to data directory\n" " --monitor pg_auto_failover Monitor Postgres URL\n" " --formation name of the formation to create \n" " --kind formation kind, either \"pgsql\" or \"citus\"\n" " --dbname name for postgres database to use in this formation \n" " --enable-secondary create a formation that has multiple nodes that can be \n" " used for fail over when others have issues \n" " --disable-secondary create a citus formation without nodes to fail over to \n" " --number-sync-standbys minimum number of standbys to confirm write \n", keeper_cli_formation_create_getopts, keeper_cli_formation_create); CommandLine drop_formation_command = make_command("formation", "Drop a formation on the pg_auto_failover monitor", " [ --pgdata --formation ]", " --pgdata path to data directory \n" \ " --monitor pg_auto_failover Monitor Postgres URL\n" " --formation name of the formation to drop \n", keeper_cli_formation_getopts, keeper_cli_formation_drop); /* * cli_formation_use_monitor_option returns true when the --monitor option * should be used, or when PG_AUTOCTL_MONITOR has been set in the environment. 
* In that case the options->monitor_pguri is also set to the value found in * the environment. * * See cli_use_monitor_option() for the general KeeperConfig version of the * same function. */ static bool cli_formation_use_monitor_option(FormationConfig *options) { /* if --monitor is used, then use it */ if (!IS_EMPTY_STRING_BUFFER(options->monitor_pguri)) { return true; } /* otherwise, have a look at the PG_AUTOCTL_MONITOR environment variable */ if (env_exists(PG_AUTOCTL_MONITOR) && get_env_copy(PG_AUTOCTL_MONITOR, options->monitor_pguri, sizeof(options->monitor_pguri))) { log_debug("Using environment PG_AUTOCTL_MONITOR \"%s\"", options->monitor_pguri); return true; } /* * Still nothing? well don't use --monitor then. * * Now, on commands that are compatible with using just a monitor and no * local pg_autoctl node, we want to include an error message about the * lack of a --monitor when we also lack --pgdata. */ if (IS_EMPTY_STRING_BUFFER(options->pgSetup.pgdata) && !env_exists("PGDATA")) { log_error("Failed to get value for environment variable '%s', " "which is unset", PG_AUTOCTL_MONITOR); log_warn("This command also supports the --monitor option, which " "is not used here"); } return false; } /* * keeper_cli_formation_getopts parses the command line options * necessary to describe an already existing formation */ int keeper_cli_formation_getopts(int argc, char **argv) { FormationConfig options = { 0 }; int c = 0, option_index = 0, errors = 0; int verboseCount = 0; static struct option long_options[] = { { "pgdata", required_argument, NULL, 'D' }, { "monitor", required_argument, NULL, 'm' }, { "formation", required_argument, NULL, 'f' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; optind = 0; while ((c = getopt_long(argc, argv, "D:f:Vvqh", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.pgSetup.pgdata, 
optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'm': { if (!validate_connection_string(optarg)) { log_fatal("Failed to parse --monitor connection string, " "see above for details."); exit(EXIT_CODE_BAD_ARGS); } strlcpy(options.monitor_pguri, optarg, MAXCONNINFO); log_trace("--monitor %s", options.monitor_pguri); break; } case 'f': { strlcpy(options.formation, optarg, NAMEDATALEN); log_trace("--formation %s", options.formation); break; } case 'V': { /* keeper_cli_print_version prints version and exits. */ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } default: { /* getopt_long already wrote an error message */ errors++; break; } } } if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } /* when we have a monitor URI we don't need PGDATA */ if (cli_formation_use_monitor_option(&options)) { if (!IS_EMPTY_STRING_BUFFER(options.pgSetup.pgdata)) { log_warn("Given --monitor URI, the --pgdata option is ignored"); log_info("Connecting to monitor at \"%s\"", options.monitor_pguri); /* the rest of the program needs pgdata actually empty */ bzero((void *) options.pgSetup.pgdata, sizeof(options.pgSetup.pgdata)); } } else { cli_common_get_set_pgdata_or_exit(&(options.pgSetup)); } /* publish our option parsing in the global variable */ formationOptions = options; return optind; } /* * keeper_cli_formation_create_getopts parses the command line options * necessary to create a new formation. 
*/ int keeper_cli_formation_create_getopts(int argc, char **argv) { FormationConfig options = { 0 }; int c = 0, option_index = 0, errors = 0; int verboseCount = 0; static struct option long_options[] = { { "pgdata", required_argument, NULL, 'D' }, { "monitor", required_argument, NULL, 'm' }, { "formation", required_argument, NULL, 'f' }, { "kind", required_argument, NULL, 'k' }, { "dbname", required_argument, NULL, 'd' }, { "enable-secondary", no_argument, NULL, 's' }, { "disable-secondary", no_argument, NULL, 'S' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { "number-sync-standbys", required_argument, NULL, 'n' }, { NULL, 0, NULL, 0 } }; optind = 0; /* set defaults for formations */ options.formationHasSecondary = true; while ((c = getopt_long(argc, argv, "D:f:k:sSVvqhn:", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'm': { if (!validate_connection_string(optarg)) { log_fatal("Failed to parse --monitor connection string, " "see above for details."); exit(EXIT_CODE_BAD_ARGS); } strlcpy(options.monitor_pguri, optarg, MAXCONNINFO); log_trace("--monitor %s", options.monitor_pguri); break; } case 'f': { strlcpy(options.formation, optarg, NAMEDATALEN); log_trace("--formation %s", options.formation); break; } case 'k': { strlcpy(options.formationKind, optarg, NAMEDATALEN); log_trace("--kind %s", options.formationKind); break; } case 'd': { strlcpy(options.dbname, optarg, NAMEDATALEN); log_trace("--dbname %s", options.dbname); break; } case 's': { options.formationHasSecondary = true; log_trace("--enable-secondary"); break; } case 'S': { options.formationHasSecondary = false; log_trace("--disable-secondary"); break; } case 'V': { /* keeper_cli_print_version prints version and exits. 
*/ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } case 'n': { /* { "number-sync-standbys", required_argument, NULL, 'n'} */ int numberSyncStandbys = strtol(optarg, NULL, 10); if (errno == EINVAL || numberSyncStandbys < 0) { log_fatal("--number-sync-standbys argument is not valid." " Use a non-negative integer value."); exit(EXIT_CODE_BAD_ARGS); } options.numberSyncStandbys = numberSyncStandbys; log_trace("--number-sync-standbys %d", numberSyncStandbys); break; } default: { /* getopt_long already wrote an error message */ errors++; break; } } } /* when we have a monitor URI we don't need PGDATA */ if (cli_formation_use_monitor_option(&options)) { if (!IS_EMPTY_STRING_BUFFER(options.pgSetup.pgdata)) { log_warn("Given --monitor URI, the --pgdata option is ignored"); log_info("Connecting to monitor at \"%s\"", options.monitor_pguri); /* the rest of the program needs pgdata actually empty */ bzero((void *) options.pgSetup.pgdata, sizeof(options.pgSetup.pgdata)); } } else { cli_common_get_set_pgdata_or_exit(&(options.pgSetup)); } if (IS_EMPTY_STRING_BUFFER(options.formation) || IS_EMPTY_STRING_BUFFER(options.formationKind)) { log_error("Options --formation and --kind are mandatory"); exit(EXIT_CODE_BAD_ARGS); } /* --dbname is not provided, use default */ if (IS_EMPTY_STRING_BUFFER(options.dbname)) { log_debug("--dbname not provided, setting to \"%s\"", DEFAULT_DATABASE_NAME); strlcpy(options.dbname, DEFAULT_DATABASE_NAME, NAMEDATALEN); } /* publish our option parsing in the global variable */ formationOptions = options; return optind; } /* * keeper_cli_formation_create creates a new formation of a given kind in the * pg_auto_failover monitor. 
*/ static void keeper_cli_formation_create(int argc, char **argv) { FormationConfig config = formationOptions; Monitor monitor = { 0 }; if (IS_EMPTY_STRING_BUFFER(config.monitor_pguri)) { if (!monitor_init_from_pgsetup(&monitor, &config.pgSetup)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } } else { if (!monitor_init(&monitor, config.monitor_pguri)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } } if (!monitor_create_formation(&monitor, config.formation, config.formationKind, config.dbname, config.formationHasSecondary, config.numberSyncStandbys)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } log_info("Created formation \"%s\" of kind \"%s\" on the monitor, with secondary %s.", config.formation, config.formationKind, config.formationHasSecondary ? "enabled" : "disabled"); } /* * keeper_cli_formation_drop removes a formation in the pg_auto_failover monitor. */ static void keeper_cli_formation_drop(int argc, char **argv) { FormationConfig config = formationOptions; Monitor monitor = { 0 }; if (IS_EMPTY_STRING_BUFFER(config.formation)) { log_error("Options --formation is mandatory"); exit(EXIT_CODE_BAD_ARGS); } if (IS_EMPTY_STRING_BUFFER(config.monitor_pguri)) { if (!monitor_init_from_pgsetup(&monitor, &config.pgSetup)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } } else { if (!monitor_init(&monitor, config.monitor_pguri)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } } if (!monitor_drop_formation(&monitor, config.formation)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } log_info("Dropped formation \"%s\" on the monitor", config.formation); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_get_set_properties.c000066400000000000000000000543261414244367200254700ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_get_set_properties.c * Implementation of a CLI to get and set properties managed by the * pg_auto_failover monitor. 
* * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include "parson.h" #include "cli_common.h" #include "parsing.h" #include "string_utils.h" static bool get_node_replication_settings(NodeReplicationSettings *settings); static void cli_get_node_replication_quorum(int argc, char **argv); static void cli_get_node_candidate_priority(int argc, char **argv); static void cli_get_formation_number_sync_standbys(int argc, char **argv); static void cli_set_node_replication_quorum(int argc, char **argv); static void cli_set_node_candidate_priority(int argc, char **argv); static void cli_set_node_metadata(int argc, char **argv); static void cli_set_formation_number_sync_standbys(int arc, char **argv); static bool set_node_candidate_priority(Keeper *keeper, int candidatePriority); static bool set_node_replication_quorum(Keeper *keeper, bool replicationQuorum); static bool set_formation_number_sync_standbys(Monitor *monitor, char *formation, int groupId, int numberSyncStandbys); CommandLine get_node_replication_quorum = make_command("replication-quorum", "get replication-quorum property from the monitor", " [ --pgdata ] [ --json ] [ --formation ] [ --name ]", " --pgdata path to data directory\n" " --formation pg_auto_failover formation\n" " --name pg_auto_failover node name\n" " --json output data in the JSON format\n", cli_get_name_getopts, cli_get_node_replication_quorum); CommandLine get_node_candidate_priority = make_command("candidate-priority", "get candidate property from the monitor", " [ --pgdata ] [ --json ] [ --formation ] [ --name ]", " --pgdata path to data directory\n" " --formation pg_auto_failover formation\n" " --name pg_auto_failover node name\n" " --json output data in the JSON format\n", cli_get_name_getopts, cli_get_node_candidate_priority); static CommandLine *get_node_subcommands[] = { &get_node_replication_quorum, &get_node_candidate_priority, NULL }; static CommandLine get_node_command = 
make_command_set("node", "get a node property from the pg_auto_failover monitor", NULL, NULL, NULL, get_node_subcommands); static CommandLine get_formation_settings = make_command("settings", "get replication settings for a formation from the monitor", " [ --pgdata ] [ --json ] [ --formation ] ", " --pgdata path to data directory\n" " --json output data in the JSON format\n" " --formation pg_auto_failover formation\n", cli_get_name_getopts, cli_get_formation_settings); static CommandLine get_formation_number_sync_standbys = make_command("number-sync-standbys", "get number_sync_standbys for a formation from the monitor", " [ --pgdata ] [ --json ] [ --formation ] ", " --pgdata path to data directory\n" " --json output data in the JSON format\n" " --formation pg_auto_failover formation\n", cli_get_name_getopts, cli_get_formation_number_sync_standbys); static CommandLine *get_formation_subcommands[] = { &get_formation_settings, &get_formation_number_sync_standbys, NULL }; static CommandLine get_formation_command = make_command_set("formation", "get a formation property from the pg_auto_failover monitor", NULL, NULL, NULL, get_formation_subcommands); static CommandLine *get_subcommands[] = { &get_node_command, &get_formation_command, NULL }; CommandLine get_commands = make_command_set("get", "Get a pg_auto_failover node, or formation setting", NULL, NULL, NULL, get_subcommands); /* set commands */ static CommandLine set_node_replication_quorum_command = make_command("replication-quorum", "set replication-quorum property on the monitor", " [ --pgdata ] [ --json ] [ --formation ] [ --name ] " "", " --pgdata path to data directory\n" " --formation pg_auto_failover formation\n" " --name pg_auto_failover node name\n" " --json output data in the JSON format\n", cli_get_name_getopts, cli_set_node_replication_quorum); static CommandLine set_node_candidate_priority_command = make_command("candidate-priority", "set candidate property on the monitor", " [ --pgdata ] [ --json ] [ 
--formation ] [ --name ] " "", " --pgdata path to data directory\n" " --formation pg_auto_failover formation\n" " --name pg_auto_failover node name\n" " --json output data in the JSON format\n", cli_get_name_getopts, cli_set_node_candidate_priority); static CommandLine set_node_metadata_command = make_command("metadata", "set metadata on the monitor", " [ --pgdata --name --hostname --pgport ] ", " --pgdata path to data directory\n" " --name pg_auto_failover node name\n" " --hostname hostname used to connect from other nodes\n" " --pgport PostgreSQL's port number\n", cli_node_metadata_getopts, cli_set_node_metadata); static CommandLine *set_node_subcommands[] = { &set_node_metadata_command, &set_node_replication_quorum_command, &set_node_candidate_priority_command, NULL }; CommandLine set_node_command = make_command_set("node", "set a node property on the monitor", NULL, NULL, NULL, set_node_subcommands); static CommandLine set_formation_number_sync_standby_command = make_command("number-sync-standbys", "set number-sync-standbys for a formation on the monitor", " [ --pgdata ] [ --json ] [ --formation ] " "", " --pgdata path to data directory\n" " --formation pg_auto_failover formation\n" " --json output data in the JSON format\n", cli_get_name_getopts, cli_set_formation_number_sync_standbys); static CommandLine *set_formation_subcommands[] = { &set_formation_number_sync_standby_command, NULL }; static CommandLine set_formation_command = make_command_set("formation", "set a formation property on the monitor", NULL, NULL, NULL, set_formation_subcommands); static CommandLine *set_subcommands[] = { &set_node_command, &set_formation_command, NULL }; CommandLine set_commands = make_command_set("set", "Set a pg_auto_failover node, or formation setting", NULL, NULL, NULL, set_subcommands); /* * get_node_replication_settings retrieves candidate priority and * replication quorum settings for this node from the monitor */ static bool 
get_node_replication_settings(NodeReplicationSettings *settings) { Keeper keeper = { 0 }; Monitor *monitor = &(keeper.monitor); keeper.config = keeperOptions; (void) cli_monitor_init_from_option_or_config(monitor, &(keeper.config)); /* grab --name from either the command options or the configuration file */ (void) cli_ensure_node_name(&keeper); /* copy the target name */ strlcpy(settings->name, keeper.config.name, _POSIX_HOST_NAME_MAX); return monitor_get_node_replication_settings(&(keeper.monitor), settings); } /* * cli_get_node_replication_quorum function prints * replication quorum property of this node to standard output. */ static void cli_get_node_replication_quorum(int argc, char **argv) { NodeReplicationSettings settings = { { 0 }, 0, false }; if (!get_node_replication_settings(&settings)) { log_error("Unable to get replication quorum value from monitor"); exit(EXIT_CODE_MONITOR); } if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Object *jsObj = json_value_get_object(js); json_object_set_string(jsObj, "name", settings.name); json_object_set_boolean(jsObj, "replication-quorum", settings.replicationQuorum); (void) cli_pprint_json(js); } else { fformat(stdout, "%s\n", boolToString(settings.replicationQuorum)); } } /* * cli_get_node_candidate_priority function prints * candidate priority property of this node to standard output. 
*/ static void cli_get_node_candidate_priority(int argc, char **argv) { NodeReplicationSettings settings = { { 0 }, 0, false }; if (!get_node_replication_settings(&settings)) { log_error("Unable to get candidate priority value from monitor"); exit(EXIT_CODE_MONITOR); } if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Object *jsObj = json_value_get_object(js); json_object_set_string(jsObj, "name", settings.name); json_object_set_number(jsObj, "candidate-priority", (double) settings.candidatePriority); (void) cli_pprint_json(js); } else { fformat(stdout, "%d\n", settings.candidatePriority); } } /* * cli_get_formation_settings function prints the replication settings for a * given formation. */ void cli_get_formation_settings(int argc, char **argv) { KeeperConfig config = keeperOptions; Monitor monitor = { 0 }; (void) cli_monitor_init_from_option_or_config(&monitor, &config); if (outputJSON) { if (!monitor_print_formation_settings_as_json(&monitor, config.formation)) { exit(EXIT_CODE_MONITOR); } } else { if (!monitor_print_formation_settings(&monitor, config.formation)) { exit(EXIT_CODE_MONITOR); } } } /* * cli_get_formation_number_sync_standbys function prints * number sync standbys property of this formation to standard output. 
*/ static void cli_get_formation_number_sync_standbys(int argc, char **argv) { KeeperConfig config = keeperOptions; Monitor monitor = { 0 }; int numberSyncStandbys = 0; (void) cli_monitor_init_from_option_or_config(&monitor, &config); if (!monitor_get_formation_number_sync_standbys(&monitor, config.formation, &numberSyncStandbys)) { log_error("Failed to get number-sync-standbys for formation \"%s\"", config.formation); exit(EXIT_CODE_MONITOR); } if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Object *jsObj = json_value_get_object(js); json_object_set_number(jsObj, "number-sync-standbys", (double) numberSyncStandbys); (void) cli_pprint_json(js); } else { fformat(stdout, "%d\n", numberSyncStandbys); } } /* * cli_set_node_replication_quorum sets the replication quorum property on the * monitor for current pg_autoctl node. */ static void cli_set_node_replication_quorum(int argc, char **argv) { Keeper keeper = { 0 }; Monitor *monitor = &(keeper.monitor); bool replicationQuorum = false; keeper.config = keeperOptions; if (argc != 1) { log_error("Failed to parse command line arguments: " "got %d when 1 is expected", argc); commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } if (!parse_bool(argv[0], &replicationQuorum)) { log_error("replication-quorum value %s is not valid." 
" Valid values are \"true\" or \"false.", argv[0]); exit(EXIT_CODE_BAD_ARGS); } (void) cli_monitor_init_from_option_or_config(monitor, &(keeper.config)); /* grab --name from either the command options or the configuration file */ (void) cli_ensure_node_name(&keeper); if (!set_node_replication_quorum(&keeper, replicationQuorum)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Object *jsObj = json_value_get_object(js); json_object_set_boolean(jsObj, "replication-quorum", replicationQuorum); (void) cli_pprint_json(js); } else { fformat(stdout, "%s\n", boolToString(replicationQuorum)); } } /* * cli_set_node_candidate_priority sets the candidate priority property on the * monitor for current pg_autoctl node. */ static void cli_set_node_candidate_priority(int argc, char **argv) { Keeper keeper = { 0 }; Monitor *monitor = &(keeper.monitor); keeper.config = keeperOptions; if (argc != 1) { log_error("Failed to parse command line arguments: " "got %d when 1 is expected", argc); commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } int candidatePriority = strtol(argv[0], NULL, 10); if (errno == EINVAL || candidatePriority < 0 || candidatePriority > 100) { log_error("candidate-priority value %s is not valid." " Valid values are integers from 0 to 100. 
", argv[0]); exit(EXIT_CODE_BAD_ARGS); } (void) cli_monitor_init_from_option_or_config(monitor, &(keeper.config)); /* grab --name from either the command options or the configuration file */ (void) cli_ensure_node_name(&keeper); if (!set_node_candidate_priority(&keeper, candidatePriority)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Object *jsObj = json_value_get_object(js); json_object_set_number(jsObj, "candidate-priority", (double) candidatePriority); (void) cli_pprint_json(js); } else { fformat(stdout, "%d\n", candidatePriority); } } /* * cli_set_node_metadata sets this pg_autoctl node name, hostname, and port on * the monitor. That's the hostname that is used by every other node in the * system to contact the local node, so it can be an IP address as well. */ static void cli_set_node_metadata(int argc, char **argv) { Keeper keeper = { 0 }; KeeperConfig *config = &(keeper.config); Monitor *monitor = &(keeper.monitor); bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = false; KeeperConfig oldConfig = { 0 }; /* initialize from the command lines options */ *config = keeperOptions; if (IS_EMPTY_STRING_BUFFER(keeperOptions.name) && IS_EMPTY_STRING_BUFFER(keeperOptions.hostname) && keeperOptions.pgSetup.pgport == 0) { log_error("Please use at least one of " "--nodename, --hostname, or --pgport"); commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } if (!file_exists(config->pathnames.config)) { log_error("Failed to read configuration file \"%s\"", config->pathnames.config); } if (!keeper_config_read_file(config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { log_fatal("Failed to read configuration file \"%s\"", config->pathnames.config); exit(EXIT_CODE_BAD_CONFIG); } if (config->monitorDisabled) { log_error("This node has disabled monitor"); exit(EXIT_CODE_BAD_CONFIG); } /* keep a copy */ oldConfig = *config; /* * Now that we 
have loaded the configuration file, apply the command * line options on top of it, giving them priority over the config. */ if (!keeper_config_merge_options(config, &keeperOptions)) { /* errors have been logged already */ exit(EXIT_CODE_BAD_CONFIG); } if (!monitor_init(monitor, config->monitor_pguri)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } if (!keeper_set_node_metadata(&keeper, &oldConfig)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } if (file_exists(config->pathnames.pid)) { if (!cli_pg_autoctl_reload(config->pathnames.pid)) { log_error("Failed to reload the pg_autoctl service, consider " "restarting it to implement the metadata changes"); exit(EXIT_CODE_INTERNAL_ERROR); } } if (outputJSON) { JSON_Value *js = json_value_init_object(); if (!keeper_config_to_json(config, js)) { log_fatal("Failed to serialize configuration to JSON"); exit(EXIT_CODE_BAD_CONFIG); } (void) cli_pprint_json(js); } } /* * cli_set_formation_property sets a formation property on the monitor * for a formation the current keeper node belongs to. */ static void cli_set_formation_number_sync_standbys(int argc, char **argv) { KeeperConfig config = keeperOptions; Monitor monitor = { 0 }; char synchronous_standby_names[BUFSIZE] = { 0 }; if (argc != 1) { log_error("Failed to parse command line arguments: " "got %d when 1 is expected", argc); commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } int numberSyncStandbys = strtol(argv[0], NULL, 10); if (errno == EINVAL || numberSyncStandbys < 0) { log_error("number-sync-standbys value %s is not valid." " Expected a non-negative integer value. 
", argv[0]); exit(EXIT_CODE_BAD_ARGS); } (void) cli_monitor_init_from_option_or_config(&monitor, &config); /* change the default group when it is still unknown */ if (config.groupId == -1) { config.groupId = 0; } if (!set_formation_number_sync_standbys(&monitor, config.formation, config.groupId, numberSyncStandbys)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } if (monitor_synchronous_standby_names( &monitor, config.formation, config.groupId, synchronous_standby_names, BUFSIZE)) { log_info("primary node has now set synchronous_standby_names = '%s'", synchronous_standby_names); } if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Object *jsObj = json_value_get_object(js); json_object_set_number(jsObj, "number-sync-standbys", (double) numberSyncStandbys); if (!IS_EMPTY_STRING_BUFFER(synchronous_standby_names)) { json_object_set_string(jsObj, "synchronous_standby_names", synchronous_standby_names); } (void) cli_pprint_json(js); } else { fformat(stdout, "%d\n", numberSyncStandbys); } } /* * set_node_candidate_priority sets the candidate priority on the monitor, and * if we have more than one node registered, waits until the primary has * applied the settings. */ static bool set_node_candidate_priority(Keeper *keeper, int candidatePriority) { KeeperConfig *config = &(keeper->config); CurrentNodeStateArray nodesArray = { 0 }; /* * There might be some race conditions here, but it's all to be * user-friendly so in the worst case we're going to be less friendly that * we could have. 
*/ if (!monitor_get_current_state(&(keeper->monitor), config->formation, config->groupId, &nodesArray)) { /* ignore the error, just don't wait in that case */ log_warn("Failed to get the list of all the nodes in formation \"%s\" " "from the monitor, see above for details", keeper->config.formation); } /* ignore the result of the filtering, worst case we don't wait */ (void) nodestateFilterArrayGroup(&nodesArray, config->name); /* listen for state changes BEFORE we apply new settings */ if (nodesArray.count > 1) { char *channels[] = { "state", NULL }; if (!pgsql_listen(&(keeper->monitor.notificationClient), channels)) { log_error("Failed to listen to state changes from the monitor"); return false; } } if (!monitor_set_node_candidate_priority(&(keeper->monitor), config->formation, config->name, candidatePriority)) { log_error("Failed to set \"candidate-priority\" to \"%d\".", candidatePriority); return false; } /* now wait until the primary actually applied the new setting */ if (nodesArray.count > 1) { if (!monitor_wait_until_primary_applied_settings( &(keeper->monitor), config->formation)) { log_error("Failed to wait until the new setting has been applied"); return false; } } return true; } /* * set_node_replication_quorum sets the replication quorum on the monitor, and * if we have more than one node registered, waits until the primary has * applied the settings. */ static bool set_node_replication_quorum(Keeper *keeper, bool replicationQuorum) { KeeperConfig *config = &(keeper->config); NodeAddressArray nodesArray = { 0 }; /* * There might be some race conditions here, but it's all to be * user-friendly so in the worst case we're going to be less friendly that * we could have. 
*/ if (!monitor_get_nodes(&(keeper->monitor), config->formation, config->groupId, &nodesArray)) { /* ignore the error, just don't wait in that case */ log_warn("Failed to get_nodes() on the monitor"); } /* listen for state changes BEFORE we apply new settings */ if (nodesArray.count > 1) { char *channels[] = { "state", NULL }; if (!pgsql_listen(&(keeper->monitor.notificationClient), channels)) { log_error("Failed to listen to state changes from the monitor"); return false; } } if (!monitor_set_node_replication_quorum(&(keeper->monitor), config->formation, config->name, replicationQuorum)) { log_error("Failed to set \"replication-quorum\" to \"%s\".", boolToString(replicationQuorum)); return false; } /* now wait until the primary actually applied the new setting */ if (nodesArray.count > 1) { if (!monitor_wait_until_primary_applied_settings( &(keeper->monitor), config->formation)) { log_error("Failed to wait until the new setting has been applied"); return false; } } return true; } /* * set_node_replication_quorum sets the number_sync_standbys on the monitor, * and if we have more than one node registered in the target formation, waits * until the primary has applied the settings. */ static bool set_formation_number_sync_standbys(Monitor *monitor, char *formation, int groupId, int numberSyncStandbys) { NodeAddressArray nodesArray = { 0 }; /* * There might be some race conditions here, but it's all to be * user-friendly so in the worst case we're going to be less friendly that * we could have. 
*/ if (!monitor_get_nodes(monitor, formation, groupId, &nodesArray)) { /* ignore the error, just don't wait in that case */ log_warn("Failed to get_nodes() on the monitor"); } /* listen for state changes BEFORE we apply new settings */ if (nodesArray.count > 1) { char *channels[] = { "state", NULL }; if (!pgsql_listen(&(monitor->notificationClient), channels)) { log_error("Failed to listen to state changes from the monitor"); return false; } } /* set the new number_sync_standbys value */ if (!monitor_set_formation_number_sync_standbys( monitor, formation, numberSyncStandbys)) { log_error("Failed to set \"number-sync-standbys\" to \"%d\".", numberSyncStandbys); return false; } /* now wait until the primary actually applied the new setting */ if (nodesArray.count > 1) { if (!monitor_wait_until_primary_applied_settings( monitor, formation)) { log_error("Failed to wait until the new setting has been applied"); return false; } } return true; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_perform.c000066400000000000000000000335531414244367200232330ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_perform.c * Implementation of the pg_autoctl perform CLI for the pg_auto_failover * nodes (monitor, coordinator, worker, postgres). * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include "cli_common.h" #include "commandline.h" #include "defaults.h" #include "env_utils.h" #include "ini_file.h" #include "keeper_config.h" #include "keeper.h" #include "monitor.h" #include "monitor_config.h" #include "string_utils.h" static int cli_perform_failover_getopts(int argc, char **argv); static void cli_perform_failover(int argc, char **argv); static int cli_perform_promotion_getopts(int argc, char **argv); static void cli_perform_promotion(int argc, char **argv); CommandLine perform_failover_command = make_command("failover", "Perform a failover for given formation and group", " [ --pgdata --formation --group ] ", " --pgdata path to data directory\n" " --formation formation to target, defaults to 'default'\n" " --group group to target, defaults to 0\n" " --wait how many seconds to wait, default to 60 \n", cli_perform_failover_getopts, cli_perform_failover); CommandLine perform_switchover_command = make_command("switchover", "Perform a switchover for given formation and group", " [ --pgdata --formation --group ] ", " --pgdata path to data directory\n" " --formation formation to target, defaults to 'default'\n" " --group group to target, defaults to 0\n" " --wait how many seconds to wait, default to 60 \n", cli_perform_failover_getopts, cli_perform_failover); CommandLine perform_promotion_command = make_command("promotion", "Perform a failover that promotes a target node", " [ --pgdata --formation --group ] --name ", " --pgdata path to data directory\n" " --formation formation to target, defaults to 'default' \n" " --name node name to target, defaults to current node\n" " --wait how many seconds to wait, default to 60 \n", cli_perform_promotion_getopts, cli_perform_promotion); CommandLine *perform_subcommands[] = { &perform_failover_command, &perform_switchover_command, &perform_promotion_command, NULL, }; CommandLine perform_commands = make_command_set("perform", "Perform an action orchestrated by the monitor", NULL, NULL, NULL, 
perform_subcommands); /* * cli_perform_failover_getopts parses the command line options for the * command `pg_autoctl perform failover`. */ static int cli_perform_failover_getopts(int argc, char **argv) { KeeperConfig options = { 0 }; int c, option_index = 0, errors = 0; int verboseCount = 0; static struct option long_options[] = { { "pgdata", required_argument, NULL, 'D' }, { "monitor", required_argument, NULL, 'm' }, { "formation", required_argument, NULL, 'f' }, { "group", required_argument, NULL, 'g' }, { "wait", required_argument, NULL, 'w' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; /* set default values for our options, when we have some */ options.groupId = -1; options.network_partition_timeout = -1; options.prepare_promotion_catchup = -1; options.prepare_promotion_walreceiver = -1; options.postgresql_restart_failure_timeout = -1; options.postgresql_restart_failure_max_retries = -1; options.listen_notifications_timeout = PG_AUTOCTL_LISTEN_NOTIFICATIONS_TIMEOUT; /* do not set a default formation, it should be found in the config file */ optind = 0; while ((c = getopt_long(argc, argv, "D:f:g:n:Vvqh", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'm': { if (!validate_connection_string(optarg)) { log_fatal("Failed to parse --monitor connection string, " "see above for details."); exit(EXIT_CODE_BAD_ARGS); } strlcpy(options.monitor_pguri, optarg, MAXCONNINFO); log_trace("--monitor %s", options.monitor_pguri); break; } case 'f': { strlcpy(options.formation, optarg, NAMEDATALEN); log_trace("--formation %s", options.formation); break; } case 'g': { if (!stringToInt(optarg, &options.groupId)) { log_fatal("--group argument is not a valid group ID: \"%s\"", optarg); exit(EXIT_CODE_BAD_ARGS); } 
log_trace("--group %d", options.groupId); break; } case 'w': { if (!stringToInt(optarg, &options.listen_notifications_timeout)) { log_fatal("--wait argument is not a valid timeout: \"%s\"", optarg); exit(EXIT_CODE_BAD_ARGS); } log_trace("--wait %d", options.listen_notifications_timeout); break; } case 'V': { /* keeper_cli_print_version prints version and exits. */ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } default: { /* getopt_long already wrote an error message */ errors++; } } } if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } /* when we have a monitor URI we don't need PGDATA */ if (cli_use_monitor_option(&options)) { if (!IS_EMPTY_STRING_BUFFER(options.pgSetup.pgdata)) { log_warn("Given --monitor URI, the --pgdata option is ignored"); log_info("Connecting to monitor at \"%s\"", options.monitor_pguri); } /* the rest of the program needs pgdata actually empty */ bzero((void *) options.pgSetup.pgdata, sizeof(options.pgSetup.pgdata)); } else { cli_common_get_set_pgdata_or_exit(&(options.pgSetup)); if (!keeper_config_set_pathnames_from_pgdata(&(options.pathnames), options.pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } } /* ensure --formation, or get it from the configuration file */ if (!cli_common_ensure_formation(&options)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } keeperOptions = options; return optind; } /* * cli_perform_failover calls the SQL function * pgautofailover.perform_failover() on the monitor. 
*/ static void cli_perform_failover(int argc, char **argv) { KeeperConfig config = keeperOptions; Monitor monitor = { 0 }; char *channels[] = { "state", NULL }; (void) cli_monitor_init_from_option_or_config(&monitor, &config); (void) cli_set_groupId(&monitor, &config); /* start listening to the state changes before we call perform_failover */ if (!pgsql_listen(&(monitor.notificationClient), channels)) { log_error("Failed to listen to state changes from the monitor"); exit(EXIT_CODE_MONITOR); } if (!monitor_perform_failover(&monitor, config.formation, config.groupId)) { log_fatal("Failed to perform failover/switchover, " "see above for details"); exit(EXIT_CODE_MONITOR); } /* process state changes notification until we have a new primary */ if (!monitor_wait_until_some_node_reported_state( &monitor, config.formation, config.groupId, config.pgSetup.pgKind, PRIMARY_STATE, config.listen_notifications_timeout)) { log_error("Failed to wait until a new primary has been notified"); exit(EXIT_CODE_INTERNAL_ERROR); } } /* * cli_perform_promotion_getopts parses the command line options for the * command `pg_autoctl perform promotion` command. 
*/ static int cli_perform_promotion_getopts(int argc, char **argv) { KeeperConfig options = { 0 }; int c, option_index = 0, errors = 0; int verboseCount = 0; static struct option long_options[] = { { "pgdata", required_argument, NULL, 'D' }, { "monitor", required_argument, NULL, 'm' }, { "formation", required_argument, NULL, 'f' }, { "name", required_argument, NULL, 'a' }, { "wait", required_argument, NULL, 'w' }, { "json", no_argument, NULL, 'J' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; /* set default values for our options, when we have some */ options.groupId = -1; options.network_partition_timeout = -1; options.prepare_promotion_catchup = -1; options.prepare_promotion_walreceiver = -1; options.postgresql_restart_failure_timeout = -1; options.postgresql_restart_failure_max_retries = -1; options.listen_notifications_timeout = PG_AUTOCTL_LISTEN_NOTIFICATIONS_TIMEOUT; optind = 0; /* * The only command lines that are using keeper_cli_getopt_pgdata are * terminal ones: they don't accept subcommands. In that case our option * parsing can happen in any order and we don't need getopt_long to behave * in a POSIXLY_CORRECT way. * * The unsetenv() call allows getopt_long() to reorder arguments for us. 
*/ unsetenv("POSIXLY_CORRECT"); while ((c = getopt_long(argc, argv, "D:f:g:n:Vvqh", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'm': { if (!validate_connection_string(optarg)) { log_fatal("Failed to parse --monitor connection string, " "see above for details."); exit(EXIT_CODE_BAD_ARGS); } strlcpy(options.monitor_pguri, optarg, MAXCONNINFO); log_trace("--monitor %s", options.monitor_pguri); break; } case 'f': { strlcpy(options.formation, optarg, NAMEDATALEN); log_trace("--formation %s", options.formation); break; } case 'a': { /* { "name", required_argument, NULL, 'a' }, */ strlcpy(options.name, optarg, _POSIX_HOST_NAME_MAX); log_trace("--name %s", options.name); break; } case 'w': { /* { "wait", required_argument, NULL, 'w' }, */ if (!stringToInt(optarg, &options.listen_notifications_timeout)) { log_fatal("--wait argument is not a valid timeout: \"%s\"", optarg); exit(EXIT_CODE_BAD_ARGS); } log_trace("--wait %d", options.listen_notifications_timeout); break; } case 'V': { /* keeper_cli_print_version prints version and exits. 
*/ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } case 'J': { outputJSON = true; log_trace("--json"); break; } default: { /* getopt_long already wrote an error message */ errors++; } } } if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } /* now that we have the command line parameters, prepare the options */ /* when we have a monitor URI we don't need PGDATA */ if (cli_use_monitor_option(&options)) { if (!IS_EMPTY_STRING_BUFFER(options.pgSetup.pgdata)) { log_warn("Given --monitor URI, the --pgdata option is ignored"); log_info("Connecting to monitor at \"%s\"", options.monitor_pguri); /* the rest of the program needs pgdata actually empty */ bzero((void *) options.pgSetup.pgdata, sizeof(options.pgSetup.pgdata)); } } else { (void) prepare_keeper_options(&options); } /* ensure --formation, or get it from the configuration file */ if (!cli_common_ensure_formation(&options)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } /* publish our option parsing in the global variable */ keeperOptions = options; return optind; } /* * cli_perform_promotion calls the function pgautofailover.perform_promotion() * on the monitor. 
*/ static void cli_perform_promotion(int argc, char **argv) { Keeper keeper = { 0 }; Monitor *monitor = &(keeper.monitor); KeeperConfig *config = &(keeper.config); int groupId = 0; PgInstanceKind nodeKind = NODE_KIND_UNKNOWN; char *channels[] = { "state", NULL }; keeper.config = keeperOptions; (void) cli_monitor_init_from_option_or_config(monitor, config); /* grab --name from either the command options or the configuration file */ (void) cli_ensure_node_name(&keeper); if (!monitor_get_groupId_from_name(monitor, config->formation, config->name, &groupId)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } /* start listening to the state changes before we call perform_promotion */ if (!pgsql_listen(&(monitor->notificationClient), channels)) { log_error("Failed to listen to state changes from the monitor"); exit(EXIT_CODE_MONITOR); } /* * pgautofailover.perform_promotion returns true when a promotion has been * triggered, and false when none was necessary. When an error occurs, it * reports an error condition, which is logged about already. */ if (monitor_perform_promotion(monitor, config->formation, config->name)) { /* process state changes notification until we have a new primary */ if (!monitor_wait_until_some_node_reported_state( monitor, config->formation, groupId, nodeKind, PRIMARY_STATE, config->listen_notifications_timeout)) { log_error("Failed to wait until a new primary has been notified"); exit(EXIT_CODE_INTERNAL_ERROR); } } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_root.c000066400000000000000000000110401414244367200225270ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_root.c * Implementation of a CLI which lets you run individual keeper routines * directly * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include "cli_common.h" #include "cli_root.h" #include "commandline.h" /* local bindings for all the commands */ CommandLine help = make_command("help", "print help message", "", "", NULL, keeper_cli_help); CommandLine version = make_command("version", "print pg_autoctl version", "", "", cli_print_version_getopts, keeper_cli_print_version); /* non-local to be able to patch it from other files */ CommandLine *create_subcommands[] = { &create_monitor_command, &create_postgres_command, &create_formation_command, NULL }; CommandLine create_commands = make_command_set("create", "Create a pg_auto_failover node, or formation", NULL, NULL, NULL, create_subcommands); CommandLine *show_subcommands_with_debug[] = { &show_uri_command, &show_events_command, &show_state_command, &show_settings_command, &show_standby_names_command, &show_file_command, &systemd_cat_service_file_command, NULL }; CommandLine show_commands_with_debug = make_command_set("show", "Show pg_auto_failover information", NULL, NULL, NULL, show_subcommands_with_debug); CommandLine *show_subcommands[] = { &show_uri_command, &show_events_command, &show_state_command, &show_settings_command, &show_standby_names_command, &show_file_command, &systemd_cat_service_file_command, NULL }; CommandLine show_commands = make_command_set("show", "Show pg_auto_failover information", NULL, NULL, NULL, show_subcommands); CommandLine *drop_subcommands[] = { &drop_monitor_command, &drop_node_command, &drop_formation_command, NULL }; CommandLine drop_commands = make_command_set("drop", "Drop a pg_auto_failover node, or formation", NULL, NULL, NULL, drop_subcommands); /* * Binding them all into the top-level command: */ CommandLine *root_subcommands_with_debug[] = { &create_commands, &drop_commands, &config_commands, &show_commands_with_debug, &enable_commands, &disable_commands, &get_commands, &set_commands, &perform_commands, &do_commands, &service_run_command, &watch_command, &service_stop_command, &service_reload_command, 
&service_status_command, &help, &version, NULL }; CommandLine root_with_debug = make_command_set("pg_autoctl", "pg_auto_failover control tools and service", "[ --debug|verbose|quiet ]", NULL, root_options, root_subcommands_with_debug); CommandLine *root_subcommands[] = { &create_commands, &drop_commands, &config_commands, &show_commands, &enable_commands, &disable_commands, &get_commands, &set_commands, &perform_commands, &service_run_command, &watch_command, &service_stop_command, &service_reload_command, &service_status_command, &help, &version, NULL }; CommandLine root = make_command_set("pg_autoctl", "pg_auto_failover control tools and service", "[ --verbose --quiet ]", NULL, root_options, root_subcommands); /* * root_options parses flags from the list of arguments that are common to all * commands. */ int root_options(int argc, char **argv) { int verboseCount = 0; bool printVersion = false; static struct option long_options[] = { { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "json", no_argument, NULL, 'J' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; int c, option_index, errors = 0; optind = 0; while ((c = getopt_long(argc, argv, "JVvqh", long_options, &option_index)) != -1) { switch (c) { case 'J': { outputJSON = true; log_trace("--json"); break; } case 'V': { printVersion = true; break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } default: { /* getopt_long already wrote an error message */ errors++; break; } } } if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } if (printVersion) { keeper_cli_print_version(argc, argv); } return optind; } 
pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_root.h000066400000000000000000000022231414244367200225370ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_root.h * Implementation of a CLI which lets you run individual keeper routines * directly * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef CLI_ROOT_H #define CLI_ROOT_H #include "commandline.h" #include "lock_utils.h" extern char pg_autoctl_argv0[]; extern char pg_autoctl_program[]; extern int pgconnect_timeout; extern int logLevel; extern Semaphore log_semaphore; extern char *ps_buffer; extern size_t ps_buffer_size; extern size_t last_status_len; extern CommandLine help; extern CommandLine version; extern CommandLine create_commands; extern CommandLine *create_subcommands[]; extern CommandLine show_commands; extern CommandLine *show_subcommands[]; extern CommandLine show_commands_with_debug; extern CommandLine *show_subcommands_with_debug[]; extern CommandLine drop_commands; extern CommandLine *drop_subcommands[]; extern CommandLine root_with_debug; extern CommandLine *root_subcommands_with_debug[]; extern CommandLine root; extern CommandLine *root_subcommands[]; int root_options(int argc, char **argv); #endif /* CLI_ROOT_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_service.c000066400000000000000000000321071414244367200232130ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_service.c * Implementation of a CLI for controlling the pg_autoctl service. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include #include "postgres_fe.h" #include "cli_common.h" #include "commandline.h" #include "defaults.h" #include "env_utils.h" #include "fsm.h" #include "keeper_config.h" #include "keeper.h" #include "monitor.h" #include "monitor_config.h" #include "pidfile.h" #include "service_keeper.h" #include "service_monitor.h" #include "signals.h" static int stop_signal = SIGTERM; static void cli_service_run(int argc, char **argv); static void cli_keeper_run(int argc, char **argv); static void cli_monitor_run(int argc, char **argv); static int cli_getopt_pgdata_and_mode(int argc, char **argv); static void cli_service_stop(int argc, char **argv); static void cli_service_reload(int argc, char **argv); static void cli_service_status(int argc, char **argv); CommandLine service_run_command = make_command("run", "Run the pg_autoctl service (monitor or keeper)", " [ --pgdata --nodename --hostname --pgport ] ", " --pgdata path to data directory\n" " --nodename pg_auto_failover node name\n" " --hostname hostname used to connect from other nodes\n" " --pgport PostgreSQL's port number\n", cli_node_metadata_getopts, cli_service_run); CommandLine service_stop_command = make_command("stop", "signal the pg_autoctl service for it to stop", " [ --pgdata --fast --immediate ]", " --pgdata path to data directory \n" " --fast fast shutdown mode for the keeper \n" " --immediate immediate shutdown mode for the keeper \n", cli_getopt_pgdata_and_mode, cli_service_stop); CommandLine service_reload_command = make_command("reload", "signal the pg_autoctl for it to reload its configuration", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_service_reload); CommandLine service_status_command = make_command("status", "Display the current status of the pg_autoctl service", CLI_PGDATA_USAGE, CLI_PGDATA_OPTION, cli_getopt_pgdata, cli_service_status); /* * cli_service_run starts the local pg_auto_failover service, either the * monitor or the keeper, 
depending on the configuration file associated with * the current PGDATA, or the --pgdata argument. */ static void cli_service_run(int argc, char **argv) { KeeperConfig config = keeperOptions; if (!keeper_config_set_pathnames_from_pgdata(&config.pathnames, config.pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } switch (ProbeConfigurationFileRole(config.pathnames.config)) { case PG_AUTOCTL_ROLE_MONITOR: { (void) cli_monitor_run(argc, argv); break; } case PG_AUTOCTL_ROLE_KEEPER: { (void) cli_keeper_run(argc, argv); break; } default: { log_fatal("Unrecognized configuration file \"%s\"", config.pathnames.config); exit(EXIT_CODE_INTERNAL_ERROR); } } } /* * keeper_cli_fsm_run runs the keeper state machine in an infinite * loop. */ static void cli_keeper_run(int argc, char **argv) { Keeper keeper = { 0 }; Monitor *monitor = &(keeper.monitor); KeeperConfig *config = &(keeper.config); PostgresSetup *pgSetup = &(keeper.config.pgSetup); LocalPostgresServer *postgres = &(keeper.postgres); /* in case --name, --hostname, or --pgport are used */ KeeperConfig oldConfig = { 0 }; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; keeper.config = keeperOptions; /* initialize our pgSetup and LocalPostgresServer instances */ if (!keeper_config_read_file(config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged. */ exit(EXIT_CODE_BAD_CONFIG); } /* keep a copy */ oldConfig = *config; /* * Now that we have loaded the configuration file, apply the command * line options on top of it, giving them priority over the config. 
*/ if (!keeper_config_merge_options(config, &keeperOptions)) { /* errors have been logged already */ exit(EXIT_CODE_BAD_CONFIG); } if (!config->monitorDisabled) { if (!monitor_init(monitor, config->monitor_pguri)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } /* * Handle the pg_autoctl run options: --name, --hostname, --pgport. * * When those options have been used, then the configuration file has * been merged with the command line values, and we can update the * metadata for this node on the monitor. */ if (!keeper_set_node_metadata(&keeper, &oldConfig)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } /* * Now, at 1.3 to 1.4 upgrade, the monitor assigns a new name to * pg_autoctl nodes, which did not use to have a name before. In that * case, and then pg_autoctl run has been used without options, our * name might be empty here. We then need to fetch it from the monitor. */ if (!keeper_update_nodename_from_monitor(&keeper)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } /* we don't keep a connection to the monitor in this process */ pgsql_finish(&(monitor->pgsql)); } /* initialize our local Postgres instance representation */ (void) local_postgres_init(postgres, pgSetup); if (!start_keeper(&keeper)) { log_fatal("Failed to start pg_autoctl keeper service, " "see above for details"); exit(EXIT_CODE_INTERNAL_ERROR); } } /* * cli_monitor_run ensures PostgreSQL is running and then listens for state * changes from the monitor, logging them as INFO messages. Also listens for * log messages from the monitor, and outputs them as DEBUG messages. 
*/ static void cli_monitor_run(int argc, char **argv) { KeeperConfig options = keeperOptions; Monitor monitor = { 0 }; bool missingPgdataIsOk = false; bool pgIsNotRunningIsOk = true; /* Prepare MonitorConfig from the CLI options fed in options */ if (!monitor_config_init_from_pgsetup(&(monitor.config), &options.pgSetup, missingPgdataIsOk, pgIsNotRunningIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_PGCTL); } /* Start the monitor service */ if (!start_monitor(&monitor)) { log_fatal("Failed to start pg_autoctl monitor service, " "see above for details"); exit(EXIT_CODE_INTERNAL_ERROR); } } /* * service_cli_reload sends a SIGHUP signal to the keeper. */ static void cli_service_reload(int argc, char **argv) { pid_t pid; Keeper keeper = { 0 }; keeper.config = keeperOptions; if (read_pidfile(keeper.config.pathnames.pid, &pid)) { if (kill(pid, SIGHUP) != 0) { log_error("Failed to send SIGHUP to pg_autoctl pid %d: %m", pid); exit(EXIT_CODE_INTERNAL_ERROR); } } } /* * cli_getopt_pgdata_and_mode gets both the --pgdata and the stopping mode * options (either --fast or --immediate) from the command line. */ static int cli_getopt_pgdata_and_mode(int argc, char **argv) { KeeperConfig options = { 0 }; int c, option_index = 0; int verboseCount = 0; static struct option long_options[] = { { "pgdata", required_argument, NULL, 'D' }, { "fast", no_argument, NULL, 'f' }, { "immediate", no_argument, NULL, 'i' }, { "sigkill", no_argument, NULL, '9' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; optind = 0; while ((c = getopt_long(argc, argv, "D:fiVvqh", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'f': { /* change the signal to send from SIGTERM to SIGINT. 
*/ if (stop_signal != SIGTERM) { log_fatal("Please use either --fast or --immediate, not both"); exit(EXIT_CODE_BAD_ARGS); } stop_signal = SIGINT; break; } case 'i': { /* change the signal to send from SIGTERM to SIGQUIT. */ if (stop_signal != SIGTERM) { log_fatal("Please use either --fast or --immediate, not both"); exit(EXIT_CODE_BAD_ARGS); } stop_signal = SIGQUIT; break; } case '9': { /* change the signal to send from SIGTERM to SIGKILL */ if (!env_exists(PG_AUTOCTL_DEBUG)) { log_fatal("Option --sigkill is only available in debug " "environments"); exit(EXIT_CODE_BAD_ARGS); break; } if (stop_signal != SIGTERM) { log_fatal("Please use only one of either " " --sigkill, --fast or --immediate"); exit(EXIT_CODE_BAD_ARGS); } stop_signal = SIGKILL; break; } case 'V': { /* keeper_cli_print_version prints version and exits. */ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } default: { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); break; } } } /* now that we have the command line parameters, prepare the options */ (void) prepare_keeper_options(&options); keeperOptions = options; return optind; } /* * cli_service_stop sends a SIGTERM signal to the keeper. */ static void cli_service_stop(int argc, char **argv) { pid_t pid; Keeper keeper = { 0 }; keeper.config = keeperOptions; if (read_pidfile(keeper.config.pathnames.pid, &pid)) { /* * Send the signal to the top-level process only, except when using * --sigkill and then `kill -9`. The intend there is to trigger a crash * of Postgres and pg_autoctl and see how we recover from it. Target * the whole process group then. 
*/ if (stop_signal == SIGKILL) { if (killpg(pid, stop_signal) != 0) { log_error("Failed to send %s to pg_autoctl pid %d: %m", strsignal(stop_signal), pid); exit(EXIT_CODE_INTERNAL_ERROR); } } else { if (kill(pid, stop_signal) != 0) { log_error("Failed to send %s to pg_autoctl pid %d: %m", strsignal(stop_signal), pid); exit(EXIT_CODE_INTERNAL_ERROR); } } } else { log_fatal("Failed to read the keeper's PID at \"%s\"", keeper.config.pathnames.pid); exit(EXIT_CODE_INTERNAL_ERROR); } } /* * cli_service_status displays the status of the pg_autoctl service and the * Postgres service. */ static void cli_service_status(int argc, char **argv) { pid_t pid = 0; Keeper keeper = { 0 }; PostgresSetup *pgSetup = &(keeper.config.pgSetup); ConfigFilePaths *pathnames = &(keeper.config.pathnames); keeper.config = keeperOptions; if (!cli_common_pgsetup_init(pathnames, pgSetup)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (!file_exists(pathnames->pid)) { log_debug("pg_autoctl pid file \"%s\" does not exist", pathnames->pid); /* * pg_autoctl should be the parent process of Postgres. That said, when * in maintenance, operators could stop pg_autoctl and then start/stop * Postgres to make some configuration changes, and then use pg_autoctl * again. * * So check if Postgres is running, and complain about it when it's the * case and pg_autoctl is not running, as it will get in the way when * starting pg_autoctl again. 
*/ if (pg_setup_is_running(pgSetup)) { log_fatal("Postgres is running at \"%s\" with pid %d", pgSetup->pgdata, pgSetup->pidFile.pid); } log_info("pg_autoctl is not running at \"%s\"", pgSetup->pgdata); exit(PG_CTL_STATUS_NOT_RUNNING); } /* ok now we have a pidfile for pg_autoctl */ if (!read_pidfile(pathnames->pid, &pid)) { exit(PG_CTL_STATUS_NOT_RUNNING); } /* and now we know pg_autoctl is running */ log_info("pg_autoctl is running with pid %d", pid); /* add a word about the Postgres service itself */ if (pg_setup_is_ready(pgSetup, false)) { log_info("Postgres is serving PGDATA \"%s\" on port %d with pid %d", pgSetup->pgdata, pgSetup->pgport, pgSetup->pidFile.pid); } else { exit(EXIT_CODE_PGCTL); } if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Value *jsPGAutoCtl = json_value_init_object(); JSON_Value *jsPostgres = json_value_init_object(); JSON_Object *root = json_value_get_object(js); bool includeStatus = true; pidfile_as_json(jsPGAutoCtl, pathnames->pid, includeStatus); if (!pg_setup_as_json(pgSetup, jsPostgres)) { /* can't happen */ exit(EXIT_CODE_INTERNAL_ERROR); } /* concatenate JSON objects into a container object */ json_object_set_value(root, "postgres", jsPostgres); json_object_set_value(root, "pg_autoctl", jsPGAutoCtl); (void) cli_pprint_json(js); } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_show.c000066400000000000000000001206731414244367200225410ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_show.c * Implementation of a CLI to show events, states, and URI from the * pg_auto_failover monitor. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include "postgres_fe.h" #include "cli_common.h" #include "commandline.h" #include "defaults.h" #include "env_utils.h" #include "ipaddr.h" #include "keeper_config.h" #include "keeper.h" #include "monitor_config.h" #include "monitor_pg_init.h" #include "monitor.h" #include "nodestate_utils.h" #include "parsing.h" #include "pgctl.h" #include "pghba.h" #include "pgsetup.h" #include "pgsql.h" #include "pidfile.h" #include "state.h" #include "string_utils.h" #include "watch.h" static int eventCount = 10; static bool localState = false; static bool watch = false; static int cli_show_state_getopts(int argc, char **argv); static void cli_show_state(int argc, char **argv); static void cli_show_local_state(void); static void cli_show_events(int argc, char **argv); static int cli_show_standby_names_getopts(int argc, char **argv); static void cli_show_standby_names(int argc, char **argv); static int cli_show_file_getopts(int argc, char **argv); static void cli_show_file(int argc, char **argv); static bool fprint_file_contents(const char *filename); static int cli_show_uri_getopts(int argc, char **argv); static void cli_show_uri(int argc, char **argv); static void print_monitor_uri(Monitor *monitor, FILE *stream); static void print_formation_uri(SSLOptions *ssl, Monitor *monitor, const char *formation, const char *citusClusterName, FILE *stream); static void print_all_uri(SSLOptions *ssl, Monitor *monitor, FILE *stream); CommandLine show_uri_command = make_command("uri", "Show the postgres uri to use to connect to pg_auto_failover nodes", " [ --pgdata --monitor --formation --json ] ", " --pgdata path to data directory\n" " --monitor show the monitor uri\n" " --formation show the coordinator uri of given formation\n" " --json output data in the JSON format\n", cli_show_uri_getopts, cli_show_uri); CommandLine show_events_command = make_command("events", "Prints monitor's state of nodes in a given formation and group", " [ --pgdata --formation 
--group --count ] ", " --pgdata path to data directory \n" " --monitor pg_auto_failover Monitor Postgres URL\n" \ " --formation formation to query, defaults to 'default' \n" " --group group to query formation, defaults to all \n" " --count how many events to fetch, defaults to 10 \n" " --watch display an auto-updating dashboard\n" " --json output data in the JSON format\n", cli_show_state_getopts, cli_show_events); CommandLine show_state_command = make_command("state", "Prints monitor's state of nodes in a given formation and group", " [ --pgdata --formation --group ] ", " --pgdata path to data directory \n" " --monitor show the monitor uri\n" " --formation formation to query, defaults to 'default' \n" " --group group to query formation, defaults to all \n" " --local show local data, do not connect to the monitor\n" " --watch display an auto-updating dashboard\n" " --json output data in the JSON format\n", cli_show_state_getopts, cli_show_state); CommandLine show_settings_command = make_command("settings", "Print replication settings for a formation from the monitor", " [ --pgdata ] [ --json ] [ --formation ] ", " --pgdata path to data directory\n" " --monitor pg_auto_failover Monitor Postgres URL\n" " --json output data in the JSON format\n" " --formation pg_auto_failover formation\n", cli_get_name_getopts, cli_get_formation_settings); CommandLine show_standby_names_command = make_command("standby-names", "Prints synchronous_standby_names for a given group", " [ --pgdata ] --formation --group", " --pgdata path to data directory \n" " --monitor show the monitor uri\n" " --formation formation to query, defaults to 'default'\n" " --group group to query formation, defaults to all\n" " --json output data in the JSON format\n", cli_show_standby_names_getopts, cli_show_standby_names); CommandLine show_file_command = make_command("file", "List pg_autoctl internal files (config, state, pid)", " [ --pgdata --all --config | --state | --init | --pid --contents ]", " --pgdata 
path to data directory \n" " --all show all pg_autoctl files \n" " --config show pg_autoctl configuration file \n" " --state show pg_autoctl state file \n" " --init show pg_autoctl initialisation state file \n" " --pid show pg_autoctl PID file \n" " --contents show selected file contents \n", cli_show_file_getopts, cli_show_file); typedef enum { SHOW_FILE_UNKNOWN = 0, /* no option selected yet */ SHOW_FILE_ALL, /* --all, or no option at all */ SHOW_FILE_CONFIG, SHOW_FILE_STATE, SHOW_FILE_INIT, SHOW_FILE_PID } ShowFileSelection; typedef struct ShowFileOptions { bool showFileContents; ShowFileSelection selection; } ShowFileOptions; static ShowFileOptions showFileOptions; typedef struct ShowUriOptions { bool monitorOnly; char formation[NAMEDATALEN]; char citusClusterName[NAMEDATALEN]; } ShowUriOptions; static ShowUriOptions showUriOptions = { 0 }; /* * keeper_cli_monitor_state_getopts parses the command line options for the * command `pg_autoctl show state`. */ static int cli_show_state_getopts(int argc, char **argv) { KeeperConfig options = { 0 }; int c, option_index = 0, errors = 0; int verboseCount = 0; static struct option long_options[] = { { "pgdata", required_argument, NULL, 'D' }, { "monitor", required_argument, NULL, 'm' }, { "formation", required_argument, NULL, 'f' }, { "group", required_argument, NULL, 'g' }, { "count", required_argument, NULL, 'n' }, { "local", no_argument, NULL, 'L' }, { "watch", no_argument, NULL, 'W' }, { "json", no_argument, NULL, 'J' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; /* set default values for our options, when we have some */ options.groupId = -1; options.network_partition_timeout = -1; options.prepare_promotion_catchup = -1; options.prepare_promotion_walreceiver = -1; options.postgresql_restart_failure_timeout = -1; options.postgresql_restart_failure_max_retries = -1; optind = 0; while ((c = 
getopt_long(argc, argv, "D:f:g:n:Vvqh", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'm': { if (!validate_connection_string(optarg)) { log_fatal("Failed to parse --monitor connection string, " "see above for details."); exit(EXIT_CODE_BAD_ARGS); } strlcpy(options.monitor_pguri, optarg, MAXCONNINFO); log_trace("--monitor %s", options.monitor_pguri); break; } case 'f': { strlcpy(options.formation, optarg, NAMEDATALEN); log_trace("--formation %s", options.formation); break; } case 'g': { if (!stringToInt(optarg, &options.groupId)) { log_fatal("--group argument is not a valid group ID: \"%s\"", optarg); exit(EXIT_CODE_BAD_ARGS); } log_trace("--group %d", options.groupId); break; } case 'n': { if (!stringToInt(optarg, &eventCount)) { log_fatal("--count argument is not a valid count: \"%s\"", optarg); exit(EXIT_CODE_BAD_ARGS); } log_trace("--count %d", eventCount); break; } case 'V': { /* keeper_cli_print_version prints version and exits. 
*/ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } case 'L': { localState = true; log_trace("--local"); break; } case 'W': { watch = true; log_trace("--watch"); break; } case 'J': { outputJSON = true; log_trace("--json"); break; } default: { /* getopt_long already wrote an error message */ errors++; } } } if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } if (watch && localState) { log_error("Please use either --local or --watch, but not both"); exit(EXIT_CODE_BAD_ARGS); } if (watch && outputJSON) { log_error("Please use either --json or --watch, but not both"); exit(EXIT_CODE_BAD_ARGS); } if (localState) { cli_common_get_set_pgdata_or_exit(&(options.pgSetup)); } else { /* when we have a monitor URI we don't need PGDATA */ if (cli_use_monitor_option(&options)) { if (!IS_EMPTY_STRING_BUFFER(options.pgSetup.pgdata)) { log_warn("Given --monitor URI, the --pgdata option is ignored"); log_info("Connecting to monitor at \"%s\"", options.monitor_pguri); } } else { cli_common_get_set_pgdata_or_exit(&(options.pgSetup)); } } /* when --pgdata is given, still initialise our pathnames */ if (!IS_EMPTY_STRING_BUFFER(options.pgSetup.pgdata)) { if (!keeper_config_set_pathnames_from_pgdata(&(options.pathnames), options.pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } } /* ensure --formation, or get it from the configuration file */ if (!cli_common_ensure_formation(&options)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } keeperOptions = options; return optind; } /* * keeper_cli_monitor_print_events prints the list of the most recent events * known to the monitor. 
*/ static void cli_show_events(int argc, char **argv) { KeeperConfig config = keeperOptions; Monitor monitor = { 0 }; if (watch) { WatchContext context = { 0 }; (void) cli_monitor_init_from_option_or_config(&(context.monitor), &config); strlcpy(context.formation, config.formation, sizeof(context.formation)); context.groupId = config.groupId; (void) cli_watch_main_loop(&context); exit(EXIT_CODE_QUIT); } (void) cli_monitor_init_from_option_or_config(&monitor, &config); if (outputJSON) { if (!monitor_print_last_events_as_json(&monitor, config.formation, config.groupId, eventCount, stdout)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } } else { if (!monitor_print_last_events(&monitor, config.formation, config.groupId, eventCount)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } } } /* * keeper_cli_monitor_print_state prints the current state of given formation * and port from the monitor's point of view. */ static void cli_show_state(int argc, char **argv) { KeeperConfig config = keeperOptions; Monitor monitor = { 0 }; if (localState) { (void) cli_show_local_state(); exit(EXIT_CODE_QUIT); } /* * When dealing with a keeper node with a disabled monitor, we force the * --local option. 
*/ if (!IS_EMPTY_STRING_BUFFER(config.pgSetup.pgdata) && ProbeConfigurationFileRole(config.pathnames.config) == PG_AUTOCTL_ROLE_KEEPER) { bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; if (!keeper_config_read_file(&config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (config.monitorDisabled) { log_info("Monitor is disabled, showing --local state"); (void) cli_show_local_state(); exit(EXIT_CODE_QUIT); } } if (watch) { WatchContext context = { 0 }; (void) cli_monitor_init_from_option_or_config(&(context.monitor), &config); strlcpy(context.formation, config.formation, sizeof(context.formation)); context.groupId = config.groupId; (void) cli_watch_main_loop(&context); exit(EXIT_CODE_QUIT); } (void) cli_monitor_init_from_option_or_config(&monitor, &config); if (outputJSON) { if (!monitor_print_state_as_json(&monitor, config.formation, config.groupId)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } } else { if (!monitor_print_state(&monitor, config.formation, config.groupId)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } } } /* * cli_show_local_state implements pg_autoctl show state --local, which * composes the state from what we have in the configuration file and the state * file for the local (keeper) node. 
*/ static void cli_show_local_state() { KeeperConfig config = keeperOptions; int optionGroupId = keeperOptions.groupId; switch (ProbeConfigurationFileRole(config.pathnames.config)) { case PG_AUTOCTL_ROLE_MONITOR: { log_error("pg_autoctl show state --local is not supported " "on a monitor"); exit(EXIT_CODE_MONITOR); } case PG_AUTOCTL_ROLE_KEEPER: { bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; Keeper keeper = { 0 }; CurrentNodeState nodeState = { 0 }; if (!keeper_config_read_file(&config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (!keeper_init(&keeper, &config)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } /* ensure that --group makes sense then */ if (optionGroupId != -1 && config.groupId != optionGroupId) { log_error("--group %d does not match this node's group: %d", optionGroupId, config.groupId); exit(EXIT_CODE_BAD_CONFIG); } /* build the CurrentNodeState from pieces */ nodeState.node.nodeId = keeper.state.current_node_id; strlcpy(nodeState.node.name, config.name, _POSIX_HOST_NAME_MAX); strlcpy(nodeState.node.host, config.hostname, _POSIX_HOST_NAME_MAX); nodeState.node.port = config.pgSetup.pgport; strlcpy(nodeState.formation, config.formation, NAMEDATALEN); nodeState.groupId = config.groupId; nodeState.reportedState = keeper.state.current_role; nodeState.goalState = keeper.state.assigned_role; if (pg_setup_is_ready(&(config.pgSetup), pgIsNotRunningIsOk)) { if (!pgsql_get_postgres_metadata( &(keeper.postgres.sqlClient), &(keeper.postgres.postgresSetup.is_in_recovery), keeper.postgres.pgsrSyncState, keeper.postgres.currentLSN, &(keeper.postgres.postgresSetup.control))) { log_warn("Failed to update the local Postgres metadata"); strlcpy(nodeState.node.lsn, "0/0", PG_LSN_MAXLENGTH); } nodeState.node.tli = keeper.postgres.postgresSetup.control.timeline_id; strlcpy(nodeState.node.lsn, 
keeper.postgres.currentLSN, PG_LSN_MAXLENGTH); } else { /* also grab the minimum recovery LSN if that's possible */ if (!pg_controldata(&(config.pgSetup), missingPgdataIsOk)) { /* errors have already been logged, just continue */ } nodeState.node.tli = config.pgSetup.control.timeline_id; strlcpy(nodeState.node.lsn, config.pgSetup.control.latestCheckpointLSN, PG_LSN_MAXLENGTH); } /* we have no idea, only the monitor knows, so report "unknown" */ nodeState.health = -1; if (outputJSON) { JSON_Value *js = json_value_init_object(); if (!nodestateAsJSON(&nodeState, js)) { /* can't happen */ exit(EXIT_CODE_INTERNAL_ERROR); } (void) cli_pprint_json(js); } else { NodeAddressHeaders headers = { 0 }; headers.nodeKind = keeper.config.pgSetup.pgKind; (void) nodestateAdjustHeaders(&headers, &(nodeState.node), nodeState.groupId); (void) prepareHeaderSeparators(&headers); (void) nodestatePrintHeader(&headers); (void) nodestatePrintNodeState(&headers, &nodeState); fformat(stdout, "\n"); } break; } default: { log_fatal("Unrecognized configuration file \"%s\"", config.pathnames.config); exit(EXIT_CODE_BAD_CONFIG); } } } /* * cli_show_nodes_getopts parses the command line options for the * command `pg_autoctl show nodes`. 
*/ static int cli_show_standby_names_getopts(int argc, char **argv) { KeeperConfig options = { 0 }; int c, option_index = 0, errors = 0; int verboseCount = 0; static struct option long_options[] = { { "pgdata", required_argument, NULL, 'D' }, { "monitor", required_argument, NULL, 'm' }, { "formation", required_argument, NULL, 'f' }, { "group", required_argument, NULL, 'g' }, { "json", no_argument, NULL, 'J' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; /* set default values for our options, when we have some */ options.groupId = -1; options.network_partition_timeout = -1; options.prepare_promotion_catchup = -1; options.prepare_promotion_walreceiver = -1; options.postgresql_restart_failure_timeout = -1; options.postgresql_restart_failure_max_retries = -1; /* do not set a default formation, it should be found in the config file */ optind = 0; while ((c = getopt_long(argc, argv, "D:f:g:n:Vvqh", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'm': { if (!validate_connection_string(optarg)) { log_fatal("Failed to parse --monitor connection string, " "see above for details."); exit(EXIT_CODE_BAD_ARGS); } strlcpy(options.monitor_pguri, optarg, MAXCONNINFO); log_trace("--monitor %s", options.monitor_pguri); break; } case 'f': { strlcpy(options.formation, optarg, NAMEDATALEN); log_trace("--formation %s", options.formation); break; } case 'g': { if (!stringToInt(optarg, &options.groupId)) { log_fatal("--group argument is not a valid group ID: \"%s\"", optarg); exit(EXIT_CODE_BAD_ARGS); } log_trace("--group %d", options.groupId); break; } case 'V': { /* keeper_cli_print_version prints version and exits. 
*/ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } case 'J': { outputJSON = true; log_trace("--json"); break; } default: { /* getopt_long already wrote an error message */ errors++; } } } if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } /* when we have a monitor URI we don't need PGDATA */ if (cli_use_monitor_option(&options)) { if (!IS_EMPTY_STRING_BUFFER(options.pgSetup.pgdata)) { log_warn("Given --monitor URI, the --pgdata option is ignored"); log_info("Connecting to monitor at \"%s\"", options.monitor_pguri); } } else { cli_common_get_set_pgdata_or_exit(&(options.pgSetup)); } /* when --pgdata is given, still initialise our pathnames */ if (!IS_EMPTY_STRING_BUFFER(options.pgSetup.pgdata)) { if (!keeper_config_set_pathnames_from_pgdata(&(options.pathnames), options.pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } } /* ensure --formation, or get it from the configuration file */ if (!cli_common_ensure_formation(&options)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } keeperOptions = options; return optind; } /* * cli_show_standby_names prints the synchronous_standby_names setting value * for a given group (in a known formation). 
*/ static void cli_show_standby_names(int argc, char **argv) { KeeperConfig config = keeperOptions; Monitor monitor = { 0 }; char synchronous_standby_names[BUFSIZE] = { 0 }; (void) cli_monitor_init_from_option_or_config(&monitor, &config); (void) cli_set_groupId(&monitor, &config); if (!monitor_synchronous_standby_names( &monitor, config.formation, config.groupId, synchronous_standby_names, BUFSIZE)) { log_fatal("Failed to get the synchronous_standby_names setting value " " from the monitor, see above for details"); exit(EXIT_CODE_MONITOR); } if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Object *jsObj = json_value_get_object(js); json_object_set_string(jsObj, "formation", config.formation); json_object_set_number(jsObj, "group", (double) config.groupId); json_object_set_string(jsObj, "synchronous_standby_names", synchronous_standby_names); (void) cli_pprint_json(js); } else { /* current synchronous_standby_names might be an empty string */ (void) fformat(stdout, "'%s'\n", synchronous_standby_names); } } /* * keeper_show_uri_getopts parses the command line options for the * command `pg_autoctl show uri`. 
*/ static int cli_show_uri_getopts(int argc, char **argv) { KeeperConfig options = { 0 }; int c, option_index = 0; int verboseCount = 0; static struct option long_options[] = { { "pgdata", required_argument, NULL, 'D' }, { "monitor", required_argument, NULL, 'm' }, { "formation", required_argument, NULL, 'f' }, { "citus-cluster", required_argument, NULL, 'Z' }, { "json", no_argument, NULL, 'J' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; /* set default values for our options, when we have some */ options.groupId = -1; options.network_partition_timeout = -1; options.prepare_promotion_catchup = -1; options.prepare_promotion_walreceiver = -1; options.postgresql_restart_failure_timeout = -1; options.postgresql_restart_failure_max_retries = -1; optind = 0; while ((c = getopt_long(argc, argv, "D:Vvqh", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'm': { if (!validate_connection_string(optarg)) { log_fatal("Failed to parse --monitor connection string, " "see above for details."); exit(EXIT_CODE_BAD_ARGS); } strlcpy(options.monitor_pguri, optarg, MAXCONNINFO); log_trace("--monitor %s", options.monitor_pguri); break; } case 'f': { strlcpy(showUriOptions.formation, optarg, NAMEDATALEN); log_trace("--formation %s", showUriOptions.formation); if (strcmp(showUriOptions.formation, "monitor") == 0) { showUriOptions.monitorOnly = true; } break; } case 'Z': { strlcpy(showUriOptions.citusClusterName, optarg, NAMEDATALEN); log_trace("--citus-cluster %s", showUriOptions.citusClusterName); break; } case 'V': { /* keeper_cli_print_version prints version and exits. 
*/ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } case 'J': { outputJSON = true; log_trace("--json"); break; } default: { log_error("Failed to parse command line, see above for details."); commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); break; } } } /* when we have a monitor URI we don't need PGDATA */ if (cli_use_monitor_option(&options)) { if (!IS_EMPTY_STRING_BUFFER(options.pgSetup.pgdata)) { log_warn("Given --monitor URI, the --pgdata option is ignored"); log_info("Connecting to monitor at \"%s\"", options.monitor_pguri); } } else { cli_common_get_set_pgdata_or_exit(&(options.pgSetup)); if (!keeper_config_set_pathnames_from_pgdata(&(options.pathnames), options.pgSetup.pgdata)) { if (!keeper_config_set_pathnames_from_pgdata(&(options.pathnames), options.pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } } } /* * When --citus-cluster is used, but not --formation, then we assume * --formation default */ if (!IS_EMPTY_STRING_BUFFER(showUriOptions.citusClusterName) && IS_EMPTY_STRING_BUFFER(showUriOptions.formation)) { strlcpy(showUriOptions.formation, FORMATION_DEFAULT, NAMEDATALEN); } /* use "default" citus cluster name when user didn't provide it */ if (IS_EMPTY_STRING_BUFFER(showUriOptions.citusClusterName)) { strlcpy(showUriOptions.citusClusterName, DEFAULT_CITUS_CLUSTER_NAME, NAMEDATALEN); } keeperOptions = options; return optind; } /* * cli_show_uri_monitor_init_from_config initialises a Monitor instance so that * we can connect to the monitor and grab information from there. The * KeeperConfig instance might belong to a monitor node or to a keeper role. 
* * The SSLOptions are read from the configuration file and used to compute the * target connection strings. */ static void cli_show_uri_monitor_init_from_config(KeeperConfig *kconfig, Monitor *monitor, SSLOptions *ssl) { switch (ProbeConfigurationFileRole(kconfig->pathnames.config)) { case PG_AUTOCTL_ROLE_MONITOR: { MonitorConfig mconfig = { 0 }; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; char connInfo[MAXCONNINFO]; if (!monitor_config_init_from_pgsetup(&mconfig, &(kconfig->pgSetup), missingPgdataIsOk, pgIsNotRunningIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_PGCTL); } if (!monitor_config_get_postgres_uri(&mconfig, connInfo, MAXCONNINFO)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (!monitor_init(monitor, connInfo)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } *ssl = mconfig.pgSetup.ssl; break; } case PG_AUTOCTL_ROLE_KEEPER: { bool monitorDisabledIsOk = false; if (!keeper_config_read_file_skip_pgsetup(kconfig, monitorDisabledIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (!monitor_init(monitor, kconfig->monitor_pguri)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } *ssl = kconfig->pgSetup.ssl; break; } default: { log_fatal("Unrecognized configuration file \"%s\"", kconfig->pathnames.config); exit(EXIT_CODE_INTERNAL_ERROR); } } } /* * cli_show_uri prints the URI to connect to with psql. */ static void cli_show_uri(int argc, char **argv) { KeeperConfig kconfig = keeperOptions; Monitor monitor = { 0 }; SSLOptions ssl = { 0 }; /* * We are given either --monitor postgres://uri or --pgdata; in the first * case we just connect to that URI, in the second case we read the monitor * URI's from the local configuration file. 
*/ if (!IS_EMPTY_STRING_BUFFER(kconfig.monitor_pguri)) { if (!monitor_init(&monitor, kconfig.monitor_pguri)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } if (!parse_pguri_ssl_settings(kconfig.monitor_pguri, &ssl)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } } else { /* read the monitor URI from the configuration file */ (void) cli_show_uri_monitor_init_from_config(&kconfig, &monitor, &ssl); } if (showUriOptions.monitorOnly) { (void) print_monitor_uri(&monitor, stdout); } else if (!IS_EMPTY_STRING_BUFFER(showUriOptions.formation)) { (void) print_formation_uri(&ssl, &monitor, showUriOptions.formation, showUriOptions.citusClusterName, stdout); } else { (void) print_all_uri(&ssl, &monitor, stdout); } } /* * print_monitor_uri shows the connection strings for the monitor and all * formations managed by it */ static void print_monitor_uri(Monitor *monitor, FILE *stream) { if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Object *jsObj = json_value_get_object(js); json_object_set_string(jsObj, "monitor", monitor->pgsql.connectionString); (void) cli_pprint_json(js); } else { fformat(stdout, "%s\n", monitor->pgsql.connectionString); } } /* * print_formation_uri connects to given monitor to fetch the * keeper configuration formation's URI, and prints it out on given stream. It * is printed in JSON format when outputJSON is true (--json options). 
*/ static void print_formation_uri(SSLOptions *ssl, Monitor *monitor, const char *formation, const char *citusClusterName, FILE *stream) { char postgresUri[MAXCONNINFO]; if (!monitor_formation_uri(monitor, formation, citusClusterName, ssl, postgresUri, MAXCONNINFO)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Object *jsObj = json_value_get_object(js); json_object_set_string(jsObj, "monitor", monitor->pgsql.connectionString); json_object_set_string(jsObj, formation, postgresUri); (void) cli_pprint_json(js); } else { fformat(stdout, "%s\n", postgresUri); } } /* * print_all_uri prints the connection strings for the monitor and all * formations managed by it */ static void print_all_uri(SSLOptions *ssl, Monitor *monitor, FILE *stream) { if (outputJSON) { if (!monitor_print_every_formation_uri_as_json(monitor, ssl, stdout)) { log_fatal("Failed to get the list of formation URIs"); exit(EXIT_CODE_MONITOR); } } else { if (!monitor_print_every_formation_uri(monitor, ssl)) { log_fatal("Failed to get the list of formation URIs"); exit(EXIT_CODE_MONITOR); } } } /* * cli_show_file_getopts parses the command line options for the * command `pg_autoctl show file`. 
*/ static int cli_show_file_getopts(int argc, char **argv) { KeeperConfig options = { 0 }; ShowFileOptions fileOptions = { 0 }; int c, option_index = 0; int verboseCount = 0; static struct option long_options[] = { { "pgdata", required_argument, NULL, 'D' }, { "all", no_argument, NULL, 'a' }, { "config", no_argument, NULL, 'c' }, { "state", no_argument, NULL, 's' }, { "init", no_argument, NULL, 'i' }, { "pid", no_argument, NULL, 'p' }, { "contents", no_argument, NULL, 'C' }, { "json", no_argument, NULL, 'J' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; optind = 0; while ((c = getopt_long(argc, argv, "D:acsipCVvqh", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'C': { fileOptions.showFileContents = true; if (fileOptions.selection == SHOW_FILE_ALL) { log_warn("Ignoring option --content with --all"); } break; } case 'a': { fileOptions.selection = SHOW_FILE_ALL; if (fileOptions.showFileContents) { log_warn("Ignoring option --content with --all"); } break; } case 'c': { if (fileOptions.selection != SHOW_FILE_UNKNOWN && fileOptions.selection != SHOW_FILE_CONFIG) { log_error( "Please use only one of --config --state --init --pid"); commandline_help(stderr); } fileOptions.selection = SHOW_FILE_CONFIG; log_trace("--config"); break; } case 's': { if (fileOptions.selection != SHOW_FILE_UNKNOWN && fileOptions.selection != SHOW_FILE_STATE) { log_error( "Please use only one of --config --state --init --pid"); commandline_help(stderr); } fileOptions.selection = SHOW_FILE_STATE; log_trace("--state"); break; } case 'i': { if (fileOptions.selection != SHOW_FILE_UNKNOWN && fileOptions.selection != SHOW_FILE_INIT) { log_error( "Please use only one of --config --state --init --pid"); commandline_help(stderr); } 
fileOptions.selection = SHOW_FILE_INIT; log_trace("--init"); break; } case 'p': { if (fileOptions.selection != SHOW_FILE_UNKNOWN && fileOptions.selection != SHOW_FILE_PID) { log_error( "Please use only one of --config --state --init --pid"); commandline_help(stderr); } fileOptions.selection = SHOW_FILE_PID; log_trace("--pid"); break; } case 'J': { outputJSON = true; log_trace("--json"); break; } case 'V': { /* keeper_cli_print_version prints version and exits. */ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } default: { log_error("Failed to parse command line, see above for details."); commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); break; } } } cli_common_get_set_pgdata_or_exit(&(options.pgSetup)); if (!keeper_config_set_pathnames_from_pgdata(&options.pathnames, options.pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } /* default to --all when no option has been selected */ if (fileOptions.selection == SHOW_FILE_UNKNOWN) { fileOptions.selection = SHOW_FILE_ALL; } keeperOptions = options; showFileOptions = fileOptions; return optind; } /* * cli_show_files lists the files used by pg_autoctl. 
*/ static void cli_show_file(int argc, char **argv) { KeeperConfig config = keeperOptions; pgAutoCtlNodeRole role = ProbeConfigurationFileRole(config.pathnames.config); switch (showFileOptions.selection) { case SHOW_FILE_ALL: { if (outputJSON) { JSON_Value *js = json_value_init_object(); JSON_Object *root = json_value_get_object(js); json_object_set_string(root, "config", config.pathnames.config); if (role == PG_AUTOCTL_ROLE_KEEPER) { json_object_set_string(root, "state", config.pathnames.state); json_object_set_string(root, "init", config.pathnames.init); } json_object_set_string(root, "pid", config.pathnames.pid); char *serialized_string = json_serialize_to_string_pretty(js); fformat(stdout, "%s\n", serialized_string); json_free_serialized_string(serialized_string); json_value_free(js); } else { fformat(stdout, "%7s | %s\n", "File", "Path"); fformat(stdout, "%7s-+-%15s\n", "-------", "---------------"); fformat(stdout, "%7s | %s\n", "Config", config.pathnames.config); if (role == PG_AUTOCTL_ROLE_KEEPER) { fformat(stdout, "%7s | %s\n", "State", config.pathnames.state); fformat(stdout, "%7s | %s\n", "Init", config.pathnames.init); } fformat(stdout, "%7s | %s\n", "Pid", config.pathnames.pid); fformat(stdout, "\n"); } break; } case SHOW_FILE_CONFIG: { if (showFileOptions.showFileContents) { if (outputJSON) { JSON_Value *js = json_value_init_object(); const bool missingPgdataIsOk = true; const bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; switch (role) { case PG_AUTOCTL_ROLE_MONITOR: { MonitorConfig mconfig = { 0 }; mconfig.pathnames = config.pathnames; if (!monitor_config_read_file(&mconfig, missingPgdataIsOk, pgIsNotRunningIsOk)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (!monitor_config_to_json(&mconfig, js)) { log_fatal( "Failed to serialize configuration to JSON"); exit(EXIT_CODE_BAD_CONFIG); } break; } case PG_AUTOCTL_ROLE_KEEPER: { if (!keeper_config_read_file(&config, missingPgdataIsOk, pgIsNotRunningIsOk, 
monitorDisabledIsOk)) { exit(EXIT_CODE_BAD_CONFIG); } if (!keeper_config_to_json(&config, js)) { log_fatal( "Failed to serialize configuration to JSON"); exit(EXIT_CODE_BAD_CONFIG); } break; } case PG_AUTOCTL_ROLE_UNKNOWN: { log_fatal("Unknown node role %d", role); exit(EXIT_CODE_BAD_CONFIG); } } /* we have the config as a JSON object, print it out now */ (void) cli_pprint_json(js); } else if (!fprint_file_contents(config.pathnames.config)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } } else { fformat(stdout, "%s\n", config.pathnames.config); } break; } case SHOW_FILE_STATE: { if (role == PG_AUTOCTL_ROLE_MONITOR) { log_error("A monitor has not state file"); exit(EXIT_CODE_BAD_ARGS); } if (showFileOptions.showFileContents) { KeeperStateData keeperState = { 0 }; if (keeper_state_read(&keeperState, config.pathnames.state)) { if (outputJSON) { JSON_Value *js = json_value_init_object(); keeperStateAsJSON(&keeperState, js); (void) cli_pprint_json(js); } else { (void) print_keeper_state(&keeperState, stdout); } } else { /* errors have already been logged */ exit(EXIT_CODE_BAD_STATE); } } else { fformat(stdout, "%s\n", config.pathnames.state); } break; } case SHOW_FILE_INIT: { if (role == PG_AUTOCTL_ROLE_MONITOR) { log_error("A monitor has not init state file"); exit(EXIT_CODE_BAD_ARGS); } if (showFileOptions.showFileContents) { Keeper keeper = { 0 }; keeper.config = config; if (keeper_init_state_read(&(keeper.initState), config.pathnames.init)) { (void) print_keeper_init_state(&(keeper.initState), stdout); } else { /* errors have already been logged */ exit(EXIT_CODE_BAD_STATE); } } else { fformat(stdout, "%s\n", config.pathnames.init); } break; } case SHOW_FILE_PID: { if (showFileOptions.showFileContents) { if (outputJSON) { JSON_Value *js = json_value_init_object(); bool includeStatus = false; (void) pidfile_as_json(js, config.pathnames.pid, includeStatus); (void) cli_pprint_json(js); } else { if (!fprint_file_contents(config.pathnames.pid)) { 
/* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } } } else { fformat(stdout, "%s\n", config.pathnames.pid); } break; } default: { log_fatal("Unrecognized configuration file \"%s\"", config.pathnames.config); exit(EXIT_CODE_INTERNAL_ERROR); } } } /* * fprint_file_contents prints the content of the given filename to stdout. */ static bool fprint_file_contents(const char *filename) { char *contents = NULL; long size = 0L; if (read_file(filename, &contents, &size)) { fformat(stdout, "%s\n", contents); free(contents); return true; } else { /* errors have already been logged */ return false; } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_systemd.c000066400000000000000000000074141414244367200232460ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_do_systemd.c * Implementation of a CLI which lets you run operations on the local * postgres server directly. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include "postgres_fe.h" #include "cli_common.h" #include "cli_do_root.h" #include "commandline.h" #include "env_utils.h" #include "config.h" #include "defaults.h" #include "file_utils.h" #include "keeper_config.h" #include "keeper.h" #include "systemd_config.h" static SystemdServiceConfig systemdOptions; static int cli_systemd_getopt(int argc, char **argv); static void cli_systemd_cat_service_file(int argc, char **argv); /* pg_autoctl show systemd, see cli_show.c */ CommandLine systemd_cat_service_file_command = make_command("systemd", "Print systemd service file for this node", "", "", cli_systemd_getopt, cli_systemd_cat_service_file); /* * cli_systemd_getopt parses the command line options necessary to handle * systemd integration for the pg_autoctl keeper service. 
*/ int cli_systemd_getopt(int argc, char **argv) { SystemdServiceConfig options = { 0 }; int c = 0, option_index = 0, errors = 0; int verboseCount = 0; static struct option long_options[] = { { "pgdata", required_argument, NULL, 'D' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; optind = 0; while ((c = getopt_long(argc, argv, "D:Vvqh", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'V': { /* keeper_cli_print_version prints version and exits. */ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } default: { /* getopt_long already wrote an error message */ errors++; break; } } } if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } cli_common_get_set_pgdata_or_exit(&(options.pgSetup)); if (!pg_setup_set_absolute_pgdata(&(options.pgSetup))) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } if (!keeper_config_set_pathnames_from_pgdata(&options.pathnames, options.pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } /* publish our option parsing in the global variable */ systemdOptions = options; return optind; } /* * cli_systemd_cat_service_file prints the systemd service file for this * pg_autoctl node. 
*/ static void cli_systemd_cat_service_file(int argc, char **argv) { SystemdServiceConfig config = systemdOptions; PostgresSetup pgSetup = { 0 }; (void) systemd_config_init(&config, pgSetup.pgdata); log_info("HINT: to complete a systemd integration, " "run the following commands (as root):"); log_info("pg_autoctl -q show systemd --pgdata \"%s\" | tee %s", config.pgSetup.pgdata, config.pathnames.systemd); log_info("systemctl daemon-reload"); log_info("systemctl enable pgautofailover"); log_info("systemctl start pgautofailover"); if (!systemd_config_write(stdout, &config)) { exit(EXIT_CODE_INTERNAL_ERROR); } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/cli_watch.c000066400000000000000000000126371414244367200226670ustar00rootroot00000000000000/* * src/bin/pg_autoctl/cli_watch.c * Implementation of a CLI to show events, states, and URI from the * pg_auto_failover monitor. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include #include #include #include "postgres_fe.h" #include "cli_common.h" #include "commandline.h" #include "defaults.h" #include "env_utils.h" #include "ipaddr.h" #include "keeper_config.h" #include "keeper.h" #include "monitor_config.h" #include "monitor_pg_init.h" #include "monitor.h" #include "nodestate_utils.h" #include "parsing.h" #include "pgctl.h" #include "pghba.h" #include "pgsetup.h" #include "pgsql.h" #include "pidfile.h" #include "state.h" #include "string_utils.h" #include "watch.h" static int cli_watch_getopts(int argc, char **argv); static void cli_watch(int argc, char **argv); CommandLine watch_command = make_command("watch", "Display a dashboard to watch monitor's events and state", " [ --pgdata --formation --group ] ", " --pgdata path to data directory \n" " --monitor show the monitor uri\n" " --formation formation to query, defaults to 'default' \n" " --group group to query formation, defaults to all \n" " --json output data in the JSON 
format\n", cli_watch_getopts, cli_watch); static int cli_watch_getopts(int argc, char **argv) { KeeperConfig options = { 0 }; int c, option_index = 0, errors = 0; int verboseCount = 0; static struct option long_options[] = { { "pgdata", required_argument, NULL, 'D' }, { "monitor", required_argument, NULL, 'm' }, { "formation", required_argument, NULL, 'f' }, { "group", required_argument, NULL, 'g' }, { "version", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; /* set default values for our options, when we have some */ options.groupId = -1; options.network_partition_timeout = -1; options.prepare_promotion_catchup = -1; options.prepare_promotion_walreceiver = -1; options.postgresql_restart_failure_timeout = -1; options.postgresql_restart_failure_max_retries = -1; optind = 0; while ((c = getopt_long(argc, argv, "D:f:g:n:Vvqh", long_options, &option_index)) != -1) { switch (c) { case 'D': { strlcpy(options.pgSetup.pgdata, optarg, MAXPGPATH); log_trace("--pgdata %s", options.pgSetup.pgdata); break; } case 'm': { if (!validate_connection_string(optarg)) { log_fatal("Failed to parse --monitor connection string, " "see above for details."); exit(EXIT_CODE_BAD_ARGS); } strlcpy(options.monitor_pguri, optarg, MAXCONNINFO); log_trace("--monitor %s", options.monitor_pguri); break; } case 'f': { strlcpy(options.formation, optarg, NAMEDATALEN); log_trace("--formation %s", options.formation); break; } case 'g': { if (!stringToInt(optarg, &options.groupId)) { log_fatal("--group argument is not a valid group ID: \"%s\"", optarg); exit(EXIT_CODE_BAD_ARGS); } log_trace("--group %d", options.groupId); break; } case 'V': { /* keeper_cli_print_version prints version and exits. 
*/ keeper_cli_print_version(argc, argv); break; } case 'v': { ++verboseCount; switch (verboseCount) { case 1: { log_set_level(LOG_INFO); break; } case 2: { log_set_level(LOG_DEBUG); break; } default: { log_set_level(LOG_TRACE); break; } } break; } case 'q': { log_set_level(LOG_ERROR); break; } case 'h': { commandline_help(stderr); exit(EXIT_CODE_QUIT); break; } default: { /* getopt_long already wrote an error message */ errors++; } } } if (errors > 0) { commandline_help(stderr); exit(EXIT_CODE_BAD_ARGS); } /* when we have a monitor URI we don't need PGDATA */ if (cli_use_monitor_option(&options)) { if (!IS_EMPTY_STRING_BUFFER(options.pgSetup.pgdata)) { log_warn("Given --monitor URI, the --pgdata option is ignored"); log_info("Connecting to monitor at \"%s\"", options.monitor_pguri); } } else { cli_common_get_set_pgdata_or_exit(&(options.pgSetup)); } /* when --pgdata is given, still initialise our pathnames */ if (!IS_EMPTY_STRING_BUFFER(options.pgSetup.pgdata)) { if (!keeper_config_set_pathnames_from_pgdata(&(options.pathnames), options.pgSetup.pgdata)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } } /* ensure --formation, or get it from the configuration file */ if (!cli_common_ensure_formation(&options)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_ARGS); } keeperOptions = options; return optind; } /* * cli_watch starts a ncurses dashboard that displays relevant information * about a running formation at a given monitor. 
*/ static void cli_watch(int argc, char **argv) { WatchContext context = { 0 }; KeeperConfig config = keeperOptions; (void) cli_monitor_init_from_option_or_config(&(context.monitor), &config); strlcpy(context.formation, config.formation, sizeof(context.formation)); context.groupId = config.groupId; (void) cli_watch_main_loop(&context); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/config.c000066400000000000000000000227521414244367200221760ustar00rootroot00000000000000/* * src/bin/pg_autoctl/config.c * Common configuration functions * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include "postgres_fe.h" #include "config.h" #include "defaults.h" #include "env_utils.h" #include "file_utils.h" #include "ini_file.h" #include "keeper.h" #include "keeper_config.h" #include "log.h" #include "pgctl.h" /* * build_xdg_path is an helper function that builds the full path to an XDG * compatible resource: either a configuration file, a runtime file, or a data * file. 
*/ bool build_xdg_path(char *dst, XDGResourceType xdgType, const char *pgdata, const char *name) { char filename[MAXPGPATH]; char home[MAXPGPATH]; char fallback[MAXPGPATH]; char xdg_topdir[MAXPGPATH]; char *envVarName = NULL; if (!get_env_copy("HOME", home, MAXPGPATH)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } switch (xdgType) { case XDG_DATA: { join_path_components(fallback, home, ".local/share"); envVarName = "XDG_DATA_HOME"; break; } case XDG_CONFIG: { join_path_components(fallback, home, ".config"); envVarName = "XDG_CONFIG_HOME"; break; } case XDG_RUNTIME: { strlcpy(fallback, "/tmp", MAXPGPATH); envVarName = "XDG_RUNTIME_DIR"; break; } default: /* developper error */ log_error("No support for XDG Resource Type %d", xdgType); return false; } if (!get_env_copy_with_fallback(envVarName, xdg_topdir, MAXPGPATH, fallback)) { /* errors have already been logged */ return false; } if (xdgType == XDG_RUNTIME && !directory_exists(xdg_topdir)) { strlcpy(xdg_topdir, "/tmp", MAXPGPATH); } join_path_components(filename, xdg_topdir, "pg_autoctl"); /* append PGDATA now */ if (pgdata[0] == '/') { /* skip the first / to avoid having a double-slash in the name */ join_path_components(filename, filename, pgdata + 1); } else { /* * We have a relative pathname to PGDATA, and we want an absolute * pathname in our configuration directory name so that we make * sure to find it again. * * It could be that the PGDATA directory we are given doesn't exist * yet, precluding the use of realpath(3) to get the absolute name * here. 
*/ char currentWorkingDirectory[MAXPGPATH] = { 0 }; if (getcwd(currentWorkingDirectory, MAXPGPATH) == NULL) { log_error("Failed to get the current working directory: %m"); return false; } /* avoid double-slash by skipping the first one */ join_path_components(filename, filename, currentWorkingDirectory + 1); /* now add in pgdata */ join_path_components(filename, filename, pgdata); } /* mkdir -p the target directory */ if (pg_mkdir_p(filename, 0755) == -1) { log_error("Failed to create state directory \"%s\": %m", filename); return false; } /* normalize the existing path to the configuration file */ if (!normalize_filename(filename, dst, MAXPGPATH)) { /* errors have already been logged */ return false; } /* and finally add the configuration file name */ join_path_components(dst, dst, name); return true; } /* * SetConfigFilePath sets config.pathnames.config from config.pgSetup.pgdata, * which must have been set previously. */ bool SetConfigFilePath(ConfigFilePaths *pathnames, const char *pgdata) { /* don't overwrite already computed value */ if (IS_EMPTY_STRING_BUFFER(pathnames->config)) { if (!build_xdg_path(pathnames->config, XDG_CONFIG, pgdata, KEEPER_CONFIGURATION_FILENAME)) { log_error("Failed to build our configuration file pathname, " "see above."); return false; } } log_trace("SetConfigFilePath: \"%s\"", pathnames->config); return true; } /* * SetStateFilePath sets config.pathnames.state from our PGDATA value, and * using the XDG Base Directory Specification for a data file. 
Per specs at: * * https://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html * */ bool SetStateFilePath(ConfigFilePaths *pathnames, const char *pgdata) { if (IS_EMPTY_STRING_BUFFER(pathnames->state)) { if (!build_xdg_path(pathnames->state, XDG_DATA, pgdata, KEEPER_STATE_FILENAME)) { log_error("Failed to build pg_autoctl state file pathname, " "see above."); return false; } } log_trace("SetStateFilePath: \"%s\"", pathnames->state); /* now the init state file */ if (IS_EMPTY_STRING_BUFFER(pathnames->init)) { if (!build_xdg_path(pathnames->init, XDG_DATA, pgdata, KEEPER_INIT_STATE_FILENAME)) { log_error("Failed to build pg_autoctl init state file pathname, " "see above."); return false; } } log_trace("SetKeeperStateFilePath: \"%s\"", pathnames->init); return true; } /* * SetNodesFilePath sets config.pathnames.nodes from our PGDATA value, and * using the XDG Base Directory Specification for a data file. Per specs at: * * https://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html * */ bool SetNodesFilePath(ConfigFilePaths *pathnames, const char *pgdata) { if (IS_EMPTY_STRING_BUFFER(pathnames->nodes)) { if (!build_xdg_path(pathnames->nodes, XDG_DATA, pgdata, KEEPER_NODES_FILENAME)) { log_error("Failed to build pg_autoctl state file pathname, " "see above."); return false; } } log_trace("SetNodesFilePath: \"%s\"", pathnames->nodes); return true; } /* * SetPidFilePath sets config.pathnames.pidfile from our PGDATA value, and * using the XDG Base Directory Specification for a runtime file. 
*/ bool SetPidFilePath(ConfigFilePaths *pathnames, const char *pgdata) { if (IS_EMPTY_STRING_BUFFER(pathnames->pid)) { if (!build_xdg_path(pathnames->pid, XDG_RUNTIME, pgdata, KEEPER_PID_FILENAME)) { log_error("Failed to build pg_autoctl pid file pathname, " "see above."); return false; } } log_trace("SetPidFilePath: \"%s\"", pathnames->pid); return true; } /* * ProbeConfigurationFileRole opens a configuration file at given filename and * probes the pg_autoctl role it belongs to: either a monitor or a keeper. * * We use a IniOption array with a single entry here, the pg_autoctl.role * setting that indicates which role is our configuration file intended to be * read as: either "monitor" or "keeper". */ pgAutoCtlNodeRole ProbeConfigurationFileRole(const char *filename) { MinimalConfig config = { 0 }; IniOption configOptions[] = { make_strbuf_option("pg_autoctl", "role", NULL, true, NAMEDATALEN, config.role), INI_OPTION_LAST }; log_debug("Probing configuration file \"%s\"", filename); /* * There is a race condition at process startup where a configuration file * can disappear while being overwritten. Reduce the chances of that * happening by making more than one attempt at reading the file. 
*/ char *fileContents = NULL; for (int attempts = 0; fileContents == NULL && attempts < 3; attempts++) { long fileSize = 0L; if (read_file_if_exists(filename, &fileContents, &fileSize)) { break; } pg_usleep(100 * 1000); /* 100ms */ } if (fileContents == NULL) { log_error("Failed to read configuration file \"%s\"", filename); return PG_AUTOCTL_ROLE_UNKNOWN; } if (!parse_ini_buffer(filename, fileContents, configOptions)) { log_error("Failed to parse configuration file \"%s\"", filename); return PG_AUTOCTL_ROLE_UNKNOWN; } log_debug("ProbeConfigurationFileRole: %s", config.role); if (strcmp(config.role, MONITOR_ROLE) == 0) { return PG_AUTOCTL_ROLE_MONITOR; } else if (strcmp(config.role, KEEPER_ROLE) == 0) { return PG_AUTOCTL_ROLE_KEEPER; } else { log_fatal("Failed to recognize configuration file setting for " "pg_autoctl.role: \"%s\"", config.role); exit(EXIT_CODE_BAD_CONFIG); } /* can't happen: keep compiler happy */ return PG_AUTOCTL_ROLE_UNKNOWN; } /* * config_accept_new_ssloptions allows to reload SSL options at runtime. */ bool config_accept_new_ssloptions(PostgresSetup *pgSetup, PostgresSetup *newPgSetup) { if (pgSetup->ssl.active != newPgSetup->ssl.active) { log_info("Reloading configuration: ssl is now %s; used to be %s", newPgSetup->ssl.active ? "active" : "disabled", pgSetup->ssl.active ? 
"active" : "disabled"); } if (pgSetup->ssl.sslMode != newPgSetup->ssl.sslMode) { log_info("Reloading configuration: sslmode is now \"%s\"; " "used to be \"%s\"", pgsetup_sslmode_to_string(newPgSetup->ssl.sslMode), pgsetup_sslmode_to_string(pgSetup->ssl.sslMode)); } if (strneq(pgSetup->ssl.caFile, newPgSetup->ssl.caFile)) { log_info("Reloading configuration: ssl CA file is now \"%s\"; " "used to be \"%s\"", newPgSetup->ssl.caFile, pgSetup->ssl.caFile); } if (strneq(pgSetup->ssl.crlFile, newPgSetup->ssl.crlFile)) { log_info("Reloading configuration: ssl CRL file is now \"%s\"; " "used to be \"%s\"", newPgSetup->ssl.crlFile, pgSetup->ssl.crlFile); } if (strneq(pgSetup->ssl.serverCert, newPgSetup->ssl.serverCert)) { log_info("Reloading configuration: ssl server cert file is now \"%s\"; " "used to be \"%s\"", newPgSetup->ssl.serverCert, pgSetup->ssl.serverCert); } if (strneq(pgSetup->ssl.serverKey, newPgSetup->ssl.serverKey)) { log_info("Reloading configuration: ssl server key file is now \"%s\"; " "used to be \"%s\"", newPgSetup->ssl.serverKey, pgSetup->ssl.serverKey); } /* install the new SSL settings, wholesale */ pgSetup->ssl = newPgSetup->ssl; strlcpy(pgSetup->ssl.sslModeStr, pgsetup_sslmode_to_string(pgSetup->ssl.sslMode), SSL_MODE_STRLEN); return true; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/config.h000066400000000000000000000044721414244367200222020ustar00rootroot00000000000000/* * src/bin/pg_autoctl/config.h * Common configuration data structure and function definitions * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #ifndef CONFIG_H #define CONFIG_H #include #include #include "pgctl.h" #include "pgsql.h" #define KEEPER_ROLE "keeper" #define MONITOR_ROLE "monitor" typedef enum { PG_AUTOCTL_ROLE_UNKNOWN, PG_AUTOCTL_ROLE_MONITOR, PG_AUTOCTL_ROLE_KEEPER } pgAutoCtlNodeRole; typedef struct MinimalConfig { char role[NAMEDATALEN]; } MinimalConfig; typedef struct ConfigFilePaths { char config[MAXPGPATH]; /* ~/.config/pg_autoctl/${PGDATA}/pg_autoctl.cfg */ char state[MAXPGPATH]; /* ~/.local/share/pg_autoctl/${PGDATA}/pg_autoctl.state */ char pid[MAXPGPATH]; /* /tmp/${PGDATA}/pg_autoctl.pid */ char init[MAXPGPATH]; /* /tmp/${PGDATA}/pg_autoctl.init */ char nodes[MAXPGPATH]; /* ~/.local/share/pg_autoctl/${PGDATA}/nodes.json */ char systemd[MAXPGPATH]; /* ~/.config/systemd/user/pgautofailover.service */ } ConfigFilePaths; /* * We implement XDG Base Directory Specification (in parts), and the following * XDGResourceType makes it possible to make some decisions in the generic * build_xdg_path() helper function: * * - XDG_DATA resource uses XDG_DATA_HOME environment variable and defaults to * ${HOME}.local/share * * - XDG_CONFIG resource uses XDG_CONFIG_HOME environement variable and * defaults to ${HOME}/.config * * - XDG_CACHE and XDG_RUNTIME are not implemented yet. 
* * https://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html */ typedef enum { XDG_DATA, XDG_CONFIG, XDG_CACHE, XDG_RUNTIME } XDGResourceType; bool build_xdg_path(char *dst, XDGResourceType xdgType, const char *pgdata, const char *name); bool SetConfigFilePath(ConfigFilePaths *pathnames, const char *pgdata); bool SetStateFilePath(ConfigFilePaths *pathnames, const char *pgdata); bool SetNodesFilePath(ConfigFilePaths *pathnames, const char *pgdata); bool SetPidFilePath(ConfigFilePaths *pathnames, const char *pgdata); pgAutoCtlNodeRole ProbeConfigurationFileRole(const char *filename); #define strneq(x, y) \ ((x != NULL) && (y != NULL) && (strcmp(x, y) != 0)) bool config_accept_new_ssloptions(PostgresSetup *pgSetup, PostgresSetup *newPgSetup); #endif /* CONFIG_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/debian.c000066400000000000000000000450031414244367200221450ustar00rootroot00000000000000/* * src/bin/pg_autoctl/debian.c * * Debian specific code to support registering a pg_autoctl node from a * Postgres cluster created with pg_createcluster. We need to move the * configuration files back to PGDATA. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include "postgres_fe.h" #include "pqexpbuffer.h" #include "debian.h" #include "keeper.h" #include "keeper_config.h" #include "parsing.h" #define EDITED_BY_PG_AUTOCTL "# edited by pg_auto_failover \n" static bool debian_find_postgres_configuration_files(PostgresSetup *pgSetup, PostgresConfigFiles *pgConfigFiles); static bool debian_init_postgres_config_files(PostgresSetup *pgSetup, PostgresConfigFiles *pgConfFiles, PostgresConfigurationKind confKind); static bool buildDebianDataAndConfDirectoryNames(PostgresSetup *pgSetup, DebianPathnames *debPathnames); static bool expandDebianPatterns(DebianPathnames *debPathnames, const char *dataDirectoryTemplate, const char *confDirectoryTemplate); static bool expandDebianPatternsInDirectoryName(char *pathname, int pathnameSize, const char *template, const char *versionName, const char *clusterName); static void initPostgresConfigFiles(const char *dirname, PostgresConfigFiles *pgConfigFiles, PostgresConfigurationKind kind); static bool postgresConfigFilesAllExist(PostgresConfigFiles *pgConfigFiles); static bool move_configuration_files(PostgresConfigFiles *src, PostgresConfigFiles *dst); static bool comment_out_configuration_parameters(const char *srcConfPath, const char *dstConfPath); static bool disableAutoStart(PostgresConfigFiles *pgConfigFiles); /* * keeper_ensure_pg_configuration_files_in_pgdata checks if postgresql.conf, * pg_hba.conf, pg_ident.conf files exist in $PGDATA, if not it tries to get * them from default location and modifies paths inside copied postgresql.conf. 
*/ bool keeper_ensure_pg_configuration_files_in_pgdata(PostgresSetup *pgSetup) { PostgresConfigFiles pgConfigFiles = { 0 }; if (!debian_find_postgres_configuration_files(pgSetup, &pgConfigFiles)) { /* errors have already been logged */ return false; } switch (pgConfigFiles.kind) { case PG_CONFIG_TYPE_POSTGRES: { /* that's it, we're good */ return true; } case PG_CONFIG_TYPE_DEBIAN: { /* * So now pgConfigFiles is the debian path for configuration files, * and we're building a new pgdataConfigFiles for the Postgres * configuration files in PGDATA. */ PostgresConfigFiles pgdataConfigFiles = { 0 }; log_info("Found a debian style installation in PGDATA \"%s\" with " "postgresql.conf located at \"%s\"", pgSetup->pgdata, pgConfigFiles.conf); initPostgresConfigFiles(pgSetup->pgdata, &pgdataConfigFiles, PG_CONFIG_TYPE_POSTGRES); log_info("Moving configuration files back to PGDATA at \"%s\"", pgSetup->pgdata); /* move configuration files back to PGDATA, or die trying */ if (!move_configuration_files(&pgConfigFiles, &pgdataConfigFiles)) { char *_dirname = dirname(pgConfigFiles.conf); log_fatal("Failed to move the debian configuration files from " "\"%s\" back to PGDATA at \"%s\"", _dirname, pgSetup->pgdata); return false; } /* also disable debian auto start of the cluster we now own */ if (!disableAutoStart(&pgConfigFiles)) { log_fatal("Failed to disable debian auto-start behavior, " "see above for details"); return false; } return true; } case PG_CONFIG_TYPE_UNKNOWN: { log_fatal("Failed to find the \"postgresql.conf\" file. " "It's not in PGDATA, and it's not in the debian " "place we had a look at. 
See above for details"); return false; } } /* This is a huge bug */ log_error("BUG: some unknown PG_CONFIG enum value was encountered"); return false; } /* * debian_find_postgres_configuration_files finds the Postgres configuration * files following the following strategies: * * - first attempt to find the files where we expect them, in PGDATA * - then attempt to find the files in the debian /etc/postgresql/%v/%c place * * At the moment we only have those two strategies, and with some luck that's * all we're ever going to need. */ static bool debian_find_postgres_configuration_files(PostgresSetup *pgSetup, PostgresConfigFiles *pgConfigFiles) { PostgresConfigFiles postgresConfFiles = { 0 }; PostgresConfigFiles debianConfFiles = { 0 }; pgConfigFiles->kind = PG_CONFIG_TYPE_UNKNOWN; if (!pg_setup_pgdata_exists(pgSetup)) { return PG_CONFIG_TYPE_UNKNOWN; } /* is it a Postgres core initdb style setup? */ if (debian_init_postgres_config_files(pgSetup, &postgresConfFiles, PG_CONFIG_TYPE_POSTGRES)) { /* so we're dealing with a "normal" Postgres installation */ *pgConfigFiles = postgresConfFiles; return true; } /* * Is it a debian postgresql-common style setup then? * * We only search for debian style setup when the main postgresql.conf file * was not found. The previous call to debian_init_postgres_config_files * might see a partial failure because of e.g. missing only pg_ident.conf. */ if (!file_exists(postgresConfFiles.conf)) { if (debian_init_postgres_config_files(pgSetup, &debianConfFiles, PG_CONFIG_TYPE_DEBIAN)) { /* so we're dealing with a "debian style" Postgres installation */ *pgConfigFiles = debianConfFiles; return true; } } /* well that's all we know how to detect at this point */ return false; } /* * debian_init_postgres_config_files initializes the given PostgresConfigFiles * structure with the location of existing files as found on-disk given a * Postgres configuration kind. 
*/ static bool debian_init_postgres_config_files(PostgresSetup *pgSetup, PostgresConfigFiles *pgConfigFiles, PostgresConfigurationKind confKind) { const char *pgdata = pgSetup->pgdata; switch (confKind) { case PG_CONFIG_TYPE_UNKNOWN: { /* that's a bug really */ log_error("BUG: debian_init_postgres_config_files " "called with UNKNOWN conf kind"); return false; } case PG_CONFIG_TYPE_POSTGRES: { initPostgresConfigFiles(pgdata, pgConfigFiles, PG_CONFIG_TYPE_POSTGRES); return postgresConfigFilesAllExist(pgConfigFiles); } case PG_CONFIG_TYPE_DEBIAN: { DebianPathnames debPathnames = { 0 }; if (!buildDebianDataAndConfDirectoryNames(pgSetup, &debPathnames)) { log_warn("Failed to match PGDATA at \"%s\" with a debian " "setup following the data_directory template " "'/var/lib/postgresql/%%v/%%c'", pgSetup->pgdata); return false; } initPostgresConfigFiles(debPathnames.confDirectory, pgConfigFiles, PG_CONFIG_TYPE_DEBIAN); return postgresConfigFilesAllExist(pgConfigFiles); } } /* This is a huge bug */ log_error("BUG: some unknown PG_CONFIG enum value was encountered"); return false; } /* * buildDebianDataAndConfDirectoryNames builds the debian specific directory * pathnames from the pgSetup pgdata location. * * For a debian cluster, we first have to extract the "cluster" name (%c) and * then find the configuration files in /etc/postgresql/%v/%c with %v being the * version number. * * Note that debian's /etc/postgresql-common/createcluster.conf defaults to * using the following setup, and that's the only one we support at this * moment. 
* * data_directory = '/var/lib/postgresql/%v/%c' * */ static bool buildDebianDataAndConfDirectoryNames(PostgresSetup *pgSetup, DebianPathnames *debPathnames) { char *pgmajor = strdup(pgSetup->pg_version); char pgdata[MAXPGPATH]; char clusterDir[MAXPGPATH] = { 0 }; char versionDir[MAXPGPATH] = { 0 }; /* we need to work with the absolute pathname of PGDATA */ if (!normalize_filename(pgSetup->pgdata, pgdata, MAXPGPATH)) { /* errors have already been logged */ return false; } /* clusterDir is the same as pgdata really */ strlcpy(clusterDir, pgdata, MAXPGPATH); /* from PGDATA, get the directory one-level up */ strlcpy(versionDir, clusterDir, MAXPGPATH); get_parent_directory(versionDir); /* get the names of our version and cluster directories */ char *clusterDirName = strdup(basename(clusterDir)); char *versionDirName = strdup(basename(versionDir)); /* transform pgversion "11.4" to "11" to get the major version part */ char *dot = strchr(pgmajor, '.'); if (dot) { *dot = '\0'; } /* check that debian pathname version string == Postgres version string */ if (strcmp(versionDirName, pgmajor) != 0) { log_debug("Failed to match the version component of the " "debian data_directory \"%s\" with the current " "version of Postgres: \"%s\"", pgdata, pgmajor); return false; } /* prepare given debPathnames */ strlcpy(debPathnames->versionName, versionDirName, PG_VERSION_STRING_MAX); strlcpy(debPathnames->clusterName, clusterDirName, MAXPGPATH); if (!expandDebianPatterns(debPathnames, "/var/lib/postgresql/%v/%c", "/etc/postgresql/%v/%c")) { /* errors have already been logged */ return false; } /* free memory allocated with strdup */ free(pgmajor); free(clusterDirName); free(versionDirName); return true; } /* * expandDebianPatterns expands the %v and %c values in given templates and * apply the result to debPathnames->dataDirectory and * debPathnames->confDirectory. 
*/ static bool expandDebianPatterns(DebianPathnames *debPathnames, const char *dataDirectoryTemplate, const char *confDirectoryTemplate) { return expandDebianPatternsInDirectoryName(debPathnames->dataDirectory, MAXPGPATH, dataDirectoryTemplate, debPathnames->versionName, debPathnames->clusterName) && expandDebianPatternsInDirectoryName(debPathnames->confDirectory, MAXPGPATH, confDirectoryTemplate, debPathnames->versionName, debPathnames->clusterName); } /* * expandDebianPatternsInDirectoryName prepares a debian target data_directory * or configuration directory from a pattern. * * Given the parameters: * template = "/var/lib/postgresql/%v/%c" * versionName = "11" * clusterName = "main" * * Then the following string is copied in pre-allocated pathname: * "/var/lib/postgresql/11/main" */ static bool expandDebianPatternsInDirectoryName(char *pathname, int pathnameSize, const char *template, const char *versionName, const char *clusterName) { int pathnameIndex = 0; int templateIndex = 0; int templateSize = strlen(template); bool previousCharIsPercent = false; for (templateIndex = 0; templateIndex < templateSize; templateIndex++) { char currentChar = template[templateIndex]; if (pathnameIndex >= pathnameSize) { log_error("BUG: expandDebianPatternsInDirectoryName destination " "buffer is too short (%d bytes)", pathnameSize); return false; } if (previousCharIsPercent) { switch (currentChar) { case 'v': { int versionSize = strlen(versionName); /* * Only copy if we have enough room, increment pathnameSize * anyways so that the first check in the main loop catches * and report the error. */ if ((pathnameIndex + versionSize) < pathnameSize) { strlcpy(pathname + pathnameIndex, versionName, pathnameSize - pathnameIndex); pathnameIndex += versionSize; } break; } case 'c': { int clusterSize = strlen(clusterName); /* * Only copy if we have enough room, increment pathnameSize * anyways so that the first check in the main loop catches * and report the error. 
*/ if ((pathnameIndex + clusterSize) < pathnameSize) { strlcpy(pathname + pathnameIndex, clusterName, pathnameSize - pathnameIndex); pathnameIndex += clusterSize; } break; } default: { pathname[pathnameIndex++] = currentChar; break; } } } else if (currentChar != '%') { pathname[pathnameIndex++] = currentChar; } previousCharIsPercent = currentChar == '%'; } return true; } /* * initPostgresConfigFiles initializes PostgresConfigFiles structure with our * filenames located in given directory pathname. */ static void initPostgresConfigFiles(const char *dirname, PostgresConfigFiles *pgConfigFiles, PostgresConfigurationKind confKind) { pgConfigFiles->kind = confKind; join_path_components(pgConfigFiles->conf, dirname, "postgresql.conf"); join_path_components(pgConfigFiles->ident, dirname, "pg_ident.conf"); join_path_components(pgConfigFiles->hba, dirname, "pg_hba.conf"); } /* * postgresConfigFilesAllExist returns true when the three files that we track * all exit on the file system, per file_exists() test. */ static bool postgresConfigFilesAllExist(PostgresConfigFiles *pgConfigFiles) { /* * WARN the user about the unexpected nature of our setup here, even if we * then move on to make it the way we expect it. */ if (!file_exists(pgConfigFiles->conf)) { log_warn("Failed to find Postgres configuration files in PGDATA, " "as expected: \"%s\" does not exist", pgConfigFiles->conf); } if (!file_exists(pgConfigFiles->ident)) { log_warn("Failed to find Postgres configuration files in PGDATA, " "as expected: \"%s\" does not exist", pgConfigFiles->ident); } if (!file_exists(pgConfigFiles->hba)) { log_warn("Failed to find Postgres configuration files in PGDATA, " "as expected: \"%s\" does not exist", pgConfigFiles->hba); } return file_exists(pgConfigFiles->conf) && file_exists(pgConfigFiles->ident) && file_exists(pgConfigFiles->hba); } /* * move_configuration_files moves configuration files from the source place to * the destination place as given. 
* * While moving the files, we also need to edit the "postgresql.conf" content * to comment out the lines for the config_file, hba_file, and ident_file * location. We're going to use the Postgres defaults in PGDATA. */ static bool move_configuration_files(PostgresConfigFiles *src, PostgresConfigFiles *dst) { /* edit postgresql.conf and move it to its dst pathname */ log_info("Preparing \"%s\" from \"%s\"", dst->conf, src->conf); if (!comment_out_configuration_parameters(src->conf, dst->conf)) { return false; } /* HBA and ident files are copied without edits */ log_info("Moving \"%s\" to \"%s\"", src->hba, dst->hba); if (!move_file(src->hba, dst->hba)) { /* * Clean-up the mess then, and return false whether the clean-up is a * success or not. */ (void) unlink_file(dst->conf); return false; } /* HBA and ident files are copied without edits */ log_info("Moving \"%s\" to \"%s\"", src->ident, dst->ident); if (!move_file(src->ident, dst->ident)) { /* * Clean-up the mess then, and return false whether the clean-up is a * success or not. */ (void) unlink_file(dst->conf); (void) move_file(dst->hba, src->hba); return false; } /* finish the move of the postgresql.conf */ if (!unlink_file(src->conf)) { /* * Clean-up the mess then, and return false whether the clean-up is a * success or not. 
*/ (void) move_file(dst->hba, src->hba); (void) move_file(dst->ident, src->ident); return false; } /* consider failure to symlink as a non-fatal event */ (void) create_symbolic_link(src->conf, dst->conf); (void) create_symbolic_link(src->ident, dst->ident); (void) create_symbolic_link(src->hba, dst->hba); return true; } /* * comment_out_configuration_parameters reads postgresql.conf file from source * location and writes a new version of it at destination location with some * parameters commented out: * * data_directory * config_file * hba_file * ident_file * include_dir */ static bool comment_out_configuration_parameters(const char *srcConfPath, const char *dstConfPath) { char lineBuffer[BUFSIZE]; /* * configuration parameters can appear in any order, and we * need to check for patterns for NAME = VALUE and NAME=VALUE */ char *targetVariableExpression = "(" "data_directory" "|hba_file" "|ident_file" "|include_dir" "|stats_temp_directory" ")( *)="; /* open a file */ FILE *fileStream = fopen_read_only(srcConfPath); if (fileStream == NULL) { log_error("Failed to open file \"%s\": %m", srcConfPath); return false; } PQExpBuffer newConfContents = createPQExpBuffer(); if (newConfContents == NULL) { log_error("Failed to allocate memory"); return false; } /* read each line including terminating new line and process it */ while (fgets(lineBuffer, BUFSIZE, fileStream) != NULL) { bool variableFound = false; char *matchedString = regexp_first_match(lineBuffer, targetVariableExpression); /* check if the line contains any of target variables */ if (matchedString != NULL) { variableFound = true; /* regexp_first_match uses malloc, result must be deallocated */ free(matchedString); } /* * comment out the line if any of target variables is found * and if it was not already commented */ if (variableFound && lineBuffer[0] != '#') { appendPQExpBufferStr(newConfContents, EDITED_BY_PG_AUTOCTL); appendPQExpBufferStr(newConfContents, "# "); } /* copy rest of the line */ 
appendPQExpBufferStr(newConfContents, lineBuffer); } fclose(fileStream); /* write the resulting content at the destination path */ if (!write_file(newConfContents->data, newConfContents->len, dstConfPath)) { destroyPQExpBuffer(newConfContents); return false; } /* we don't need the buffer anymore */ destroyPQExpBuffer(newConfContents); /* * Refrain from removing the source file, we might fail to proceed and then * we will want to offer a path forward to the user where the original * configuration file is still around */ return true; } /* * disableAutoStart disables auto start in default configuration */ static bool disableAutoStart(PostgresConfigFiles *pgConfigFiles) { char startConfPath[MAXPGPATH] = { 0 }; char copyStartConfPath[MAXPGPATH] = { 0 }; char *newStartConfData = EDITED_BY_PG_AUTOCTL "disabled"; path_in_same_directory(pgConfigFiles->conf, "start.conf", startConfPath); path_in_same_directory(pgConfigFiles->conf, "start.conf.orig", copyStartConfPath); if (rename(startConfPath, copyStartConfPath) != 0) { log_error("Failed to rename debian auto start setup to \"%s\": %m", copyStartConfPath); return false; } return write_file(newStartConfData, strlen(newStartConfData), startConfPath); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/debian.h000066400000000000000000000032511414244367200221510ustar00rootroot00000000000000/* * src/bin/pg_autoctl/debian.h * * Debian specific code to support registering a pg_autoctl node from a * Postgres cluster created with pg_createcluster. We need to move the * configuration files back to PGDATA. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef DEBIAN_H #define DEBIAN_H #include "keeper_config.h" #include "pgsetup.h" /* * We know how to find configuration files in either PGDATA as per Postgres * core, or in the debian cluster configuration directory as per debian * postgres-common packaging, implemented in pg_createcluster. 
*/ typedef enum { PG_CONFIG_TYPE_UNKNOWN = 0, PG_CONFIG_TYPE_POSTGRES, PG_CONFIG_TYPE_DEBIAN } PostgresConfigurationKind; /* * debian's pg_createcluster moves the 3 configuration files to a place in /etc: * * - postgresql.conf * - pg_ident.conf * - pg_hba.conf * * On top of that debian also manages a "start.conf" file to decide if their * systemd integration should manage a given cluster. */ typedef struct pg_config_files { PostgresConfigurationKind kind; char conf[MAXPGPATH]; char ident[MAXPGPATH]; char hba[MAXPGPATH]; } PostgresConfigFiles; /* * debian handles paths for data_directory and configuration directory that * depend on two components: Postgres version string ("11", "12", etc) and * debian cluster name (defaults to "main"). */ typedef struct debian_pathnames { char versionName[PG_VERSION_STRING_MAX]; char clusterName[MAXPGPATH]; char dataDirectory[MAXPGPATH]; char confDirectory[MAXPGPATH]; } DebianPathnames; bool keeper_ensure_pg_configuration_files_in_pgdata(PostgresSetup *pgSetup); #endif /* DEBIAN_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/defaults.h000066400000000000000000000133731414244367200225440ustar00rootroot00000000000000/* * src/bin/pg_autoctl/defaults.h * Default values for pg_autoctl configuration settings * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #ifndef DEFAULTS_H #define DEFAULTS_H /* to be written in the state file */ #define PG_AUTOCTL_STATE_VERSION 1 /* additional version information for printing version on CLI */ #define PG_AUTOCTL_VERSION "1.6.3" /* version of the extension that we requite to talk to on the monitor */ #define PG_AUTOCTL_EXTENSION_VERSION "1.6" /* environment variable to use to make DEBUG facilities available */ #define PG_AUTOCTL_DEBUG "PG_AUTOCTL_DEBUG" #define PG_AUTOCTL_EXTENSION_VERSION_VAR "PG_AUTOCTL_EXTENSION_VERSION" /* environment variable for containing the id of the logging semaphore */ #define PG_AUTOCTL_LOG_SEMAPHORE "PG_AUTOCTL_LOG_SEMAPHORE" /* environment variable for --monitor, when used instead of --pgdata */ #define PG_AUTOCTL_MONITOR "PG_AUTOCTL_MONITOR" /* default values for the pg_autoctl settings */ #define POSTGRES_PORT 5432 #define POSTGRES_DEFAULT_LISTEN_ADDRESSES "*" #define DEFAULT_DATABASE_NAME "postgres" #define DEFAULT_USERNAME "postgres" #define DEFAULT_AUTH_METHOD "trust" #define REPLICATION_SLOT_NAME_DEFAULT "pgautofailover_standby" #define REPLICATION_SLOT_NAME_PATTERN "^pgautofailover_standby_" #define REPLICATION_PASSWORD_DEFAULT NULL #define REPLICATION_APPLICATION_NAME_PREFIX "pgautofailover_standby_" #define FORMATION_DEFAULT "default" #define GROUP_ID_DEFAULT 0 #define POSTGRES_CONNECT_TIMEOUT "2" #define MAXIMUM_BACKUP_RATE "100M" #define MAXIMUM_BACKUP_RATE_LEN 32 /* * Microsoft approved cipher string. * This cipher string implicitely enables only TLSv1.2+, because these ciphers * were all added in TLSv1.2. 
This can be confirmed by running: * openssl -v */ #define DEFAULT_SSL_CIPHERS "ECDHE-ECDSA-AES128-GCM-SHA256:" \ "ECDHE-ECDSA-AES256-GCM-SHA384:" \ "ECDHE-RSA-AES128-GCM-SHA256:" \ "ECDHE-RSA-AES256-GCM-SHA384:" \ "ECDHE-ECDSA-AES128-SHA256:" \ "ECDHE-ECDSA-AES256-SHA384:" \ "ECDHE-RSA-AES128-SHA256:" \ "ECDHE-RSA-AES256-SHA384" /* retry PQping for a maximum of 15 mins, up to 2 secs between attemps */ #define POSTGRES_PING_RETRY_TIMEOUT 900 /* seconds */ #define POSTGRES_PING_RETRY_CAP_SLEEP_TIME (2 * 1000) /* milliseconds */ #define POSTGRES_PING_RETRY_BASE_SLEEP_TIME 5 /* milliseconds */ #define PG_AUTOCTL_MONITOR_DISABLED "PG_AUTOCTL_DISABLED" #define NETWORK_PARTITION_TIMEOUT 20 #define PREPARE_PROMOTION_CATCHUP_TIMEOUT 30 #define PREPARE_PROMOTION_WALRECEIVER_TIMEOUT 5 #define PG_AUTOCTL_KEEPER_SLEEP_TIME 1 /* seconds */ #define PG_AUTOCTL_KEEPER_RETRY_TIME_MS 350 /* milliseconds */ #define PG_AUTOCTL_MONITOR_SLEEP_TIME 10 /* seconds */ #define PG_AUTOCTL_MONITOR_RETRY_TIME 1 /* seconds */ #define PG_AUTOCTL_LISTEN_NOTIFICATIONS_TIMEOUT 60 #define COORDINATOR_IS_READY_TIMEOUT 300 #define POSTGRESQL_FAILS_TO_START_TIMEOUT 20 #define POSTGRESQL_FAILS_TO_START_RETRIES 3 #define DEFAULT_CITUS_ROLE "primary" #define DEFAULT_CITUS_CLUSTER_NAME "default" #define FAILOVER_FORMATION_NUMBER_SYNC_STANDBYS 1 #define FAILOVER_NODE_CANDIDATE_PRIORITY 50 #define FAILOVER_NODE_REPLICATION_QUORUM true /* internal default for allocating strings */ #define BUFSIZE 1024 /* * 50kB seems enough to store the PATH environment variable if you have more, * simply set PATH to something smaller. 
* The limit on linux for environment variables is 128kB: * https://unix.stackexchange.com/questions/336934 */ #define MAXPATHSIZE 50000 /* buffersize that is needed for results of ctime_r */ #define MAXCTIMESIZE 26 #define AWAIT_PROMOTION_SLEEP_TIME_MS 1000 #define KEEPER_CONFIGURATION_FILENAME "pg_autoctl.cfg" #define KEEPER_STATE_FILENAME "pg_autoctl.state" #define KEEPER_PID_FILENAME "pg_autoctl.pid" #define KEEPER_INIT_STATE_FILENAME "pg_autoctl.init" #define KEEPER_POSTGRES_STATE_FILENAME "pg_autoctl.pg" #define KEEPER_NODES_FILENAME "nodes.json" #define KEEPER_SYSTEMD_SERVICE "pgautofailover" #define KEEPER_SYSTEMD_FILENAME "pgautofailover.service" /* pg_auto_failover monitor related constants */ #define PG_AUTOCTL_HEALTH_USERNAME "pgautofailover_monitor" #define PG_AUTOCTL_HEALTH_PASSWORD "pgautofailover_monitor" #define PG_AUTOCTL_REPLICA_USERNAME "pgautofailover_replicator" #define PG_AUTOCTL_MONITOR_DBNAME "pg_auto_failover" #define PG_AUTOCTL_MONITOR_EXTENSION_NAME "pgautofailover" #define PG_AUTOCTL_MONITOR_DBOWNER "autoctl" #define PG_AUTOCTL_MONITOR_USERNAME "autoctl_node" /* Citus support */ #define CITUS_EXTENSION_NAME "citus" /* Default external service provider to use to discover local IP address */ #define DEFAULT_INTERFACE_LOOKUP_SERVICE_NAME "8.8.8.8" #define DEFAULT_INTERFACE_LOOKUP_SERVICE_PORT 53 /* * Error codes returned to the shell in case something goes wrong. 
*/ #define EXIT_CODE_QUIT 0 /* it's ok, we were asked politely */ #define EXIT_CODE_BAD_ARGS 1 #define EXIT_CODE_BAD_CONFIG 2 #define EXIT_CODE_BAD_STATE 3 #define EXIT_CODE_PGSQL 4 #define EXIT_CODE_PGCTL 5 #define EXIT_CODE_MONITOR 6 #define EXIT_CODE_COORDINATOR 7 #define EXIT_CODE_KEEPER 8 #define EXIT_CODE_RELOAD 9 #define EXIT_CODE_INTERNAL_ERROR 12 #define EXIT_CODE_EXTENSION_MISSING 13 #define EXIT_CODE_DROPPED 121 /* node was dropped, stop everything and quit */ #define EXIT_CODE_FATAL 122 /* error is fatal, no retry, quit now */ /* * This opens file write only and creates if it doesn't exist. */ #define FOPEN_FLAGS_W O_WRONLY | O_TRUNC | O_CREAT /* * This opens the file in append mode and creates it if it doesn't exist. */ #define FOPEN_FLAGS_A O_APPEND | O_RDWR | O_CREAT /* when malloc fails, what do we tell our users */ #define ALLOCATION_FAILED_ERROR "Failed to allocate memory: %m" #endif /* DEFAULTS_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/demoapp.c000066400000000000000000000520531414244367200223530ustar00rootroot00000000000000/* * src/bin/pg_autoctl/demoapp.c * Demo application for pg_auto_failover * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include #include #include #include #include "cli_do_demoapp.h" #include "defaults.h" #include "demoapp.h" #include "env_utils.h" #include "log.h" #include "monitor.h" #include "pgsql.h" #include "signals.h" #include "string_utils.h" #include "runprogram.h" static void demoapp_set_retry_policy(PGSQL *pgsql, int cap, int sleepTime); static bool demoapp_register_client(const char *pguri, int clientId, int retrySleep, int retryCap); static bool demoapp_update_client_failovers(const char *pguri, int clientId, int failovers); static void demoapp_start_client(const char *pguri, int clientId, DemoAppOptions *demoAppOptions); static bool demoapp_wait_for_clients(pid_t clientsPidArray[], int startedClientsCount); static void demoapp_terminate_clients(pid_t clientsPidArray[], int startedClientsCount); static void demoapp_process_perform_switchover(DemoAppOptions *demoAppOptions); static int demoapp_get_terminal_columns(void); static void demoapp_psql(const char *pguri, const char *sql); /* * demoapp_grab_formation_uri connects to the monitor and grabs the formation * URI to use in the demo application. 
*/ bool demoapp_grab_formation_uri(DemoAppOptions *options, char *pguri, size_t size, bool *mayRetry) { Monitor monitor = { 0 }; SSLOptions ssl = { 0 }; SSLMode sslMode = SSL_MODE_PREFER; char *sslModeStr = pgsetup_sslmode_to_string(sslMode); ssl.sslMode = sslMode; strlcpy(ssl.sslModeStr, sslModeStr, SSL_MODE_STRLEN); *mayRetry = false; if (!monitor_init(&monitor, options->monitor_pguri)) { /* errors have already been logged */ return false; } /* allow lots of retries to connect to the monitor at startup */ pgsql_set_monitor_interactive_retry_policy(&(monitor.pgsql.retryPolicy)); if (!monitor_formation_uri(&monitor, options->formation, "default", &ssl, pguri, size)) { int groupsCount = 0; if (!monitor_count_groups(&monitor, options->formation, &groupsCount)) { /* errors have already been logged */ return false; } if (groupsCount >= 0) { *mayRetry = true; } log_level(*mayRetry ? LOG_ERROR : LOG_FATAL, "Failed to grab the Postgres URI " "to connect to formation \"%s\", see above for details", options->formation); pgsql_finish(&(monitor.pgsql)); return false; } pgsql_finish(&(monitor.pgsql)); return true; } /* * demoapp_set_retry_policy sets a retry policy that is suitable for a demo * client application. */ static void demoapp_set_retry_policy(PGSQL *pgsql, int cap, int sleepTime) { (void) pgsql_set_retry_policy(&(pgsql->retryPolicy), 60, /* maxT */ -1, /* unbounded maxR */ cap, sleepTime); } /* * demoapp_prepare_schema prepares the demo application schema on the target * database instance. 
*/ bool demoapp_prepare_schema(const char *pguri) { PGSQL pgsql = { 0 }; const char *ddls[] = { "drop schema if exists demo cascade", "create schema demo", "create table demo.tracking(ts timestamptz default now(), " "client integer, loop integer, retries integer, us bigint, recovery bool)", "create table demo.client(client integer, pid integer, " "retry_sleep_ms integer, retry_cap_ms integer, failover_count integer)", NULL }; /* use the retry policy for a REMOTE node */ pgsql_init(&pgsql, (char *) pguri, PGSQL_CONN_APP); demoapp_set_retry_policy(&pgsql, DEMO_DEFAULT_RETRY_CAP_TIME, DEMO_DEFAULT_RETRY_SLEEP_TIME); for (int i = 0; ddls[i] != NULL; i++) { const char *command = ddls[i]; log_info("Preparing demo schema: %s", command); if (!pgsql_execute(&pgsql, command)) { return false; } } return true; } /* * demoapp_run runs clientsCount sub-processes for given duration (in seconds), * each sub-process implements a very simple INSERT INTO in a loop. */ bool demoapp_run(const char *pguri, DemoAppOptions *demoAppOptions) { int clientsCount = demoAppOptions->clientsCount; int startedClientsCount = 0; pid_t clientsPidArray[MAX_CLIENTS_COUNT + 1] = { 0 }; log_info("Starting %d concurrent clients as sub-processes", clientsCount); /* Flush stdio channels just before fork, to avoid double-output problems */ fflush(stdout); fflush(stderr); /* we want to use the same logs semaphore in the sub-processes */ for (int index = 0; index <= clientsCount; index++) { pid_t fpid = fork(); switch (fpid) { case -1: { log_error("Failed to fork client %d", index); (void) demoapp_terminate_clients(clientsPidArray, startedClientsCount); return false; } case 0: { /* initialize the semaphore used for locking log output */ if (!semaphore_init(&log_semaphore)) { exit(EXIT_CODE_INTERNAL_ERROR); } /* set our logging facility to use our semaphore as a lock */ (void) log_set_udata(&log_semaphore); (void) log_set_lock(&semaphore_log_lock_function); if (index == 0) { (void) 
demoapp_process_perform_switchover(demoAppOptions); } else { (void) demoapp_start_client(pguri, index, demoAppOptions); } (void) semaphore_finish(&log_semaphore); exit(EXIT_CODE_QUIT); } default: { /* fork succeeded, in parent */ clientsPidArray[index] = fpid; ++startedClientsCount; } } } /* all clients have started, now wait until they are done */ return demoapp_wait_for_clients(clientsPidArray, startedClientsCount); } /* * demoapp_wait_for_clients waits until all the subprocess are finished. */ static bool demoapp_wait_for_clients(pid_t clientsPidArray[], int startedClientsCount) { int subProcessCount = startedClientsCount; bool allReturnCodeAreZero = true; while (subProcessCount > 0) { pid_t pid; int status; /* ignore errors */ pid = waitpid(-1, &status, WNOHANG); switch (pid) { case -1: { if (errno == ECHILD) { /* no more childrens */ return subProcessCount == 0; } pg_usleep(100 * 1000); /* 100 ms */ break; } case 0: { /* * We're using WNOHANG, 0 means there are no stopped or * exited children, it's all good. It's the expected case * when everything is running smoothly, so enjoy and sleep * for awhile. */ pg_usleep(100 * 1000); /* 100 ms */ break; } default: { /* * One of the az vm create sub-commands has finished, find * which and if it went all okay. */ int returnCode = WEXITSTATUS(status); /* find which client is done now */ for (int index = 0; index < startedClientsCount; index++) { if (clientsPidArray[index] == pid) { if (returnCode != 0) { log_error("Client %d (pid %d) exited with code %d", index, pid, returnCode); allReturnCodeAreZero = false; } } } --subProcessCount; break; } } } return allReturnCodeAreZero; } /* * demoapp_terminate_clients sends a SIGQUIT signal to known-running client * processes, and then wait until the processes are finished. 
*/ static void demoapp_terminate_clients(pid_t clientsPidArray[], int startedClientsCount) { for (int index = 0; index < startedClientsCount; index++) { int pid = clientsPidArray[index]; if (kill(pid, SIGQUIT) != 0) { log_error("Failed to send SIGQUIT to client %d pid %d: %m", index, pid); } } } /* * demoapp_perform_switchover performs a switchover while the demo application * is running, once in a while */ static void demoapp_process_perform_switchover(DemoAppOptions *demoAppOptions) { Monitor monitor = { 0 }; char *channels[] = { "state", NULL }; char *formation = demoAppOptions->formation; int groupId = demoAppOptions->groupId; bool durationElapsed = false; uint64_t startTime = time(NULL); if (!demoAppOptions->doFailover) { log_info("This demo run uses --no-failover"); exit(EXIT_CODE_QUIT); } if (!monitor_init(&monitor, demoAppOptions->monitor_pguri)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } pgsql_set_monitor_interactive_retry_policy(&(monitor.pgsql.retryPolicy)); if (demoAppOptions->duration <= (demoAppOptions->firstFailover + 10)) { log_error("Use a --duration of at least %ds for a failover to happen", demoAppOptions->firstFailover + 10); exit(EXIT_CODE_INTERNAL_ERROR); } log_info("Failover client is started, will failover in %ds " "and every %ds after that", demoAppOptions->firstFailover, demoAppOptions->failoverFreq); while (!durationElapsed) { uint64_t now = time(NULL); int currentSecond = (int) (now - startTime); if ((now - startTime) > demoAppOptions->duration) { durationElapsed = true; break; } /* * skip failover unless conditions are right: * * - current second is firstFailover (--first-failover 10) * * - we went past firstFailover already and current second is a * multiple of the failover frequency (failover every failoverFreq * seconds after the first failover). 
*/ if (currentSecond != demoAppOptions->firstFailover || (currentSecond > demoAppOptions->firstFailover && ((currentSecond - demoAppOptions->firstFailover) % demoAppOptions->failoverFreq) != 0)) { pg_usleep(500); continue; } log_info("pg_autoctl perform failover"); /* start listening to the state changes before we perform_failover */ if (!pgsql_listen(&(monitor.pgsql), channels)) { log_error("Failed to listen to state changes from the monitor"); pgsql_finish(&(monitor.pgsql)); continue; } if (!monitor_perform_failover(&monitor, formation, groupId)) { log_fatal("Failed to perform failover/switchover, " "see above for details"); /* skip this round entirely and continue */ sleep(1); continue; } /* process state changes notification until we have a new primary */ if (!monitor_wait_until_some_node_reported_state( &monitor, formation, groupId, NODE_KIND_UNKNOWN, PRIMARY_STATE, PG_AUTOCTL_LISTEN_NOTIFICATIONS_TIMEOUT)) { log_error("Failed to wait until a new primary has been notified"); continue; } } } /* * demoapp_register_client registers a client with its retry policy */ static bool demoapp_register_client(const char *pguri, int clientId, int retrySleep, int retryCap) { PGSQL pgsql = { 0 }; char *sql = "insert into demo.client(client, pid, retry_sleep_ms, retry_cap_ms) " "values($1, $2, $3, $4)"; const Oid paramTypes[4] = { INT4OID, INT4OID, INT4OID, INT4OID }; const char *paramValues[4] = { 0 }; paramValues[0] = intToString(clientId).strValue; paramValues[1] = intToString(getpid()).strValue; paramValues[2] = intToString(retrySleep).strValue; paramValues[3] = intToString(retryCap).strValue; pgsql_init(&pgsql, (char *) pguri, PGSQL_CONN_APP); demoapp_set_retry_policy(&pgsql, DEMO_DEFAULT_RETRY_CAP_TIME, DEMO_DEFAULT_RETRY_SLEEP_TIME); if (!pgsql_execute_with_params(&pgsql, sql, 4, paramTypes, paramValues, NULL, NULL)) { /* errors have already been logged */ pgsql_finish(&pgsql); return false; } pgsql_finish(&pgsql); return true; } /* * demoapp_update_client_failovers 
registers how many failovers a client faced */ static bool demoapp_update_client_failovers(const char *pguri, int clientId, int failovers) { PGSQL pgsql = { 0 }; char *sql = "update demo.client set failover_count = $2 where client = $1"; const Oid paramTypes[2] = { INT4OID, INT4OID }; const char *paramValues[2] = { 0 }; paramValues[0] = intToString(clientId).strValue; paramValues[1] = intToString(failovers).strValue; pgsql_init(&pgsql, (char *) pguri, PGSQL_CONN_APP); demoapp_set_retry_policy(&pgsql, DEMO_DEFAULT_RETRY_CAP_TIME, DEMO_DEFAULT_RETRY_SLEEP_TIME); if (!pgsql_execute_with_params(&pgsql, sql, 2, paramTypes, paramValues, NULL, NULL)) { /* errors have already been logged */ pgsql_finish(&pgsql); return false; } pgsql_finish(&pgsql); return true; } /* * http://c-faq.com/lib/randrange.html */ #define random_between(M, N) \ ((M) + pg_lrand48() / (RAND_MAX / ((N) -(M) +1) + 1)) /* * demo_start_client starts a sub-process that implements our demo application: * the subprocess connects to Postgres and INSERT INTO our demo tracking table * some latency information. 
*/ static void demoapp_start_client(const char *pguri, int clientId, DemoAppOptions *demoAppOptions) { uint64_t startTime = time(NULL); bool durationElapsed = false; bool firstLoop = true; uint64_t previousLogLineTime = 0; int directs = 0; int retries = 0; int failovers = 0; int maxConnectionTimeNoRetry = 0; int maxConnectionTimeWithRetries = 0; int retryCap = 200; /* sleep up to 200ms between attempts */ int retrySleepTime = 500; /* first retry happens after 500 ms */ /* initialize a seed for our random number generator */ pg_srand48(((unsigned int) (getpid() ^ time(NULL)))); /* pick a random retry policy for this client */ retryCap = random_between(50, 500); retrySleepTime = random_between(500, 1500); log_info("Client %d is using a retry policy with initial sleep time %d ms " "and a retry time capped at %d ms", clientId, retrySleepTime, retryCap); if (!demoapp_register_client(pguri, clientId, retrySleepTime, retryCap)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } for (int index = 0; !durationElapsed; index++) { PGSQL pgsql = { 0 }; bool is_in_recovery = false; uint64_t now = time(NULL); if (firstLoop) { firstLoop = false; } else { int sleepTimeMs = random_between(10, 200); pg_usleep(sleepTimeMs * 1000); } if ((now - startTime) > demoAppOptions->duration) { durationElapsed = true; break; } /* use the retry policy for a REMOTE node */ pgsql_init(&pgsql, (char *) pguri, PGSQL_CONN_APP); demoapp_set_retry_policy(&pgsql, retryCap, retrySleepTime); if (!pgsql_is_in_recovery(&pgsql, &is_in_recovery)) { /* errors have already been logged */ continue; } instr_time duration = pgsql.retryPolicy.connectTime; INSTR_TIME_SUBTRACT(duration, pgsql.retryPolicy.startTime); if (pgsql.retryPolicy.attempts == 0) { ++directs; /* we could connect without retries, everything is fine */ if (maxConnectionTimeNoRetry == 0 || INSTR_TIME_GET_MILLISEC(duration) > maxConnectionTimeNoRetry) { maxConnectionTimeNoRetry = INSTR_TIME_GET_MILLISEC(duration); } /* log 
every 2s max, to avoid filling in the logs */ if (previousLogLineTime == 0 || (now - previousLogLineTime) >= 10) { if (failovers == 0) { log_info("Client %d connected %d times in less than %d ms, " "before first failover", clientId, directs, maxConnectionTimeNoRetry); } else { log_info("Client %d connected %d times in less than %d ms, " "after %d failover(s)", clientId, directs, maxConnectionTimeNoRetry, failovers); } previousLogLineTime = now; } } else { /* we had to retry connecting, a failover is in progress */ ++failovers; retries += pgsql.retryPolicy.attempts; if (maxConnectionTimeWithRetries == 0 || INSTR_TIME_GET_MILLISEC(duration) > maxConnectionTimeWithRetries) { maxConnectionTimeWithRetries = INSTR_TIME_GET_MILLISEC(duration); } log_info("Client %d attempted to connect during a failover, " "and had to attempt %d times which took %5.3f ms with " "the current retry policy", clientId, pgsql.retryPolicy.attempts, INSTR_TIME_GET_MILLISEC(duration)); } char *sql = "insert into demo.tracking(client, loop, retries, us, recovery) " "values($1, $2, $3, $4, $5)"; const Oid paramTypes[5] = { INT4OID, INT4OID, INT8OID, INT8OID, BOOLOID }; const char *paramValues[5] = { 0 }; paramValues[0] = intToString(clientId).strValue; paramValues[1] = intToString(index).strValue; paramValues[2] = intToString(pgsql.retryPolicy.attempts).strValue; paramValues[3] = intToString(INSTR_TIME_GET_MICROSEC(duration)).strValue; paramValues[4] = is_in_recovery ? 
"true" : "false"; if (!pgsql_execute_with_params(&pgsql, sql, 5, paramTypes, paramValues, NULL, NULL)) { /* errors have already been logged */ } /* the idea is to reconnect every time */ pgsql_finish(&pgsql); } if (!demoapp_update_client_failovers(pguri, clientId, failovers)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } log_info("Client %d connected on first attempt %d times " "with a maximum connection time of %d ms", clientId, directs, maxConnectionTimeNoRetry); log_info("Client %d is using a retry policy with initial sleep time %d ms " "and a retry time capped at %d ms", clientId, retrySleepTime, retryCap); log_info("Client %d attempted to connect during a failover %d times " "with a maximum connection time of %d ms and a total number " "of %d retries", clientId, failovers, maxConnectionTimeWithRetries, retries); } /* * demoapp_print_histogram prints an histogram of the distribution of the * connection timings measured throughout the testing. */ void demoapp_print_histogram(const char *pguri, DemoAppOptions *demoAppOptions) { const char *sqlFormatString = /* *INDENT-OFF* */ "with minmax as ( select min(us), max(us) from demo.tracking ), " "histogram as ( " "select width_bucket(us, min, max, 18) as bucket, " "round(min(us)/1000.0, 3) as min, " "round(max(us)/1000.0, 3) as max, " "count(*) as freq " "from demo.tracking, minmax " "group by bucket " "order by bucket " ") " "select min as \"Min Connect Time (ms)\", max, freq, " "repeat('▒', " "(freq::float / max(freq) over() * %d)::int " ") as bar " "from histogram; "; /* *INDENT-ON* */ /* the first columns take up 45 columns already, use what's remaining */ int cols = demoapp_get_terminal_columns() - 45; char sql[BUFSIZE] = { 0 }; sformat(sql, sizeof(sql), sqlFormatString, cols); (void) demoapp_psql(pguri, sql); } #define P95 "percentile_cont(0.95) within group (order by us::float8) / 1000.0" #define P99 "percentile_cont(0.99) within group (order by us::float8) / 1000.0" /* * 
demoapp_print_summary prints a summar of what happened during the run. */ void demoapp_print_summary(const char *pguri, DemoAppOptions *demoAppOptions) { const char *sql = /* *INDENT-OFF* */ "with stats as( " "select client, " "count(*) as conn, " "sum(retries), " "round(min(us)/1000.0, 3) as min, " "round(max(us)/1000.0, 3) as max, " "round((" P95 ")::numeric, 3) as p95, " "round((" P99 ")::numeric, 3) as p99 " "from demo.tracking " "group by rollup(client) " ") " "select " "case when client is not null then format('Client %s', client) " "else ('All Clients Combined') end as \"Client\", " "conn as \"Connections\", " /* "failover_count as \"Failovers\", " */ /* "retry_sleep_ms as \"Retry Sleep (ms)\", " */ /* "retry_cap_ms as \"Retry Cap (ms)\", " */ "sum as \"Retries\", " "min as \"Min Connect Time (ms)\", max, p95, p99 " "from stats left join demo.client using(client) " "order by client nulls last"; /* *INDENT-ON* */ log_info("Summary for the demo app running with %d clients for %ds", demoAppOptions->clientsCount, demoAppOptions->duration); (void) demoapp_psql(pguri, sql); } /* * demoapp_get_terminal_columns gets the current terminal window width. */ static int demoapp_get_terminal_columns() { struct winsize ws; if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) == -1) { log_error("Failed to get terminal width: %m"); /* default terminal width is 78 (less than the magic 80) */ return 78; } return ws.ws_col; } /* * demoapp_psql calls psql to display the results of a given SQL query, in a * way that we don't have to compute the headers for the output. 
*/ static void demoapp_psql(const char *pguri, const char *sql) { char cat[MAXPGPATH] = { 0 }; char psql[MAXPGPATH] = { 0 }; char *args[16]; int argsIndex = 0; /* we shell-out to psql so that we don't have to compute headers */ if (!search_path_first("psql", psql, LOG_ERROR)) { log_fatal("Failed to find program psql in PATH"); exit(EXIT_CODE_INTERNAL_ERROR); } /* we use /bin/cat as our PAGER */ if (!search_path_first("cat", cat, LOG_ERROR)) { log_fatal("Failed to find program cat in PATH"); exit(EXIT_CODE_INTERNAL_ERROR); } /* set our PAGER to be just cat */ setenv("PAGER", cat, 1); args[argsIndex++] = psql; args[argsIndex++] = "--no-psqlrc"; args[argsIndex++] = "-d"; args[argsIndex++] = (char *) pguri; args[argsIndex++] = "-c"; args[argsIndex++] = (char *) sql; args[argsIndex++] = NULL; /* we do not want to call setsid() when running this program. */ Program program = { 0 }; (void) initialize_program(&program, args, false); program.capture = false; /* don't capture output */ program.tty = true; /* allow sharing the parent's tty */ (void) execute_subprogram(&program); free_program(&program); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/demoapp.h000066400000000000000000000014141414244367200223530ustar00rootroot00000000000000/* * src/bin/pg_autoctl/demoapp.h * Demo application for pg_auto_failover * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #ifndef DEMOAPP_H #define DEMOAPP_H #include #include "cli_do_demoapp.h" #define DEMO_DEFAULT_RETRY_CAP_TIME 200 #define DEMO_DEFAULT_RETRY_SLEEP_TIME 500 bool demoapp_grab_formation_uri(DemoAppOptions *options, char *pguri, size_t size, bool *mayRetry); bool demoapp_prepare_schema(const char *pguri); bool demoapp_run(const char *pguri, DemoAppOptions *demoAppOptions); void demoapp_print_histogram(const char *pguri, DemoAppOptions *demoAppOptions); void demoapp_print_summary(const char *pguri, DemoAppOptions *demoAppOptions); #endif /* DEMOAPP_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/env_utils.c000066400000000000000000000100371414244367200227320ustar00rootroot00000000000000/* * src/bin/pg_autoctl/env_utils.c * Utility functions for interacting with environment settings. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include "defaults.h" #include "env_utils.h" #include "log.h" /* * env_found_empty returns true if the passed environment variable is the empty * string. It returns false when the environment variable is not set or if it * set but is something else than the empty string. */ bool env_found_empty(const char *name) { if (name == NULL || strlen(name) == 0) { log_error("Failed to get environment setting. " "NULL or empty variable name is provided"); return false; } /* * Explanation of IGNORE-BANNED * getenv is safe here because we never provide null argument, * and only check the value it's length. */ char *envvalue = getenv(name); /* IGNORE-BANNED */ return envvalue != NULL && strlen(envvalue) == 0; } /* * env_exists returns true if the passed environment variable exists in the * environment, otherwise it returns false. */ bool env_exists(const char *name) { if (name == NULL || strlen(name) == 0) { log_error("Failed to get environment setting. 
" "NULL or empty variable name is provided"); return false; } /* * Explanation of IGNORE-BANNED * getenv is safe here because we never provide null argument, * and only check if it returns NULL. */ return getenv(name) != NULL; /* IGNORE-BANNED */ } /* * get_env_copy_with_fallback copies the environment variable with "name" into * the result buffer. It returns false when it fails. If the environment * variable is not set the fallback string will be written in the buffer. * Except when fallback is NULL, in that case an error is returned. */ bool get_env_copy_with_fallback(const char *name, char *result, int maxLength, const char *fallback) { if (name == NULL || strlen(name) == 0) { log_error("Failed to get environment setting. " "NULL or empty variable name is provided"); return false; } if (result == NULL) { log_error("Failed to get environment setting. " "Tried to store in NULL pointer"); return false; } /* * Explanation of IGNORE-BANNED * getenv is safe here because we never provide null argument, * and copy out the result immediately. */ const char *envvalue = getenv(name); /* IGNORE-BANNED */ if (envvalue == NULL) { envvalue = fallback; if (envvalue == NULL) { log_error("Failed to get value for environment variable '%s', " "which is unset", name); return false; } } size_t actualLength = strlcpy(result, envvalue, maxLength); /* uses >= to make sure the nullbyte fits */ if (actualLength >= maxLength) { log_error("Failed to copy value stored in %s environment setting, " "which is %lu long. pg_autoctl only supports %lu bytes for " "this environment setting", name, (unsigned long) actualLength, (unsigned long) maxLength - 1); return false; } return true; } /* * get_env_copy copies the environmennt variable with "name" into tho result * buffer. It returns false when it fails. The environment variable not * existing is also considered a failure. 
*/ bool get_env_copy(const char *name, char *result, int maxLength) { return get_env_copy_with_fallback(name, result, maxLength, NULL); } /* * get_env_pgdata checks for environment value PGDATA * and copy its value into provided buffer. * * function returns true on successful run. returns false * if it can't find PGDATA or its value is larger than * the provided buffer */ bool get_env_pgdata(char *pgdata) { return get_env_copy("PGDATA", pgdata, MAXPGPATH) > 0; } /* * get_env_pgdata_or_exit does the same as get_env_pgdata. Instead of * returning false in case of error it exits the process and shows a FATAL log * message. */ void get_env_pgdata_or_exit(char *pgdata) { if (get_env_pgdata(pgdata)) { return; } log_fatal("Failed to set PGDATA either from the environment " "or from --pgdata"); exit(EXIT_CODE_BAD_ARGS); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/env_utils.h000066400000000000000000000012011414244367200227300ustar00rootroot00000000000000/* * src/bin/pg_autoctl/env_utils.h * Utility functions for interacting with environment settings. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef ENV_UTILS_H #define ENV_UTILS_H #include "postgres_fe.h" bool env_found_empty(const char *name); bool env_exists(const char *name); bool get_env_copy(const char *name, char *outbuffer, int maxLength); bool get_env_copy_with_fallback(const char *name, char *result, int maxLength, const char *fallback); bool get_env_pgdata(char *pgdata); void get_env_pgdata_or_exit(char *pgdata); #endif /* ENV_UTILS_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/file_utils.c000066400000000000000000000523141414244367200230650ustar00rootroot00000000000000/* * src/bin/pg_autoctl/file_utils.c * Implementations of utility functions for reading and writing files * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #if defined(__APPLE__) #include #endif #include "postgres_fe.h" #include "snprintf.h" #include "cli_root.h" #include "defaults.h" #include "env_utils.h" #include "file_utils.h" #include "log.h" static bool read_file_internal(FILE *fileStream, const char *filePath, char **contents, long *fileSize); /* * file_exists returns true if the given filename is known to exist * on the file system or false if it does not exist or in case of * error. */ bool file_exists(const char *filename) { bool exists = access(filename, F_OK) != -1; if (!exists && errno != 0) { /* * Only log "interesting" errors here. * * The fact that the file does not exist is not interesting: we're * retuning false and the caller figures it out, maybe then creating * the file. */ if (errno != ENOENT && errno != ENOTDIR) { log_error("Failed to check if file \"%s\" exists: %m", filename); } return false; } return exists; } /* * directory_exists returns whether the given path is the name of a directory that * exists on the file system or not. */ bool directory_exists(const char *path) { struct stat info; if (!file_exists(path)) { return false; } if (stat(path, &info) != 0) { log_error("Failed to stat \"%s\": %m\n", path); return false; } bool result = (info.st_mode & S_IFMT) == S_IFDIR; return result; } /* * ensure_empty_dir ensures that the given path points to an empty directory with * the given mode. If it fails to do so, it returns false. */ bool ensure_empty_dir(const char *dirname, int mode) { /* pg_mkdir_p might modify its input, so create a copy of dirname. */ char dirname_copy[MAXPGPATH]; strlcpy(dirname_copy, dirname, MAXPGPATH); if (directory_exists(dirname)) { if (!rmtree(dirname, true)) { log_error("Failed to remove directory \"%s\": %m", dirname); return false; } } else { /* * reset errno, we don't care anymore that it failed because dirname * doesn't exists. 
*/ errno = 0; } if (pg_mkdir_p(dirname_copy, mode) == -1) { log_error("Failed to ensure empty directory \"%s\": %m", dirname); return false; } return true; } /* * fopen_with_umask is a version of fopen that gives more control. The main * advantage of it is that it allows specifying a umask of the file. This makes * sure files are not accidentally created with umask 777 if the user has it * configured in a weird way. * * This function returns NULL when opening the file fails. So this should be * handled. It will log an error in this case though, so that's not necessary * at the callsite. */ FILE * fopen_with_umask(const char *filePath, const char *modes, int flags, mode_t umask) { int fileDescriptor = open(filePath, flags, umask); if (fileDescriptor == -1) { log_error("Failed to open file \"%s\": %m", filePath); return NULL; } FILE *fileStream = fdopen(fileDescriptor, modes); if (fileStream == NULL) { log_error("Failed to open file \"%s\": %m", filePath); close(fileDescriptor); } return fileStream; } /* * fopen_read_only opens the file as a read only stream. */ FILE * fopen_read_only(const char *filePath) { /* * Explanation of IGNORE-BANNED * fopen is safe here because we open the file in read only mode. So no * exclusive access is needed. */ return fopen(filePath, "rb"); /* IGNORE-BANNED */ } /* * write_file writes the given data to the file given by filePath using * our logging library to report errors. If succesful, the function returns * true. 
*/ bool write_file(char *data, long fileSize, const char *filePath) { FILE *fileStream = fopen_with_umask(filePath, "wb", FOPEN_FLAGS_W, 0644); if (fileStream == NULL) { /* errors have already been logged */ return false; } if (fwrite(data, sizeof(char), fileSize, fileStream) < fileSize) { log_error("Failed to write file \"%s\": %m", filePath); fclose(fileStream); return false; } if (fclose(fileStream) == EOF) { log_error("Failed to write file \"%s\"", filePath); return false; } return true; } /* * append_to_file writes the given data to the end of the file given by * filePath using our logging library to report errors. If succesful, the * function returns true. */ bool append_to_file(char *data, long fileSize, const char *filePath) { FILE *fileStream = fopen_with_umask(filePath, "ab", FOPEN_FLAGS_A, 0644); if (fileStream == NULL) { /* errors have already been logged */ return false; } if (fwrite(data, sizeof(char), fileSize, fileStream) < fileSize) { log_error("Failed to write file \"%s\": %m", filePath); fclose(fileStream); return false; } if (fclose(fileStream) == EOF) { log_error("Failed to write file \"%s\"", filePath); return false; } return true; } /* * read_file_if_exists is a utility function that reads the contents of a file * using our logging library to report errors. ENOENT is not considered worth * of a log message in this function, and we still return false in that case. * * If successful, the function returns true and fileSize points to the number * of bytes that were read and contents points to a buffer containing the entire * contents of the file. This buffer should be freed by the caller. 
*/ bool read_file_if_exists(const char *filePath, char **contents, long *fileSize) { /* open a file */ FILE *fileStream = fopen_read_only(filePath); if (fileStream == NULL) { if (errno != ENOENT) { log_error("Failed to open file \"%s\": %m", filePath); } return false; } return read_file_internal(fileStream, filePath, contents, fileSize); } /* * read_file is a utility function that reads the contents of a file using our * logging library to report errors. * * If successful, the function returns true and fileSize points to the number * of bytes that were read and contents points to a buffer containing the entire * contents of the file. This buffer should be freed by the caller. */ bool read_file(const char *filePath, char **contents, long *fileSize) { /* open a file */ FILE *fileStream = fopen_read_only(filePath); if (fileStream == NULL) { log_error("Failed to open file \"%s\": %m", filePath); return false; } return read_file_internal(fileStream, filePath, contents, fileSize); } /* * read_file_internal is shared by both read_file and read_file_if_exists * functions. 
*/ static bool read_file_internal(FILE *fileStream, const char *filePath, char **contents, long *fileSize) { /* get the file size */ if (fseek(fileStream, 0, SEEK_END) != 0) { log_error("Failed to read file \"%s\": %m", filePath); fclose(fileStream); return false; } *fileSize = ftell(fileStream); if (*fileSize < 0) { log_error("Failed to read file \"%s\": %m", filePath); fclose(fileStream); return false; } if (fseek(fileStream, 0, SEEK_SET) != 0) { log_error("Failed to read file \"%s\": %m", filePath); fclose(fileStream); return false; } /* read the contents */ char *data = malloc(*fileSize + 1); if (data == NULL) { log_error("Failed to allocate %ld bytes", *fileSize); log_error(ALLOCATION_FAILED_ERROR); fclose(fileStream); return false; } if (fread(data, sizeof(char), *fileSize, fileStream) < *fileSize) { log_error("Failed to read file \"%s\": %m", filePath); fclose(fileStream); free(data); return false; } if (fclose(fileStream) == EOF) { log_error("Failed to read file \"%s\"", filePath); free(data); return false; } data[*fileSize] = '\0'; *contents = data; return true; } /* * move_file is a utility function to move a file from sourcePath to * destinationPath. It behaves like mv system command. First attempts to move * a file using rename. if it fails with EXDEV error, the function duplicates * the source file with owner and permission information and removes it. 
*/ bool move_file(char *sourcePath, char *destinationPath) { if (strncmp(sourcePath, destinationPath, MAXPGPATH) == 0) { /* nothing to do */ log_warn("Source and destination are the same \"%s\", nothing to move.", sourcePath); return true; } if (!file_exists(sourcePath)) { log_error("Failed to move file, source file \"%s\" does not exist.", sourcePath); return false; } if (file_exists(destinationPath)) { log_error("Failed to move file, destination file \"%s\" already exists.", destinationPath); return false; } /* first try atomic move operation */ if (rename(sourcePath, destinationPath) == 0) { return true; } /* * rename fails with errno = EXDEV when moving file to a different file * system. */ if (errno != EXDEV) { log_error("Failed to move file \"%s\" to \"%s\": %m", sourcePath, destinationPath); return false; } if (!duplicate_file(sourcePath, destinationPath)) { /* specific error is already logged */ log_error("Canceling file move due to errors."); return false; } /* everything is successful we can remove the file */ unlink_file(sourcePath); return true; } /* * duplicate_file is a utility function to duplicate a file from sourcePath to * destinationPath. It reads the contents of the source file and writes to the * destination file. It expects non-existing destination file and does not * copy over if it exists. The function returns true on successful execution. * * Note: the function reads the whole file into memory before copying out. 
*/ bool duplicate_file(char *sourcePath, char *destinationPath) { char *fileContents; long fileSize; struct stat sourceFileStat; if (!read_file(sourcePath, &fileContents, &fileSize)) { /* errors are logged */ return false; } if (file_exists(destinationPath)) { log_error("Failed to duplicate, destination file already exists : %s", destinationPath); return false; } bool foundError = !write_file(fileContents, fileSize, destinationPath); free(fileContents); if (foundError) { /* errors are logged in write_file */ return false; } /* set uid gid and mode */ if (stat(sourcePath, &sourceFileStat) != 0) { log_error("Failed to get ownership and file permissions on \"%s\"", sourcePath); foundError = true; } else { if (chown(destinationPath, sourceFileStat.st_uid, sourceFileStat.st_gid) != 0) { log_error("Failed to set user and group id on \"%s\"", destinationPath); foundError = true; } if (chmod(destinationPath, sourceFileStat.st_mode) != 0) { log_error("Failed to set file permissions on \"%s\"", destinationPath); foundError = true; } } if (foundError) { /* errors are already logged */ unlink_file(destinationPath); return false; } return true; } /* * create_symbolic_link creates a symbolic link to source path. */ bool create_symbolic_link(char *sourcePath, char *targetPath) { if (symlink(sourcePath, targetPath) != 0) { log_error("Failed to create symbolic link to \"%s\": %m", targetPath); return false; } return true; } /* * path_in_same_directory constructs the path for a file with name fileName * that is in the same directory as basePath, which should be an absolute * path. The result is written to destinationPath, which should be at least * MAXPATH in size. 
*/ void path_in_same_directory(const char *basePath, const char *fileName, char *destinationPath) { strlcpy(destinationPath, basePath, MAXPGPATH); get_parent_directory(destinationPath); join_path_components(destinationPath, destinationPath, fileName); } /* From PostgreSQL sources at src/port/path.c */ #ifndef WIN32 #define IS_PATH_VAR_SEP(ch) ((ch) == ':') #else #define IS_PATH_VAR_SEP(ch) ((ch) == ';') #endif /* * search_path_first copies the first entry found in PATH to result. result * should be a buffer of (at least) MAXPGPATH size. * The function returns false and logs an error when it cannot find the command * in PATH. */ bool search_path_first(const char *filename, char *result, int logLevel) { SearchPath paths = { 0 }; if (!search_path(filename, &paths) || paths.found == 0) { log_level(logLevel, "Failed to find %s command in your PATH", filename); return false; } strlcpy(result, paths.matches[0], MAXPGPATH); return true; } /* * Searches all the directories in the PATH environment variable for the given * filename. Returns number of occurrences and each match found with its * fullname, including the given filename, in the given pre-allocated * SearchPath result. */ bool search_path(const char *filename, SearchPath *result) { char pathlist[MAXPATHSIZE] = { 0 }; /* we didn't count nor find anything yet */ result->found = 0; /* Create a copy of pathlist, because we modify it here. */ if (!get_env_copy("PATH", pathlist, sizeof(pathlist))) { /* errors have already been logged */ return false; } char *path = pathlist; while (path != NULL) { char candidate[MAXPGPATH] = { 0 }; char *sep = first_path_var_separator(path); /* split path on current token, null-terminating string at separator */ if (sep != NULL) { *sep = '\0'; } (void) join_path_components(candidate, path, filename); (void) canonicalize_path(candidate); if (file_exists(candidate)) { strlcpy(result->matches[result->found++], candidate, MAXPGPATH); } path = (sep == NULL ? 
NULL : sep + 1); } return true; } /* * search_path_deduplicate_symlinks traverse the SearchPath result obtained by * calling the search_path() function and removes entries that are pointing to * the same binary on-disk. * * In modern debian installations, for instance, we have /bin -> /usr/bin; and * then we might find pg_config both in /bin/pg_config and /usr/bin/pg_config * although it's only been installed once, and both are the same file. * * We use realpath() to deduplicate entries, and keep the entry that is not a * symbolic link. */ bool search_path_deduplicate_symlinks(SearchPath *results, SearchPath *dedup) { /* now re-initialize the target structure dedup */ dedup->found = 0; for (int rIndex = 0; rIndex < results->found; rIndex++) { bool alreadyThere = false; char *currentPath = results->matches[rIndex]; char currentRealPath[PATH_MAX] = { 0 }; if (realpath(currentPath, currentRealPath) == NULL) { log_error("Failed to normalize file name \"%s\": %m", currentPath); return false; } /* add-in the realpath to dedup, unless it's already in there */ for (int dIndex = 0; dIndex < dedup->found; dIndex++) { if (strcmp(dedup->matches[dIndex], currentRealPath) == 0) { alreadyThere = true; log_debug("dedup: skipping \"%s\"", currentPath); break; } } if (!alreadyThere) { int bytesWritten = strlcpy(dedup->matches[dedup->found++], currentRealPath, MAXPGPATH); if (bytesWritten >= MAXPGPATH) { log_error( "Real path \"%s\" is %d bytes long, and pg_autoctl " "is limited to handling paths of %d bytes long, maximum", currentRealPath, (int) strlen(currentRealPath), MAXPGPATH); return false; } } } return true; } /* * unlink_state_file calls unlink(2) on the state file to make sure we don't * leave a lingering state on-disk. */ bool unlink_file(const char *filename) { if (unlink(filename) == -1) { /* if it didn't exist yet, good news! 
*/ if (errno != ENOENT && errno != ENOTDIR) { log_error("Failed to remove file \"%s\": %m", filename); return false; } } return true; } /* * get_program_absolute_path returns the absolute path of the current program * being executed. Note: the shell is responsible to set that in interactive * environments, and when the pg_autoctl binary is in the PATH of the user, * then argv[0] (here pg_autoctl_argv0) is just "pg_autoctl". */ bool set_program_absolute_path(char *program, int size) { #if defined(__APPLE__) int actualSize = _NSGetExecutablePath(program, (uint32_t *) &size); if (actualSize != 0) { log_error("Failed to get absolute path for the pg_autoctl program, " "absolute path requires %d bytes and we support paths up " "to %d bytes only", actualSize, size); return false; } log_debug("Found absolute program: \"%s\"", program); #else /* * On Linux and FreeBSD and Solaris, we can find a symbolic link to our * program and get the information with readlink. Of course the /proc entry * to read is not the same on both systems, so we try several things here. */ bool found = false; char *procEntryCandidates[] = { "/proc/self/exe", /* Linux */ "/proc/curproc/file", /* FreeBSD */ "/proc/self/path/a.out" /* Solaris */ }; int procEntrySize = sizeof(procEntryCandidates) / sizeof(char *); int procEntryIndex = 0; for (procEntryIndex = 0; procEntryIndex < procEntrySize; procEntryIndex++) { if (readlink(procEntryCandidates[procEntryIndex], program, size) != -1) { found = true; log_debug("Found absolute program \"%s\" in \"%s\"", program, procEntryCandidates[procEntryIndex]); } else { /* when the file does not exist, we try our next guess */ if (errno != ENOENT && errno != ENOTDIR) { log_error("Failed to get absolute path for the " "pg_autoctl program: %m"); return false; } } } if (found) { return true; } else { /* * Now either return pg_autoctl_argv0 when that's an absolute filename, * or search for it in the PATH otherwise. 
*/ SearchPath paths = { 0 }; if (pg_autoctl_argv0[0] == '/') { strlcpy(program, pg_autoctl_argv0, size); return true; } if (!search_path(pg_autoctl_argv0, &paths) || paths.found == 0) { log_error("Failed to find \"%s\" in PATH environment", pg_autoctl_argv0); exit(EXIT_CODE_INTERNAL_ERROR); } else { log_debug("Found \"%s\" in PATH at \"%s\"", pg_autoctl_argv0, paths.matches[0]); strlcpy(program, paths.matches[0], size); return true; } } #endif return true; } /* * normalize_filename returns the real path of a given filename that belongs to * an existing file on-disk, resolving symlinks and pruning double-slashes and * other weird constructs. filename and dst are allowed to point to the same * adress. */ bool normalize_filename(const char *filename, char *dst, int size) { /* normalize the path to the configuration file, if it exists */ if (file_exists(filename)) { char realPath[PATH_MAX] = { 0 }; if (realpath(filename, realPath) == NULL) { log_fatal("Failed to normalize file name \"%s\": %m", filename); return false; } if (strlcpy(dst, realPath, size) >= size) { log_fatal("Real path \"%s\" is %d bytes long, and pg_autoctl " "is limited to handling paths of %d bytes long, maximum", realPath, (int) strlen(realPath), size); return false; } } else { char realPath[PATH_MAX] = { 0 }; /* protect against undefined behavior if dst overlaps with filename */ strlcpy(realPath, filename, MAXPGPATH); strlcpy(dst, realPath, MAXPGPATH); } return true; } /* * fformat is a secured down version of pg_fprintf: * * Additional security checks are: * - make sure stream is not null * - make sure fmt is not null * - rely on pg_fprintf Assert() that %s arguments are not null */ int fformat(FILE *stream, const char *fmt, ...) 
{ va_list args; if (stream == NULL || fmt == NULL) { log_error("BUG: fformat is called with a NULL target or format string"); return -1; } va_start(args, fmt); int len = pg_vfprintf(stream, fmt, args); va_end(args); return len; } /* * sformat is a secured down version of pg_snprintf */ int sformat(char *str, size_t count, const char *fmt, ...) { va_list args; if (str == NULL || fmt == NULL) { log_error("BUG: sformat is called with a NULL target or format string"); return -1; } va_start(args, fmt); int len = pg_vsnprintf(str, count, fmt, args); va_end(args); if (len >= count) { log_error("BUG: sformat needs %d bytes to expend format string \"%s\", " "and a target string of %lu bytes only has been given.", len, fmt, (unsigned long) count); } return len; } /* * set_ps_title sets the process title seen in ps/top and friends, truncating * if there is not enough space, rather than causing memory corruption. * * Inspired / stolen from Postgres code src/backend/utils/misc/ps_status.c with * most of the portability bits removed. At the moment we prefer simple code * that works on few targets to highly portable code. */ void init_ps_buffer(int argc, char **argv) { #if defined(__linux__) || defined(__darwin__) char *end_of_area = NULL; int i; /* * check for contiguous argv strings */ for (i = 0; i < argc; i++) { if (i == 0 || end_of_area + 1 == argv[i]) { end_of_area = argv[i] + strlen(argv[i]); /* lgtm[cpp/tainted-arithmetic] */ } } if (end_of_area == NULL) /* probably can't happen? */ { ps_buffer = NULL; ps_buffer_size = 0; return; } ps_buffer = argv[0]; last_status_len = ps_buffer_size = end_of_area - argv[0]; /* lgtm[cpp/tainted-arithmetic] */ #else ps_buffer = NULL; ps_buffer_size = 0; return; #endif } /* * set_ps_title sets our process name visible in ps/top/pstree etc. 
*/ void set_ps_title(const char *title) { if (ps_buffer == NULL) { /* noop */ return; } int n = sformat(ps_buffer, ps_buffer_size, "%s", title); /* pad our process title string */ for (size_t i = n; i < ps_buffer_size; i++) { *(ps_buffer + i) = '\0'; } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/file_utils.h000066400000000000000000000044241414244367200230710ustar00rootroot00000000000000/* * src/bin/pg_autoctl/file_utils.h * Utility functions for reading and writing files * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef FILE_UTILS_H #define FILE_UTILS_H #include #include "postgres_fe.h" #include #if defined(__APPLE__) #define ST_MTIME_S(st) ((int64_t) st.st_mtimespec.tv_sec) #else #define ST_MTIME_S(st) ((int64_t) st.st_mtime) #endif /* * In order to avoid dynamic memory allocations and tracking when searching the * PATH environment, we pre-allocate 1024 paths entries. That should be way * more than enough for all situations, and only costs 1024*1024 = 1MB of * memory. 
*/ typedef struct SearchPath { int found; char matches[1024][MAXPGPATH]; } SearchPath; bool file_exists(const char *filename); bool directory_exists(const char *path); bool ensure_empty_dir(const char *dirname, int mode); FILE * fopen_with_umask(const char *filePath, const char *modes, int flags, mode_t umask); FILE * fopen_read_only(const char *filePath); bool write_file(char *data, long fileSize, const char *filePath); bool append_to_file(char *data, long fileSize, const char *filePath); bool read_file(const char *filePath, char **contents, long *fileSize); bool read_file_if_exists(const char *filePath, char **contents, long *fileSize); bool move_file(char *sourcePath, char *destinationPath); bool duplicate_file(char *sourcePath, char *destinationPath); bool create_symbolic_link(char *sourcePath, char *targetPath); void path_in_same_directory(const char *basePath, const char *fileName, char *destinationPath); bool search_path_first(const char *filename, char *result, int logLevel); bool search_path(const char *filename, SearchPath *result); bool search_path_deduplicate_symlinks(SearchPath *results, SearchPath *dedup); bool unlink_file(const char *filename); bool set_program_absolute_path(char *program, int size); bool normalize_filename(const char *filename, char *dst, int size); void init_ps_buffer(int argc, char **argv); void set_ps_title(const char *title); int fformat(FILE *stream, const char *fmt, ...) __attribute__((format(printf, 2, 3))); int sformat(char *str, size_t count, const char *fmt, ...) __attribute__((format(printf, 3, 4))); #endif /* FILE_UTILS_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/formation_config.h000066400000000000000000000012771414244367200242600ustar00rootroot00000000000000/* * src/bin/pg_autoctl/formation_config.h * Formation configuration data structure and function definitions for cli * commands targeting formations. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #ifndef FORMATION_CONFIG_H #define FORMATION_CONFIG_H #include "pgctl.h" typedef struct FormationConfig { /* pg_auto_failover formation setup */ char monitor_pguri[MAXCONNINFO]; char formation[NAMEDATALEN]; char formationKind[NAMEDATALEN]; char dbname[NAMEDATALEN]; bool formationHasSecondary; int numberSyncStandbys; /* PostgreSQL setup */ PostgresSetup pgSetup; } FormationConfig; #endif /*FORMATION_CONFIG_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/fsm.c000066400000000000000000000542241414244367200215150ustar00rootroot00000000000000/* * src/bin/pg_autoctl/fsm.c * Finite State Machine implementation for pg_autoctl. * * The state machine transitions are decided by the pg_auto_failover monitor * and implemented on the local Postgres node by the pg_autoctl service. This * is the client-side implementation. We refer to this service as the "keeper", * it is the local agent that executes the pg_auto_failover decisions. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include "defaults.h" #include "keeper.h" #include "pgctl.h" #include "fsm.h" #include "log.h" #include "monitor.h" #include "primary_standby.h" #include "state.h" /* * Comments displayed in the logs when state changes. 
*/ #define COMMENT_INIT_TO_SINGLE \ "Start as a single node" #define COMMENT_PRIMARY_TO_SINGLE \ "Other node was forcibly removed, now single" #define COMMENT_DEMOTED_TO_SINGLE \ "Was demoted after a failure, " \ "but secondary was forcibly removed" #define COMMENT_LOST_PRIMARY \ "Primary was forcibly removed" #define COMMENT_REPLICATION_TO_SINGLE \ "Went down to force the primary to time out, " \ "but then it was removed" #define COMMENT_SINGLE_TO_WAIT_PRIMARY \ "A new secondary was added" #define COMMENT_PRIMARY_TO_WAIT_PRIMARY \ "Secondary became unhealthy" #define COMMENT_PRIMARY_TO_JOIN_PRIMARY \ "A new secondary was added" #define COMMENT_PRIMARY_TO_DRAINING \ "A failover occurred, stopping writes " #define COMMENT_PRIMARY_TO_PREPARE_MAINTENANCE \ "Promoting the standby to enable maintenance on the " \ "primary, stopping Postgres " #define COMMENT_PRIMARY_TO_MAINTENANCE \ "Setting up Postgres in standby mode for maintenance operations" #define COMMENT_PRIMARY_TO_MAINTENANCE_PROMOTE_SECONDARY \ "Promoting the standby to enable maintenance on the primary" #define COMMENT_PRIMARY_TO_DEMOTED \ "A failover occurred, no longer primary" #define COMMENT_DRAINING_TO_DEMOTED \ "Demoted after a failover, no longer primary" #define COMMENT_DRAINING_TO_DEMOTE_TIMEOUT \ "Secondary confirms it’s receiving no more writes" #define COMMENT_DEMOTE_TIMEOUT_TO_DEMOTED \ "Demote timeout expired" #define COMMENT_STOP_REPLICATION_TO_WAIT_PRIMARY \ "Confirmed promotion with the monitor" #define COMMENT_WAIT_PRIMARY_TO_PRIMARY \ "A healthy secondary appeared" #define COMMENT_JOIN_PRIMARY_TO_PRIMARY \ "A healthy secondary appeared" #define COMMENT_DEMOTE_TO_PRIMARY \ "Detected a network partition, " \ "but monitor didn't do failover" #define COMMENT_WAIT_STANDBY_TO_CATCHINGUP \ "The primary is now ready to accept a standby" #define COMMENT_DEMOTED_TO_CATCHINGUP \ "A new primary is available. " \ "First, try to rewind. If that fails, do a pg_basebackup." 
#define COMMENT_SECONDARY_TO_CATCHINGUP \ "Failed to report back to the monitor, " \ "not eligible for promotion" #define COMMENT_CATCHINGUP_TO_SECONDARY \ "Convinced the monitor that I'm up and running, " \ "and eligible for promotion again" #define COMMENT_SECONDARY_TO_PREP_PROMOTION \ "Stop traffic to primary, " \ "wait for it to finish draining." #define COMMENT_PROMOTION_TO_STOP_REPLICATION \ "Prevent against split-brain situations." #define COMMENT_INIT_TO_WAIT_STANDBY \ "Start following a primary" #define COMMENT_SECONARY_TO_WAIT_STANDBY \ "Registering to a new monitor" #define COMMENT_SECONDARY_TO_WAIT_MAINTENANCE \ "Waiting for the primary to disable sync replication before " \ "going to maintenance." #define COMMENT_SECONDARY_TO_MAINTENANCE \ "Suspending standby for manual maintenance." #define COMMENT_MAINTENANCE_TO_CATCHINGUP \ "Restarting standby after manual maintenance is done." #define COMMENT_BLOCKED_WRITES \ "Promoting a Citus Worker standby after having blocked writes " \ "from the coordinator." #define COMMENT_PRIMARY_TO_APPLY_SETTINGS \ "Apply new pg_auto_failover settings (synchronous_standby_names)" #define COMMENT_APPLY_SETTINGS_TO_PRIMARY \ "Back to primary state after having applied new pg_auto_failover settings" #define COMMENT_SECONDARY_TO_REPORT_LSN \ "Reporting the last write-ahead log location received" #define COMMENT_DRAINING_TO_REPORT_LSN \ "Reporting the last write-ahead log location after draining" #define COMMENT_DEMOTED_TO_REPORT_LSN \ "Reporting the last write-ahead log location after being demoted" #define COMMENT_REPORT_LSN_TO_PREP_PROMOTION \ "Stop traffic to primary, " \ "wait for it to finish draining." 
#define COMMENT_REPORT_LSN_TO_FAST_FORWARD \ "Fetching missing WAL bits from another standby before promotion" #define COMMENT_REPORT_LSN_TO_SINGLE \ "There is no other node anymore, promote this node" #define COMMENT_FOLLOW_NEW_PRIMARY \ "Switch replication to the new primary" #define COMMENT_REPORT_LSN_TO_JOIN_SECONDARY \ "A failover candidate has been selected, stop replication" #define COMMENT_JOIN_SECONDARY_TO_SECONDARY \ "Failover is done, we have a new primary to follow" #define COMMENT_FAST_FORWARD_TO_PREP_PROMOTION \ "Got the missing WAL bytes, promoted" #define COMMENT_INIT_TO_REPORT_LSN \ "Creating a new node from a standby node that is not a candidate." #define COMMENT_DROPPED_TO_REPORT_LSN \ "This node is being reinitialized after having been dropped" #define COMMENT_ANY_TO_DROPPED \ "This node is being dropped from the monitor" /* *INDENT-OFF* */ /* * The full 2-nodes state machine contains states that are expected only when * the node is a primary, and some only when the node is a standby. Each node * is going to change role in its life-cycle, so having the whole life-cycle in * a single FSM makes sense. * * The FSM is normally driven by an external node, the monitor. See design * docs. */ KeeperFSMTransition KeeperFSM[] = { /* * CURRENT_STATE, ASSIGNED_STATE, COMMENT, TRANSTION_FUNCTION */ /* * Started as a single, no nothing */ { INIT_STATE, SINGLE_STATE, COMMENT_INIT_TO_SINGLE, &fsm_init_primary }, { DROPPED_STATE, SINGLE_STATE, COMMENT_INIT_TO_SINGLE, &fsm_init_primary }, { DROPPED_STATE, REPORT_LSN_STATE, COMMENT_DROPPED_TO_REPORT_LSN, &fsm_init_from_standby }, /* * The previous implementation has a transition from any state to the INIT * state that ensures PostgreSQL is down, but I can't quite figure out what * role the INIT state plays exactly in there. 
* * {ANY_STATE, INIT_STATE, "Revert to initial state", &fsm_stop_postgres}, */ /* * other node(s) was forcibly removed, now single */ { PRIMARY_STATE, SINGLE_STATE, COMMENT_PRIMARY_TO_SINGLE, &fsm_disable_replication }, { WAIT_PRIMARY_STATE, SINGLE_STATE, COMMENT_PRIMARY_TO_SINGLE, &fsm_disable_replication }, { JOIN_PRIMARY_STATE, SINGLE_STATE, COMMENT_PRIMARY_TO_SINGLE, &fsm_disable_replication }, /* * failover occurred, primary -> draining/demoted */ { PRIMARY_STATE, DRAINING_STATE, COMMENT_PRIMARY_TO_DRAINING, &fsm_stop_postgres }, { DRAINING_STATE, DEMOTED_STATE, COMMENT_DRAINING_TO_DEMOTED, &fsm_stop_postgres }, { PRIMARY_STATE, DEMOTED_STATE, COMMENT_PRIMARY_TO_DEMOTED, &fsm_stop_postgres }, { PRIMARY_STATE, DEMOTE_TIMEOUT_STATE, COMMENT_PRIMARY_TO_DEMOTED, &fsm_stop_postgres }, { JOIN_PRIMARY_STATE, DRAINING_STATE, COMMENT_PRIMARY_TO_DRAINING, &fsm_stop_postgres }, { JOIN_PRIMARY_STATE, DEMOTED_STATE, COMMENT_PRIMARY_TO_DEMOTED, &fsm_stop_postgres }, { JOIN_PRIMARY_STATE, DEMOTE_TIMEOUT_STATE, COMMENT_PRIMARY_TO_DEMOTED, &fsm_stop_postgres }, { APPLY_SETTINGS_STATE, DRAINING_STATE, COMMENT_PRIMARY_TO_DRAINING, &fsm_stop_postgres }, { APPLY_SETTINGS_STATE, DEMOTED_STATE, COMMENT_PRIMARY_TO_DEMOTED, &fsm_stop_postgres }, { APPLY_SETTINGS_STATE, DEMOTE_TIMEOUT_STATE, COMMENT_PRIMARY_TO_DEMOTED, &fsm_stop_postgres }, /* * primary is put to maintenance */ { PRIMARY_STATE, PREPARE_MAINTENANCE_STATE, COMMENT_PRIMARY_TO_PREPARE_MAINTENANCE, &fsm_stop_postgres_for_primary_maintenance }, { PREPARE_MAINTENANCE_STATE, MAINTENANCE_STATE, COMMENT_PRIMARY_TO_MAINTENANCE, &fsm_stop_postgres_and_setup_standby }, { PRIMARY_STATE, MAINTENANCE_STATE, COMMENT_PRIMARY_TO_MAINTENANCE, &fsm_stop_postgres_for_primary_maintenance }, /* * was demoted, need to be dead now. 
*/ { DRAINING_STATE, DEMOTE_TIMEOUT_STATE, COMMENT_DRAINING_TO_DEMOTE_TIMEOUT, &fsm_stop_postgres }, { DEMOTE_TIMEOUT_STATE, DEMOTED_STATE, COMMENT_DEMOTE_TIMEOUT_TO_DEMOTED, &fsm_stop_postgres}, /* * wait_primary stops reporting, is (supposed) dead now */ { WAIT_PRIMARY_STATE, DEMOTED_STATE, COMMENT_PRIMARY_TO_DEMOTED, &fsm_stop_postgres }, /* * was demoted after a failure, but standby was forcibly removed */ { DEMOTED_STATE, SINGLE_STATE, COMMENT_DEMOTED_TO_SINGLE, &fsm_resume_as_primary }, { DEMOTE_TIMEOUT_STATE, SINGLE_STATE, COMMENT_DEMOTED_TO_SINGLE, &fsm_resume_as_primary }, { DRAINING_STATE, SINGLE_STATE, COMMENT_DEMOTED_TO_SINGLE, &fsm_resume_as_primary }, /* * primary was forcibly removed */ { SECONDARY_STATE, SINGLE_STATE, COMMENT_LOST_PRIMARY, &fsm_promote_standby }, { CATCHINGUP_STATE, SINGLE_STATE, COMMENT_LOST_PRIMARY, &fsm_promote_standby }, { PREP_PROMOTION_STATE, SINGLE_STATE, COMMENT_LOST_PRIMARY, &fsm_promote_standby }, /* * went down to force the primary to time out, but then it was removed */ { STOP_REPLICATION_STATE, SINGLE_STATE, COMMENT_REPLICATION_TO_SINGLE, &fsm_promote_standby }, /* * all states should lead to SINGLE, including REPORT_LSN */ { REPORT_LSN_STATE, SINGLE_STATE, COMMENT_REPORT_LSN_TO_SINGLE, &fsm_promote_standby }, /* * On the Primary, wait for a standby to be ready: WAIT_PRIMARY */ { SINGLE_STATE, WAIT_PRIMARY_STATE, COMMENT_SINGLE_TO_WAIT_PRIMARY, &fsm_prepare_replication }, { PRIMARY_STATE, JOIN_PRIMARY_STATE, COMMENT_PRIMARY_TO_JOIN_PRIMARY, &fsm_prepare_replication }, { PRIMARY_STATE, WAIT_PRIMARY_STATE, COMMENT_PRIMARY_TO_WAIT_PRIMARY, &fsm_disable_sync_rep }, { JOIN_PRIMARY_STATE, WAIT_PRIMARY_STATE, COMMENT_PRIMARY_TO_WAIT_PRIMARY, &fsm_disable_sync_rep }, { WAIT_PRIMARY_STATE, JOIN_PRIMARY_STATE, COMMENT_PRIMARY_TO_JOIN_PRIMARY, &fsm_prepare_replication }, /* * Situation is getting back to normal on the primary */ { WAIT_PRIMARY_STATE, PRIMARY_STATE, COMMENT_WAIT_PRIMARY_TO_PRIMARY, &fsm_enable_sync_rep }, { 
JOIN_PRIMARY_STATE, PRIMARY_STATE, COMMENT_JOIN_PRIMARY_TO_PRIMARY, &fsm_enable_sync_rep }, { DEMOTE_TIMEOUT_STATE, PRIMARY_STATE, COMMENT_DEMOTE_TO_PRIMARY, &fsm_start_postgres }, /* * The primary is now ready to accept a standby, we're the standby */ { WAIT_STANDBY_STATE, CATCHINGUP_STATE, COMMENT_WAIT_STANDBY_TO_CATCHINGUP, &fsm_init_standby }, { DEMOTED_STATE, CATCHINGUP_STATE, COMMENT_DEMOTED_TO_CATCHINGUP, &fsm_rewind_or_init }, { SECONDARY_STATE, CATCHINGUP_STATE, COMMENT_SECONDARY_TO_CATCHINGUP, &fsm_follow_new_primary }, /* * We're asked to be a standby. */ { CATCHINGUP_STATE, SECONDARY_STATE, COMMENT_CATCHINGUP_TO_SECONDARY, &fsm_prepare_for_secondary }, /* * The standby is asked to prepare its own promotion */ { SECONDARY_STATE, PREP_PROMOTION_STATE, COMMENT_SECONDARY_TO_PREP_PROMOTION, &fsm_prepare_standby_for_promotion }, { CATCHINGUP_STATE, PREP_PROMOTION_STATE, COMMENT_SECONDARY_TO_PREP_PROMOTION, &fsm_prepare_standby_for_promotion }, /* * Forcefully stop replication by stopping the server. */ { PREP_PROMOTION_STATE, STOP_REPLICATION_STATE, COMMENT_PROMOTION_TO_STOP_REPLICATION, &fsm_stop_replication }, /* * finish the promotion */ { STOP_REPLICATION_STATE, WAIT_PRIMARY_STATE, COMMENT_STOP_REPLICATION_TO_WAIT_PRIMARY, &fsm_promote_standby_to_primary }, { PREP_PROMOTION_STATE, WAIT_PRIMARY_STATE, COMMENT_BLOCKED_WRITES, &fsm_promote_standby }, /* * Just wait until primary is ready */ { INIT_STATE, WAIT_STANDBY_STATE, COMMENT_INIT_TO_WAIT_STANDBY, NULL }, { DROPPED_STATE, WAIT_STANDBY_STATE, COMMENT_INIT_TO_WAIT_STANDBY, NULL }, /* * When losing a monitor and then connecting to a new monitor as a * secondary, we need to be able to follow the init sequence again. */ { SECONDARY_STATE, WAIT_STANDBY_STATE, COMMENT_SECONARY_TO_WAIT_STANDBY, NULL }, /* * In case of maintenance of the standby server, we stop PostgreSQL. 
*/ { SECONDARY_STATE, WAIT_MAINTENANCE_STATE, COMMENT_SECONDARY_TO_WAIT_MAINTENANCE, NULL }, { CATCHINGUP_STATE, WAIT_MAINTENANCE_STATE, COMMENT_SECONDARY_TO_WAIT_MAINTENANCE, NULL }, { SECONDARY_STATE, MAINTENANCE_STATE, COMMENT_SECONDARY_TO_MAINTENANCE, &fsm_start_maintenance_on_standby }, { CATCHINGUP_STATE, MAINTENANCE_STATE, COMMENT_SECONDARY_TO_MAINTENANCE, &fsm_start_maintenance_on_standby }, { WAIT_MAINTENANCE_STATE, MAINTENANCE_STATE, COMMENT_SECONDARY_TO_MAINTENANCE, &fsm_start_maintenance_on_standby }, { MAINTENANCE_STATE, CATCHINGUP_STATE, COMMENT_MAINTENANCE_TO_CATCHINGUP, &fsm_restart_standby }, { PREPARE_MAINTENANCE_STATE, CATCHINGUP_STATE, COMMENT_MAINTENANCE_TO_CATCHINGUP, &fsm_restart_standby }, /* * Applying new replication/cluster settings (per node replication quorum, * candidate priorities, or per formation number_sync_standbys) means we * have to fetch the new value for synchronous_standby_names from the * monitor. */ { PRIMARY_STATE, APPLY_SETTINGS_STATE, COMMENT_PRIMARY_TO_APPLY_SETTINGS, NULL }, { WAIT_PRIMARY_STATE, APPLY_SETTINGS_STATE, COMMENT_PRIMARY_TO_APPLY_SETTINGS, NULL }, { APPLY_SETTINGS_STATE, PRIMARY_STATE, COMMENT_APPLY_SETTINGS_TO_PRIMARY, &fsm_enable_sync_rep }, { APPLY_SETTINGS_STATE, SINGLE_STATE, COMMENT_PRIMARY_TO_SINGLE, &fsm_disable_replication }, { APPLY_SETTINGS_STATE, WAIT_PRIMARY_STATE, COMMENT_PRIMARY_TO_WAIT_PRIMARY, &fsm_disable_sync_rep }, { APPLY_SETTINGS_STATE, JOIN_PRIMARY_STATE, COMMENT_PRIMARY_TO_JOIN_PRIMARY, &fsm_prepare_replication }, /* * In case of multiple standbys, failover begins with reporting current LSN */ { SECONDARY_STATE, REPORT_LSN_STATE, COMMENT_SECONDARY_TO_REPORT_LSN, &fsm_report_lsn }, { CATCHINGUP_STATE, REPORT_LSN_STATE, COMMENT_SECONDARY_TO_REPORT_LSN, &fsm_report_lsn }, { MAINTENANCE_STATE, REPORT_LSN_STATE, COMMENT_SECONDARY_TO_REPORT_LSN, &fsm_report_lsn }, { PREPARE_MAINTENANCE_STATE, REPORT_LSN_STATE, COMMENT_SECONDARY_TO_REPORT_LSN, &fsm_report_lsn }, { REPORT_LSN_STATE, 
PREP_PROMOTION_STATE, COMMENT_REPORT_LSN_TO_PREP_PROMOTION, &fsm_prepare_standby_for_promotion }, { REPORT_LSN_STATE, FAST_FORWARD_STATE, COMMENT_REPORT_LSN_TO_FAST_FORWARD, &fsm_fast_forward }, { FAST_FORWARD_STATE, PREP_PROMOTION_STATE, COMMENT_FAST_FORWARD_TO_PREP_PROMOTION, &fsm_cleanup_as_primary }, { REPORT_LSN_STATE, JOIN_SECONDARY_STATE, COMMENT_REPORT_LSN_TO_JOIN_SECONDARY, &fsm_checkpoint_and_stop_postgres }, { REPORT_LSN_STATE, SECONDARY_STATE, COMMENT_REPORT_LSN_TO_JOIN_SECONDARY, &fsm_follow_new_primary }, { JOIN_SECONDARY_STATE, SECONDARY_STATE, COMMENT_JOIN_SECONDARY_TO_SECONDARY, &fsm_follow_new_primary }, /* * When an old primary gets back online and reaches draining/draining, if a * failover is on-going then have it join the selection process. */ { DRAINING_STATE, REPORT_LSN_STATE, COMMENT_DRAINING_TO_REPORT_LSN, &fsm_report_lsn_and_drop_replication_slots }, { DEMOTED_STATE, REPORT_LSN_STATE, COMMENT_DEMOTED_TO_REPORT_LSN, &fsm_report_lsn_and_drop_replication_slots }, /* * When adding a new node and there is no primary, but there are existing * nodes that are not candidates for failover. */ { INIT_STATE, REPORT_LSN_STATE, COMMENT_INIT_TO_REPORT_LSN, &fsm_init_from_standby }, /* * Dropping a node is a two-step process */ { ANY_STATE, DROPPED_STATE, COMMENT_ANY_TO_DROPPED, &fsm_drop_node }, /* * This is the end, my friend. */ { NO_STATE, NO_STATE, NULL, NULL }, }; /* *INDENT-ON* */ /* * keeper_fsm_step implements the logic to perform a single step * of the state machine according to the goal state returned by * the monitor. 
*/ bool keeper_fsm_step(Keeper *keeper) { KeeperConfig *config = &(keeper->config); KeeperStateData *keeperState = &(keeper->state); Monitor *monitor = &(keeper->monitor); LocalPostgresServer *postgres = &(keeper->postgres); MonitorAssignedState assignedState = { 0 }; /* * Update our in-memory representation of PostgreSQL state, ignore errors * as in the main loop: we continue with default WAL lag of -1 and an empty * string for pgsrSyncState. */ (void) keeper_update_pg_state(keeper, LOG_DEBUG); log_debug("Calling node_active for node %s/%d/%d with current state: " "PostgreSQL is running is %s, " "sync_state is \"%s\", " "latest WAL LSN is %s.", config->formation, keeperState->current_node_id, keeperState->current_group, postgres->pgIsRunning ? "true" : "false", postgres->pgsrSyncState, postgres->currentLSN); if (!monitor_node_active(monitor, config->formation, keeperState->current_node_id, keeperState->current_group, keeperState->current_role, postgres->pgIsRunning, postgres->postgresSetup.control.timeline_id, postgres->currentLSN, postgres->pgsrSyncState, &assignedState)) { log_fatal("Failed to get the goal state from the monitor, " "see above for details"); return false; } /* * Assign the new state. We skip writing the state file here since we can * (and should) always get the assigned state from the monitor. */ keeperState->assigned_role = assignedState.state; /* roll the state machine forward */ if (keeperState->assigned_role != keeperState->current_role) { if (!keeper_fsm_reach_assigned_state(keeper)) { /* errors have already been logged */ return false; } } else { /* * Now that we know if PostgreSQL is running or not, maybe restart it, * or maybe shut it down, depending on what the current state expects. */ if (!keeper_ensure_current_state(keeper)) { log_warn("pg_autoctl keeper failed to ensure current state \"%s\": " "PostgreSQL %s running", NodeStateToString(keeperState->current_role), postgres->pgIsRunning ? 
"is" : "is not"); } } /* update state file */ if (!keeper_update_state(keeper, assignedState.nodeId, assignedState.groupId, assignedState.state, true)) { log_error("Failed to write keepers state file, see above for details"); return false; } return true; } /* * keeper_fsm_reach_assigned_state uses the KeeperFSM to drive a transition * from keeper->state->current_role to keeper->state->assigned_role, when * that's supported. */ bool keeper_fsm_reach_assigned_state(Keeper *keeper) { int transitionIndex = 0; KeeperStateData *keeperState = &(keeper->state); KeeperFSMTransition transition = KeeperFSM[0]; if (keeperState->current_role == keeperState->assigned_role) { log_debug("Current state and Goal state are the same (\"%s\").", NodeStateToString(keeperState->current_role)); return true; } while (transition.current != NO_STATE) { if (state_matches(transition.current, keeperState->current_role) && state_matches(transition.assigned, keeperState->assigned_role)) { bool ret = false; /* avoid logging "#any state#" to the user */ if (transition.current != ANY_STATE) { log_info("FSM transition from \"%s\" to \"%s\"%s%s", NodeStateToString(transition.current), NodeStateToString(transition.assigned), transition.comment ? ": " : "", transition.comment ? transition.comment : ""); } else { log_info("FSM transition to \"%s\"%s%s", NodeStateToString(transition.assigned), transition.comment ? ": " : "", transition.comment ? transition.comment : ""); } if (transition.transitionFunction) { ret = (*transition.transitionFunction)(keeper); log_debug("Transition function returned: %s", ret ? 
"true" : "false"); } else { ret = true; log_debug("No transition function, assigning new state"); } if (ret) { keeperState->current_role = keeperState->assigned_role; log_info("Transition complete: current state is now \"%s\"", NodeStateToString(keeperState->current_role)); } else { /* avoid logging "#any state#" to the user */ if (transition.current != ANY_STATE) { log_error("Failed to transition from state \"%s\" " "to state \"%s\", see above.", NodeStateToString(transition.current), NodeStateToString(transition.assigned)); } else { log_error("Failed to transition to state \"%s\", see above.", NodeStateToString(transition.assigned)); } } return ret; } transition = KeeperFSM[++transitionIndex]; } /* * we didn't find a transition */ log_fatal("pg_autoctl does not know how to reach state \"%s\" from \"%s\"", NodeStateToString(keeperState->assigned_role), NodeStateToString(keeperState->current_role)); return false; } /* * print_reachable_states shows the list of states we can reach using the FSM * transitions from KeeperState.current_role. */ void print_reachable_states(KeeperStateData *keeperState) { int transitionIndex = 0; bool header = false; KeeperFSMTransition transition = KeeperFSM[0]; log_debug("print_reachable_states: %s", NodeStateToString(keeperState->current_role)); while (transition.current != NO_STATE) { if (state_matches(transition.current, keeperState->current_role)) { if (!header) { fformat(stdout, "%20s | %20s | %s\n", "Current", "Reachable", "Comment"); fformat(stdout, "%20s-+-%20s-+-%s\n", "--------------------", "--------------------", "--------------------"); header = true; } fformat(stdout, "%20s | %20s | %s\n", NodeStateToString(transition.current), NodeStateToString(transition.assigned), transition.comment); } transition = KeeperFSM[++transitionIndex]; } } /* * print_fsm_for_graphviz outputs the program used by graphviz to draw a visual * representation of our state machine. 
* * pg_autoctl do fsm gv | dot -Tpng > fsm.png */ void print_fsm_for_graphviz() { KeeperFSMTransition transition = KeeperFSM[0]; int transitionIndex = 0; fformat( stdout, "digraph finite_state_machine\n" "{\n" " size=\"12\"\n" " ratio=\"fill\"\n" " node [shape = doubleoctagon, style=filled, color=\"bisque1\"]; init primary secondary; \n" " node [shape = octagon, style=filled color=\"bisque3\"]; \n"); while (transition.current != NO_STATE) { fformat(stdout, " %s -> %s [ label = \"%s\" ];\n", NodeStateToString(transition.current), NodeStateToString(transition.assigned), transition.comment); transition = KeeperFSM[++transitionIndex]; } fformat(stdout, "}\n"); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/fsm.h000066400000000000000000000050701414244367200215150ustar00rootroot00000000000000/* * src/bin/pg_autoctl/fsm.h * Finite State Machine implementation for pg_autoctl. * * Handling of the Finite State Machine driving the pg_autoctl Keeper. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #ifndef KEEPER_FSM_H #define KEEPER_FSM_H #include "keeper.h" #include "keeper_config.h" #include "monitor.h" #include "primary_standby.h" #include "state.h" /* * Each FSM entry is a transition from a current state to the next */ typedef bool (*ReachAssignedStateFunction)(Keeper *keeper); /* defines a possible transition in the FSM */ typedef struct KeeperFSMTransition { NodeState current; NodeState assigned; const char *comment; ReachAssignedStateFunction transitionFunction; } KeeperFSMTransition; /* src/bin/pg_autoctl/fsm.c */ extern KeeperFSMTransition KeeperFSM[]; bool fsm_init_primary(Keeper *keeper); bool fsm_prepare_replication(Keeper *keeper); bool fsm_disable_replication(Keeper *keeper); bool fsm_resume_as_primary(Keeper *keeper); bool fsm_rewind_or_init(Keeper *keeper); bool fsm_prepare_for_secondary(Keeper *keeper); bool fsm_init_standby(Keeper *keeper); bool fsm_promote_standby(Keeper *keeper); bool fsm_prepare_standby_for_promotion(Keeper *keeper); bool fsm_promote_standby_to_primary(Keeper *keeper); bool fsm_promote_standby_to_single(Keeper *keeper); bool fsm_stop_replication(Keeper *keeper); bool fsm_enable_sync_rep(Keeper *keeper); bool fsm_disable_sync_rep(Keeper *keeper); bool fsm_apply_settings(Keeper *keeper); bool fsm_start_postgres(Keeper *keeper); bool fsm_stop_postgres(Keeper *keeper); bool fsm_stop_postgres_for_primary_maintenance(Keeper *keeper); bool fsm_stop_postgres_and_setup_standby(Keeper *keeper); bool fsm_checkpoint_and_stop_postgres(Keeper *keeper); bool fsm_start_maintenance_on_standby(Keeper *keeper); bool fsm_restart_standby(Keeper *keeper); bool fsm_report_lsn(Keeper *keeper); bool fsm_report_lsn_and_drop_replication_slots(Keeper *keeper); bool fsm_fast_forward(Keeper *keeper); bool fsm_prepare_cascade(Keeper *keeper); bool fsm_follow_new_primary(Keeper *keeper); bool fsm_cleanup_as_primary(Keeper *keeper); bool fsm_init_from_standby(Keeper *keeper); bool fsm_drop_node(Keeper *keeper); /* * Extra helpers. 
*/ bool prepare_replication(Keeper *keeper, NodeState otherNodeState); /* * Generic API to use the previous definitions. */ void print_reachable_states(KeeperStateData *keeperState); void print_fsm_for_graphviz(void); bool keeper_fsm_step(Keeper *keeper); bool keeper_fsm_reach_assigned_state(Keeper *keeper); #endif /* KEEPER_FSM_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/fsm_transition.c000066400000000000000000001263271414244367200237730ustar00rootroot00000000000000/* * src/bin/pg_autoctl/fsm_transition.c * Implementation of transitions in the keeper state machine * * To move from a current state to a goal state, the pg_autoctl state machine * will call the functions defined in this file, which are referenced from * fsm.c * * Every transition must be idempotent such that it can safely be repeated * until it succeeds. * * As the keeper could fail or be interrupted in-flight, it's important that * every transition can be tried again (is idempotent). When interrupted (by * a bug or a signal, user interrupt or system reboot), the current and * assigned roles have not changed and on the next keeper's start the FSM * will kick in a call the transition that failed again. The transition might * have successfully implemented the first parts of its duties... and we must * not fail because of that. Idempotency is achieved by only calling * idempotent subroutines or checking whether the goal of the subroutine * (e.g. "postgres is promoted") has been achieved already. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include "defaults.h" #include "env_utils.h" #include "pgctl.h" #include "fsm.h" #include "keeper.h" #include "keeper_pg_init.h" #include "log.h" #include "monitor.h" #include "pghba.h" #include "primary_standby.h" #include "state.h" static bool fsm_init_standby_from_upstream(Keeper *keeper); /* * fsm_init_primary initializes the postgres server as primary. 
* * This function actually covers the transition from INIT to SINGLE. * * pg_ctl initdb (if necessary) * && create database + create extension (if necessary) * && start_postgres * && promote_standby (if applicable) * && add_default_settings * && create_monitor_user * && create_replication_user */ bool fsm_init_primary(Keeper *keeper) { KeeperConfig *config = &(keeper->config); LocalPostgresServer *postgres = &(keeper->postgres); PGSQL *pgsql = &(postgres->sqlClient); bool inRecovery = false; KeeperStateInit *initState = &(keeper->initState); PostgresSetup *pgSetup = &(postgres->postgresSetup); bool postgresInstanceExists = pg_setup_pgdata_exists(pgSetup); log_info("Initialising postgres as a primary"); /* * When initialializing the local node on-top of an empty (or non-existing) * PGDATA directory, now is the time to `pg_ctl initdb`. */ if (!keeper_init_state_read(initState, config->pathnames.init)) { log_error("Failed to read init state file \"%s\", which is required " "for the transition from INIT to SINGLE.", config->pathnames.init); return false; } /* * When initState is PRE_INIT_STATE_RUNNING, double check that Postgres is * still running. After all the end-user could just stop Postgres and then * give the install to us. We ought to support that. */ if (initState->pgInitState >= PRE_INIT_STATE_RUNNING) { if (!keeper_init_state_discover(initState, pgSetup, keeper->config.pathnames.init)) { /* errors have already been logged */ return false; } /* did the user try again after having stopped Postgres maybe? 
*/ if (initState->pgInitState < PRE_INIT_STATE_RUNNING) { log_info("PostgreSQL state has changed since registration time: %s", PreInitPostgreInstanceStateToString(initState->pgInitState)); } } bool pgInstanceIsOurs = initState->pgInitState == PRE_INIT_STATE_EMPTY || initState->pgInitState == PRE_INIT_STATE_EXISTS; if (initState->pgInitState == PRE_INIT_STATE_EMPTY && !postgresInstanceExists) { Monitor *monitor = &(keeper->monitor); PostgresSetup newPgSetup = { 0 }; bool missingPgdataIsOk = false; bool postgresNotRunningIsOk = true; if (!pg_ctl_initdb(pgSetup->pg_ctl, pgSetup->pgdata)) { log_fatal("Failed to initialize a PostgreSQL instance at \"%s\"" ", see above for details", pgSetup->pgdata); return false; } if (!pg_setup_init(&newPgSetup, pgSetup, missingPgdataIsOk, postgresNotRunningIsOk)) { /* errors have already been logged */ log_error("pg_setup_wait_until_is_ready: pg_setup_init is false"); return false; } *pgSetup = newPgSetup; /* * We managed to initdb, refresh our configuration file location with * the realpath(3) from pg_setup_update_config_with_absolute_pgdata: * we might have been given a relative pathname. */ if (!keeper_config_update_with_absolute_pgdata(&(keeper->config))) { /* errors have already been logged */ return false; } if (!config->monitorDisabled) { /* * We have a new system_identifier, we need to publish it now. */ if (!monitor_set_node_system_identifier( monitor, keeper->state.current_node_id, pgSetup->control.system_identifier)) { log_error("Failed to update the new node system_identifier"); return false; } } } else if (initState->pgInitState >= PRE_INIT_STATE_RUNNING) { log_error("PostgreSQL is already running at \"%s\", refusing to " "initialize a new cluster on-top of the current one.", pgSetup->pgdata); return false; } /* * When the PostgreSQL instance either did not exist, or did exist but was * not running when creating the pg_autoctl node the first time, then we * can restart the instance without fear of disturbing the service. 
*/ if (pgInstanceIsOurs) { /* create the target database and install our extension there */ if (!create_database_and_extension(keeper)) { /* errors have already been logged */ return false; } } /* * Now is the time to make sure Postgres is running, as our next steps to * prepare a SINGLE from INIT are depending on being able to connect to the * local Postgres service. */ if (!ensure_postgres_service_is_running(postgres)) { log_error("Failed to initialize postgres as primary because " "starting postgres failed, see above for details"); return false; } /* * When dealing with a pg_autoctl create postgres command with a * pre-existing PGDATA directory, make sure we can start the cluster * without being in sync-rep already. The target state here is SINGLE * after all. */ if (!fsm_disable_replication(keeper)) { log_error("Failed to disable synchronous replication in order to " "initialize as a primary, see above for details"); return false; } /* * FIXME: In the current FSM, I am not sure this can happen anymore. That * said we might want to remain compatible with initializing a SINGLE from * an pre-existing standby. I wonder why/how it would come to that though. */ if (pgsql_is_in_recovery(pgsql, &inRecovery) && inRecovery) { log_info("Initialising a postgres server in recovery mode as the primary, " "promoting"); if (!standby_promote(postgres)) { log_error("Failed to initialize postgres as primary because promoting " "postgres failed, see above for details"); return false; } } /* * We just created the local Postgres cluster, make sure it has our minimum * configuration deployed. * * When --ssl-self-signed has been used, now is the time to build a * self-signed certificate for the server. 
We place the certificate and * private key in $PGDATA/server.key and $PGDATA/server.crt */ if (!keeper_create_self_signed_cert(keeper)) { /* errors have already been logged */ return false; } if (!postgres_add_default_settings(postgres, config->hostname)) { log_error("Failed to initialize postgres as primary because " "adding default settings failed, see above for details"); return false; } /* * Now add the role and HBA entries necessary for the monitor to run health * checks on the local Postgres node. */ if (!config->monitorDisabled) { char monitorHostname[_POSIX_HOST_NAME_MAX]; int monitorPort = 0; int connlimit = 1; if (!hostname_from_uri(config->monitor_pguri, monitorHostname, _POSIX_HOST_NAME_MAX, &monitorPort)) { /* developer error, this should never happen */ log_fatal("BUG: monitor_pguri should be validated before calling " "fsm_init_primary"); return false; } /* * We need to add the monitor host:port in the HBA settings for the * node to enable the health checks. * * Node that we forcibly use the authentication method "trust" for the * pgautofailover_monitor user, which from the monitor also uses the * hard-coded password PG_AUTOCTL_HEALTH_PASSWORD. The idea is to avoid * leaking information from the passfile, environment variable, or * other places. */ if (!primary_create_user_with_hba(postgres, PG_AUTOCTL_HEALTH_USERNAME, PG_AUTOCTL_HEALTH_PASSWORD, monitorHostname, "trust", pgSetup->hbaLevel, connlimit)) { log_error( "Failed to initialise postgres as primary because " "creating the database user that the pg_auto_failover monitor " "uses for health checks failed, see above for details"); return false; } } /* * This node is intended to be used as a primary later in the setup, when * we have a standby node to register, so prepare the replication user now. 
*/ if (!primary_create_replication_user(postgres, PG_AUTOCTL_REPLICA_USERNAME, config->replication_password)) { log_error("Failed to initialize postgres as primary because creating the " "replication user for the standby failed, see above for details"); return false; } /* * What remains to be done is either opening the HBA for a test setup, or * when we are initializing pg_auto_failover on an existing PostgreSQL * primary server instance, making sure that the parameters are all set. */ if (pgInstanceIsOurs) { if (env_found_empty("PG_REGRESS_SOCK_DIR")) { /* * In test environements allow nodes from the same network to * connect. The network is discovered automatically. */ if (!pghba_enable_lan_cidr(&keeper->postgres.sqlClient, keeper->config.pgSetup.ssl.active, HBA_DATABASE_ALL, NULL, keeper->config.hostname, NULL, DEFAULT_AUTH_METHOD, HBA_EDIT_MINIMAL, NULL)) { log_error("Failed to grant local network connections in HBA"); return false; } } } else { /* * As we are registering a previsouly existing PostgreSQL * instance, we now check that our mininum configuration * requirements for pg_auto_failover are in place. If not, tell * the user they must restart PostgreSQL at their next * maintenance window to fully enable pg_auto_failover. */ bool settings_are_ok = false; if (!check_postgresql_settings(&(keeper->postgres), &settings_are_ok)) { log_fatal("Failed to check local PostgreSQL settings " "compliance with pg_auto_failover, " "see above for details"); return false; } else if (!settings_are_ok) { log_fatal("Current PostgreSQL settings are not compliant " "with pg_auto_failover requirements, " "please restart PostgreSQL at the next " "opportunity to enable pg_auto_failover changes, " "and redo `pg_autoctl create`"); return false; } } /* and we're done with this connection. */ pgsql_finish(pgsql); return true; } /* * fsm_disable_replication is used when other node was forcibly removed, now * single. 
*
 * disable_synchronous_replication
 * && keeper_create_and_drop_replication_slots
 *
 * TODO: We currently use a separate session for each step. We should use
 * a single connection.
 */
bool
fsm_disable_replication(Keeper *keeper)
{
	LocalPostgresServer *postgres = &(keeper->postgres);

	/* Postgres must be up before we can change its replication settings */
	if (!ensure_postgres_service_is_running(postgres))
	{
		/* errors have already been logged */
		return false;
	}

	if (!primary_disable_synchronous_replication(postgres))
	{
		log_error("Failed to disable replication because disabling synchronous "
				  "failed, see above for details");
		return false;
	}

	/* cache invalidation in case we're doing WAIT_PRIMARY to SINGLE */
	bzero((void *) postgres->standbyTargetLSN, PG_LSN_MAXLENGTH);

	/* when a standby has been removed, remove its replication slot */
	return keeper_create_and_drop_replication_slots(keeper);
}


/*
 * fsm_resume_as_primary is used when the local node was demoted after a
 * failure, but standby was forcibly removed.
 *
 * start_postgres
 * && disable_synchronous_replication
 * && keeper_create_and_drop_replication_slots
 *
 * So we reuse fsm_disable_replication() here, rather than copy/pasting the
 * same bits of code in the fsm_resume_as_primary() function body. If the
 * definition of the fsm_resume_as_primary transition ever came to diverge
 * from whatever fsm_disable_replication() is doing, we'd have to copy/paste
 * and maintain a separate code path.
 */
bool
fsm_resume_as_primary(Keeper *keeper)
{
	if (!fsm_disable_replication(keeper))
	{
		log_error("Failed to disable synchronous replication in order to "
				  "resume as a primary, see above for details");
		return false;
	}

	return true;
}


/*
 * fsm_prepare_replication is used when a new standby was added.
 *
 * add_standby_to_hba && create_replication_slot
 *
 * Those operations are now done eagerly rather than just in time. So it's been
 * taken care of already, nothing to do within this state transition.
*/ bool fsm_prepare_replication(Keeper *keeper) { return true; } /* * fsm_stop_replication is used to forcefully stop replication, in case the * primary is on the other side of a network split. */ bool fsm_stop_replication(Keeper *keeper) { LocalPostgresServer *postgres = &(keeper->postgres); PGSQL *client = &(postgres->sqlClient); /* * We can't control if the client is still sending writes to our PostgreSQL * instance or not. To avoid split-brains situation, we need to make some * efforts: * * - set default_transaction_read_only to 'on' on this server (a * standby being promoted) so that it can't be the target of * connection strings requiring target_session_attrs=read-write yet * * - shut down the replication stream (here by promoting the replica) * * - have the primary server realize it's alone on the network: can't * communicate with the monitor (which triggered the failover), can't * communicate with the standby (now absent from pg_stat_replication) * * When the keeper on the primary realizes they are alone in the dark, * it will go to DEMOTE state on its own and shut down PostgreSQL, * protecting againts split brain. */ log_info("Prevent writes to the promoted standby while the primary " "is not demoted yet, by making the service incompatible with " "target_session_attrs = read-write"); if (!pgsql_set_default_transaction_mode_read_only(client)) { log_error("Failed to switch to read-only mode"); return false; } return fsm_promote_standby(keeper); } /* * fsm_disable_sync_rep is used when standby became unhealthy. */ bool fsm_disable_sync_rep(Keeper *keeper) { LocalPostgresServer *postgres = &(keeper->postgres); return primary_disable_synchronous_replication(postgres); } /* * fsm_promote_standby_to_primary is used when the standby should become the * new primary. It also prepares for the old primary to become the new standby. * * The promotion of the standby has already happened in the previous * transition: * * 1. 
secondary ➜ prepare_promotion : block writes
 * 2. prepare_promotion ➜ stop_replication : promote
 * 3. stop_replication ➜ wait_primary : resume writes
 *
 * Resuming writes is done by setting default_transaction_read_only to off,
 * thus allowing libpq to establish connections when target_session_attrs is
 * read-write.
 */
bool
fsm_promote_standby_to_primary(Keeper *keeper)
{
	bool forceCacheInvalidation = true;
	LocalPostgresServer *postgres = &(keeper->postgres);
	PGSQL *client = &(postgres->sqlClient);

	/* resume writes: accept target_session_attrs=read-write connections */
	if (!pgsql_set_default_transaction_mode_read_write(client))
	{
		log_error("Failed to set default_transaction_read_only to off "
				  "which is needed to accept libpq connections with "
				  "target_session_attrs read-write");
		return false;
	}

	/* now is a good time to make sure we invalidate other nodes cache */
	if (!keeper_refresh_other_nodes(keeper, forceCacheInvalidation))
	{
		log_error("Failed to update HBA rules after resuming writes");
		return false;
	}

	return true;
}


/*
 * fsm_enable_sync_rep is used when a healthy standby appeared.
 */
bool
fsm_enable_sync_rep(Keeper *keeper)
{
	LocalPostgresServer *postgres = &(keeper->postgres);
	PostgresSetup *pgSetup = &(postgres->postgresSetup);
	PGSQL *pgsql = &(postgres->sqlClient);

	/*
	 * First, we need to fetch and apply the synchronous_standby_names setting
	 * value from the monitor...
	 */
	if (!fsm_apply_settings(keeper))
	{
		/* errors have already been logged */
		return false;
	}

	/*
	 * If we don't have any standby with replication-quorum true, then we don't
	 * actually enable sync rep here. In that case don't bother making sure the
	 * standbys have reached a meaningful LSN target before continuing.
	 */
	if (streq(postgres->synchronousStandbyNames, ""))
	{
		return true;
	}

	/* first time in that state, fetch most recent metadata */
	if (IS_EMPTY_STRING_BUFFER(postgres->standbyTargetLSN))
	{
		if (!pgsql_get_postgres_metadata(pgsql,
										 &pgSetup->is_in_recovery,
										 postgres->pgsrSyncState,
										 postgres->currentLSN,
										 &(postgres->postgresSetup.control)))
		{
			log_error("Failed to update the local Postgres metadata");
			return false;
		}

		/*
		 * Our standbyTargetLSN needs to be set once we have at least one
		 * standby that's known to participate in the synchronous replication
		 * quorum.
		 */
		if (!(streq(postgres->pgsrSyncState, "quorum") ||
			  streq(postgres->pgsrSyncState, "sync")))
		{
			/* it's an expected situation here, don't fill-up the logs */
			log_warn("Failed to set the standby Target LSN because we don't "
					 "have a quorum candidate yet");
			return false;
		}

		strlcpy(postgres->standbyTargetLSN,
				postgres->currentLSN,
				PG_LSN_MAXLENGTH);

		log_info("Waiting until standby node has caught-up to LSN %s",
				 postgres->standbyTargetLSN);
	}

	/*
	 * Now, we have set synchronous_standby_names and have one standby that's
	 * expected to be caught-up. Make sure that is the case by checking the LSN
	 * positions in much the same way as Postgres does when committing a
	 * transaction on the primary: get the current LSN, and wait until the
	 * reported LSN from the secondary has advanced past the current point.
	 */
	return primary_standby_has_caught_up(postgres);
}


/*
 * fsm_apply_settings is used when a pg_auto_failover setting has changed, such
 * as number_sync_standbys or node priorities and replication quorum
 * properties.
 *
 * So we have to fetch the current synchronous_standby_names setting value from
 * the monitor and apply it (reload) to the current node.
*/
bool
fsm_apply_settings(Keeper *keeper)
{
	Monitor *monitor = &(keeper->monitor);
	KeeperConfig *config = &(keeper->config);
	LocalPostgresServer *postgres = &(keeper->postgres);

	/* get synchronous_standby_names value from the monitor */
	if (!config->monitorDisabled)
	{
		if (!monitor_synchronous_standby_names(
				monitor,
				config->formation,
				keeper->state.current_group,
				postgres->synchronousStandbyNames,
				sizeof(postgres->synchronousStandbyNames)))
		{
			log_error("Failed to enable synchronous replication because "
					  "we failed to get the synchronous_standby_names value "
					  "from the monitor, see above for details");
			return false;
		}
	}
	else
	{
		/* no monitor: use the generic value '*' */
		strlcpy(postgres->synchronousStandbyNames, "*",
				sizeof(postgres->synchronousStandbyNames));
	}

	/* apply (reload) the fetched setting to the local Postgres node */
	return primary_set_synchronous_standby_names(postgres);
}


/*
 * fsm_start_postgres is used when we detected a network partition, but monitor
 * didn't do failover.
 */
bool
fsm_start_postgres(Keeper *keeper)
{
	LocalPostgresServer *postgres = &(keeper->postgres);

	if (!ensure_postgres_service_is_running(postgres))
	{
		log_error("Failed to promote postgres because the server could not "
				  "be started before promotion, see above for details");
		return false;
	}

	/* fetch synchronous_standby_names setting from the monitor */
	if (!fsm_apply_settings(keeper))
	{
		/* errors have already been logged */
		return false;
	}

	return true;
}


/*
 * fsm_stop_postgres is used when local node was demoted, need to be dead now.
 */
bool
fsm_stop_postgres(Keeper *keeper)
{
	LocalPostgresServer *postgres = &(keeper->postgres);

	return ensure_postgres_service_is_stopped(postgres);
}


/*
 * fsm_stop_postgres_for_primary_maintenance is used when pg_autoctl enable
 * maintenance has been used on the primary server, we do a couple CHECKPOINT
 * before stopping Postgres to ensure a smooth transition.
*/
bool
fsm_stop_postgres_for_primary_maintenance(Keeper *keeper)
{
	/* a couple of explicit CHECKPOINTs keeps the shutdown checkpoint short */
	return fsm_checkpoint_and_stop_postgres(keeper);
}


/*
 * fsm_stop_postgres_and_setup_standby is used when the primary is put to
 * maintenance. Not only do we stop Postgres, we also prepare a setup as a
 * secondary.
 */
bool
fsm_stop_postgres_and_setup_standby(Keeper *keeper)
{
	LocalPostgresServer *postgres = &(keeper->postgres);
	ReplicationSource *upstream = &(postgres->replicationSource);
	PostgresSetup *pgSetup = &(postgres->postgresSetup);
	KeeperConfig *config = &(keeper->config);

	/* no upstream is known yet: pass a zeroed-out node address */
	NodeAddress upstreamNode = { 0 };

	if (!ensure_postgres_service_is_stopped(postgres))
	{
		/* errors have already been logged */
		return false;
	}

	/* Move the Postgres controller out of the way */
	if (!local_postgres_unlink_status_file(postgres))
	{
		/* highly unexpected */
		log_error("Failed to remove our Postgres status file "
				  "see above for details");
		return false;
	}

	/* prepare a standby setup */
	if (!standby_init_replication_source(postgres,
										 &upstreamNode,
										 PG_AUTOCTL_REPLICA_USERNAME,
										 config->replication_password,
										 config->replication_slot_name,
										 config->maximum_backup_rate,
										 config->backupDirectory,
										 NULL, /* no targetLSN */
										 config->pgSetup.ssl,
										 keeper->state.current_node_id))
	{
		/* can't happen at the moment */
		return false;
	}

	/* make the Postgres setup for a standby node before reaching maintenance */
	if (!pg_setup_standby_mode(pgSetup->control.pg_control_version,
							   pgSetup->pgdata,
							   pgSetup->pg_ctl,
							   upstream))
	{
		log_error("Failed to setup Postgres as a standby to go to maintenance");
		return false;
	}

	return true;
}


/*
 * fsm_checkpoint_and_stop_postgres is used when shutting down Postgres as part
 * of some FSM step when we have a controlled situation. We do a couple
 * CHECKPOINT before stopping Postgres to ensure a smooth transition.
*/
bool
fsm_checkpoint_and_stop_postgres(Keeper *keeper)
{
	LocalPostgresServer *postgres = &(keeper->postgres);
	PostgresSetup *pgSetup = &(postgres->postgresSetup);
	PGSQL *pgsql = &(postgres->sqlClient);

	if (pg_setup_is_running(pgSetup))
	{
		/*
		 * Starting with Postgres 12, pg_basebackup sets the recovery
		 * configuration parameters in the postgresql.auto.conf file. We need
		 * to make sure to RESET this value so that our own configuration
		 * setting takes effect.
		 */
		if (pgSetup->control.pg_control_version >= 1200)
		{
			if (!pgsql_reset_primary_conninfo(pgsql))
			{
				log_error("Failed to RESET primary_conninfo");
				return false;
			}
		}

		/*
		 * PostgreSQL shutdown sequence includes a CHECKPOINT, that is issued
		 * by the checkpointer process once every query backend has stopped
		 * already. During this final CHECKPOINT no work can be done, so it's
		 * best to reduce the amount of work needed there. To reduce the
		 * checkpointer shutdown activity, we perform a manual shutdown while
		 * still having concurrent activity.
		 *
		 * The first checkpoint writes all the in-memory buffers, the second
		 * checkpoint writes everything that was added during the first one.
		 */
		log_info("Preparing Postgres shutdown: CHECKPOINT;");

		for (int i = 0; i < 2; i++)
		{
			/* best effort: a failed checkpoint only makes shutdown slower */
			if (!pgsql_checkpoint(pgsql))
			{
				log_warn("Failed to checkpoint before stopping Postgres");
			}
		}
	}

	log_info("Stopping Postgres at \"%s\"", pgSetup->pgdata);

	return ensure_postgres_service_is_stopped(postgres);
}


/*
 * fsm_init_standby_from_upstream is the work horse for both fsm_init_standby
 * and fsm_init_from_standby. The replication source must have been setup
 * already.
 */
static bool
fsm_init_standby_from_upstream(Keeper *keeper)
{
	KeeperConfig *config = &(keeper->config);
	Monitor *monitor = &(keeper->monitor);
	LocalPostgresServer *postgres = &(keeper->postgres);

	/*
	 * At pg_autoctl create time when PGDATA already exists and we were
	 * successful in registering the node, then we can proceed without a
	 * pg_basebackup: we already have a copy of PGDATA on-disk.
	 *
	 * The existence of PGDATA at pg_autoctl create time is tracked in our init
	 * state as the PRE_INIT_STATE_EXISTS enum value. Once init is finished, we
	 * remove our init file: then we need to pg_basebackup again to init a
	 * standby.
	 */
	bool skipBaseBackup =
		file_exists(keeper->config.pathnames.init) &&
		keeper->initState.pgInitState == PRE_INIT_STATE_EXISTS;

	if (!standby_init_database(postgres, config->hostname, skipBaseBackup))
	{
		log_error("Failed to initialize standby server, see above for details");
		return false;
	}

	if (!skipBaseBackup)
	{
		bool forceCacheInvalidation = true;

		/* write our own HBA rules, pg_basebackup copies pg_hba.conf too */
		if (!keeper_refresh_other_nodes(keeper, forceCacheInvalidation))
		{
			log_error("Failed to update HBA rules after a base backup");
			return false;
		}
	}

	/*
	 * Publish our possibly new system_identifier now.
	 */
	if (!config->monitorDisabled)
	{
		if (!monitor_set_node_system_identifier(
				monitor,
				keeper->state.current_node_id,
				postgres->postgresSetup.control.system_identifier))
		{
			log_error("Failed to update the new node system_identifier");
			return false;
		}
	}

	/* ensure the SSL setup is synced with the keeper config */
	if (!keeper_create_self_signed_cert(keeper))
	{
		/* errors have already been logged */
		return false;
	}

	/* now, in case we have an init state file around, remove it */
	return unlink_file(config->pathnames.init);
}


/*
 * fsm_init_standby is used when the primary is now ready to accept a standby,
 * we're the standby.
*/
bool
fsm_init_standby(Keeper *keeper)
{
	KeeperConfig *config = &(keeper->config);
	LocalPostgresServer *postgres = &(keeper->postgres);
	NodeAddress *primaryNode = NULL;

	/* get the primary node to follow */
	if (!keeper_get_primary(keeper, &(postgres->replicationSource.primaryNode)))
	{
		log_error("Failed to initialize standby for lack of a primary node, "
				  "see above for details");
		return false;
	}

	/*
	 * NOTE(review): primaryNode is NULL here, presumably meaning "re-use the
	 * primaryNode just filled-in above" — confirm against
	 * standby_init_replication_source; fsm_follow_new_primary passes an
	 * explicit NULL in the same position.
	 */
	if (!standby_init_replication_source(postgres,
										 primaryNode,
										 PG_AUTOCTL_REPLICA_USERNAME,
										 config->replication_password,
										 config->replication_slot_name,
										 config->maximum_backup_rate,
										 config->backupDirectory,
										 NULL, /* no targetLSN */
										 config->pgSetup.ssl,
										 keeper->state.current_node_id))
	{
		/* can't happen at the moment */
		return false;
	}

	return fsm_init_standby_from_upstream(keeper);
}


/*
 * fsm_rewind_or_init is used when a new primary is available. First, try to
 * rewind. If that fails, do a pg_basebackup.
 */
bool
fsm_rewind_or_init(Keeper *keeper)
{
	KeeperConfig *config = &(keeper->config);
	LocalPostgresServer *postgres = &(keeper->postgres);
	ReplicationSource *upstream = &(postgres->replicationSource);
	NodeAddress *primaryNode = NULL;

	/* get the primary node to follow */
	if (!keeper_get_primary(keeper, &(postgres->replicationSource.primaryNode)))
	{
		log_error("Failed to initialize standby for lack of a primary node, "
				  "see above for details");
		return false;
	}

	if (!standby_init_replication_source(postgres,
										 primaryNode,
										 PG_AUTOCTL_REPLICA_USERNAME,
										 config->replication_password,
										 config->replication_slot_name,
										 config->maximum_backup_rate,
										 config->backupDirectory,
										 NULL, /* no targetLSN */
										 config->pgSetup.ssl,
										 keeper->state.current_node_id))
	{
		/* can't happen at the moment */
		return false;
	}

	/* first, make sure we can connect with "replication" */
	if (!pgctl_identify_system(upstream))
	{
		log_error("Failed to connect to the primary node " NODE_FORMAT
				  "with a replication connection string. "
				  "See above for details",
				  upstream->primaryNode.nodeId,
				  upstream->primaryNode.name,
				  upstream->primaryNode.host,
				  upstream->primaryNode.port);
		return false;
	}

	if (!primary_rewind_to_standby(postgres))
	{
		bool skipBaseBackup = false;
		bool forceCacheInvalidation = true;

		/* fall back to a full pg_basebackup when pg_rewind fails */
		log_warn("Failed to rewind demoted primary to standby, "
				 "trying pg_basebackup instead");

		if (!standby_init_database(postgres, config->hostname, skipBaseBackup))
		{
			log_error("Failed to become standby server, see above for details");
			return false;
		}

		/* ensure the SSL setup is synced with the keeper config */
		if (!keeper_create_self_signed_cert(keeper))
		{
			/* errors have already been logged */
			return false;
		}

		/* write our own HBA rules, pg_basebackup copies pg_hba.conf too */
		if (!keeper_refresh_other_nodes(keeper, forceCacheInvalidation))
		{
			log_error("Failed to update HBA rules after a base backup");
			return false;
		}
	}

	/*
	 * This node is now demoted: it used to be a primary node, it's not
	 * anymore. The replication slots that used to be maintained by the
	 * streaming replication protocol are now going to be maintained "manually"
	 * by pg_autoctl using pg_replication_slot_advance().
	 *
	 * There is a problem in pg_replication_slot_advance() in that it only
	 * maintains the restart_lsn property of a replication slot, it does not
	 * maintain the xmin of it. When re-using the pre-existing replication
	 * slots, we want to have a NULL xmin, so we drop the slots, and then
	 * create them again.
	 */
	if (!primary_drop_all_replication_slots(postgres))
	{
		/* errors have already been logged */
		return false;
	}

	return true;
}


/*
 * fsm_prepare_for_secondary is used when going from CATCHINGUP to SECONDARY,
 * to create missing replication slots. We want to maintain a replication slot
 * for each of the other nodes in the system, so that we make sure we have the
 * WAL bytes around when a standby nodes has to follow a new primary, after
 * failover.
*/ bool fsm_prepare_for_secondary(Keeper *keeper) { LocalPostgresServer *postgres = &(keeper->postgres); /* first. check that we're on the same timeline as the new primary */ if (!standby_check_timeline_with_upstream(postgres)) { /* errors have already been logged */ return false; } return keeper_maintain_replication_slots(keeper); } /* * fsm_prepare_standby_for_promotion used when the standby is asked to prepare * its own promotion. * * TODO: implement the prepare_promotion_walreceiver_timeout as follows: * * We need to loop over the `ready_to_promote' until the standby is ready. * This routine compare the time spent waiting to the setup: * * prepare_promotion_walreceiver_timeout * * The `ready_to_promote' routine eventually returns true. * * Currently the keeper only supports Synchronous Replication so this timeout * isn't necessary, that's why it's not implemented yet. The implementation * needs to happen for async rep support. */ bool fsm_prepare_standby_for_promotion(Keeper *keeper) { log_debug("No support for async replication means we don't wait until " "prepare_promotion_walreceiver_timeout (%ds)", keeper->config.prepare_promotion_walreceiver); return true; } /* * fsm_start_maintenance_on_standby is used when putting the standby in * maintenance mode (kernel upgrades, change of hardware, etc). Maintenance * means that the user now is driving the service, refrain from doing anything * ourselves. */ bool fsm_start_maintenance_on_standby(Keeper *keeper) { LocalPostgresServer *postgres = &(keeper->postgres); /* Move the Postgres controller out of the way */ if (!local_postgres_unlink_status_file(postgres)) { /* highly unexpected */ log_error("Failed to remove our Postgres status file " "see above for details"); return false; } return true; } /* * fsm_restart_standby is used when restarting a node after manual maintenance * is done. 
In case that changed we get the current primary from the monitor
 * and reset the standby setup (primary_conninfo) to target it, then restart
 * Postgres.
 *
 * We don't know what happened during the maintenance of the node, so we use
 * pg_rewind to make sure we're in a position to be a standby to the current
 * primary.
 *
 * So we're back to doing the exact same thing as fsm_rewind_or_init() now, and
 * that's why we just call that function.
 */
bool
fsm_restart_standby(Keeper *keeper)
{
	return fsm_rewind_or_init(keeper);
}


/*
 * fsm_promote_standby is used in several situations in the FSM transitions and
 * the following actions are needed to promote a standby:
 *
 * start_postgres
 * && promote_standby
 * && add_standby_to_hba
 * && create_replication_slot
 * && disable_synchronous_replication
 * && keeper_create_and_drop_replication_slots
 *
 * Note that the HBA and slot maintenance are done eagerly in the main keeper
 * loop as soon as a new node is added to the group, so we don't need to handle
 * those operations in the context of the FSM transitions anymore.
 *
 * So we reuse fsm_disable_replication() here, rather than copy/pasting the
 * same bits of code in the fsm_promote_standby() function body. If the
 * definition of the fsm_promote_standby transition ever came to diverge from
 * whatever fsm_disable_replication() is doing, we'd have to copy/paste and
 * maintain a separate code path.
 *
 * We open the HBA connections for the other node as found per given state,
 * most often a DEMOTE_TIMEOUT_STATE, sometimes though MAINTENANCE_STATE.
 */
bool
fsm_promote_standby(Keeper *keeper)
{
	LocalPostgresServer *postgres = &(keeper->postgres);

	if (!ensure_postgres_service_is_running(postgres))
	{
		log_error("Failed to promote postgres because the server could not "
				  "be started before promotion, see above for details");
		return false;
	}

	/*
	 * If postgres is no longer in recovery mode, standby_promote returns true
	 * immediately and therefore this function is idempotent.
	 */
	if (!standby_promote(postgres))
	{
		log_error("Failed to promote the local postgres server from standby "
				  "to single state, see above for details");
		return false;
	}

	if (!standby_cleanup_as_primary(postgres))
	{
		log_error("Failed to cleanup replication settings, "
				  "see above for details");
		return false;
	}

	if (!fsm_disable_replication(keeper))
	{
		log_error("Failed to disable synchronous replication after promotion, "
				  "see above for details");
		return false;
	}

	return true;
}


/*
 * When more than one secondary is available for failover we need to pick one.
 * We want to pick the secondary that received the most WAL, so the monitor
 * asks every secondary to report its current LSN position.
 *
 * secondary ➜ report_lsn
 */
bool
fsm_report_lsn(Keeper *keeper)
{
	KeeperConfig *config = &(keeper->config);
	PostgresSetup *pgSetup = &(config->pgSetup);
	LocalPostgresServer *postgres = &(keeper->postgres);
	PGSQL *pgsql = &(postgres->sqlClient);

	/*
	 * Forcibly disconnect from the primary node, for two reasons:
	 *
	 * 1. when the primary node can't connect to the monitor, and if there's
	 *    no replica currently connected, it will then proceed to DEMOTE
	 *    itself
	 *
	 * 2. that way we ensure that the current LSN we report can't change
	 *    anymore, because we are a standby without a primary_conninfo, and
	 *    without a restore_command either
	 *
	 * To disconnect the current node from its primary, we write a recovery
	 * setup where there is no primary_conninfo and otherwise use the same
	 * parameters as for streaming replication.
	 */
	NodeAddress upstreamNode = { 0 };

	if (!standby_init_replication_source(postgres,
										 &upstreamNode,
										 PG_AUTOCTL_REPLICA_USERNAME,
										 config->replication_password,
										 config->replication_slot_name,
										 config->maximum_backup_rate,
										 config->backupDirectory,
										 NULL, /* no targetLSN */
										 config->pgSetup.ssl,
										 keeper->state.current_node_id))
	{
		/* can't happen at the moment */
		return false;
	}

	log_info("Restarting standby node to disconnect replication "
			 "from failed primary node, to prepare failover");

	if (!standby_restart_with_current_replication_source(postgres))
	{
		log_error("Failed to disconnect from failed primary node, "
				  "see above for details");
		return false;
	}

	/*
	 * Fetch most recent metadata, that will be sent in the next node_active()
	 * call.
	 */
	if (!pgsql_get_postgres_metadata(pgsql,
									 &pgSetup->is_in_recovery,
									 postgres->pgsrSyncState,
									 postgres->currentLSN,
									 &(postgres->postgresSetup.control)))
	{
		log_error("Failed to update the local Postgres metadata");
		return false;
	}

	return true;
}


/*
 * fsm_report_lsn_and_drop_replication_slots is used when a former primary node
 * has been demoted and gets back online during the secondary election.
 *
 * As Postgres pg_replication_slot_advance() function does not maintain the
 * xmin property of the slot, we want to create new inactive slots now rather
 * than continue using previously-active (streaming replication) slots.
 */
bool
fsm_report_lsn_and_drop_replication_slots(Keeper *keeper)
{
	LocalPostgresServer *postgres = &(keeper->postgres);

	if (!fsm_report_lsn(keeper))
	{
		/* errors have already been reported */
		return false;
	}

	return primary_drop_all_replication_slots(postgres);
}


/*
 * When the selected failover candidate does not have the latest received WAL,
 * it fetches them from another standby, the first one with the most LSN
 * available.
*/
bool
fsm_fast_forward(Keeper *keeper)
{
	KeeperConfig *config = &(keeper->config);
	LocalPostgresServer *postgres = &(keeper->postgres);
	PostgresSetup *pgSetup = &(postgres->postgresSetup);
	ReplicationSource *upstream = &(postgres->replicationSource);

	NodeAddress upstreamNode = { 0 };
	char slotName[MAXCONNINFO] = { 0 };

	/* get the primary node to follow */
	if (!keeper_get_most_advanced_standby(keeper, &upstreamNode))
	{
		log_error("Failed to fast forward from the most advanced standby node, "
				  "see above for details");
		return false;
	}

	/*
	 * Postgres 10 does not have pg_replication_slot_advance(), so we don't
	 * support replication slots on standby nodes there.
	 */
	if (pgSetup->control.pg_control_version >= 1100)
	{
		strlcpy(slotName, config->replication_slot_name, MAXCONNINFO);
	}

	if (!standby_init_replication_source(postgres,
										 &upstreamNode,
										 PG_AUTOCTL_REPLICA_USERNAME,
										 config->replication_password,
										 slotName,
										 config->maximum_backup_rate,
										 config->backupDirectory,
										 upstreamNode.lsn,
										 config->pgSetup.ssl,
										 keeper->state.current_node_id))
	{
		/* can't happen at the moment */
		return false;
	}

	/* catch-up on missing WAL bytes from the chosen standby */
	if (!standby_fetch_missing_wal(postgres))
	{
		log_error("Failed to fetch WAL bytes from standby node " NODE_FORMAT
				  ", see above for details",
				  upstream->primaryNode.nodeId,
				  upstream->primaryNode.name,
				  upstream->primaryNode.host,
				  upstream->primaryNode.port);
		return false;
	}

	return true;
}


/*
 * fsm_cleanup_as_primary cleans-up the replication setting. It's called after
 * a fast-forward operation.
 */
bool
fsm_cleanup_as_primary(Keeper *keeper)
{
	LocalPostgresServer *postgres = &(keeper->postgres);

	if (!standby_cleanup_as_primary(postgres))
	{
		log_error("Failed to cleanup replication settings and restart Postgres "
				  "to continue as a primary, see above for details");
		return false;
	}

	return true;
}


/*
 * When the failover is done we need to follow the new primary.
We should be * able to do that directly, by changing our primary_conninfo, thanks to our * candidate selection where we make it so that the failover candidate always * has the most advanced LSN, and also thanks to our use of replication slots * on every standby. */ bool fsm_follow_new_primary(Keeper *keeper) { KeeperConfig *config = &(keeper->config); LocalPostgresServer *postgres = &(keeper->postgres); ReplicationSource *replicationSource = &(postgres->replicationSource); /* get the primary node to follow */ if (!keeper_get_primary(keeper, &(postgres->replicationSource.primaryNode))) { log_error("Failed to initialize standby for lack of a primary node, " "see above for details"); return false; } if (!standby_init_replication_source(postgres, NULL, PG_AUTOCTL_REPLICA_USERNAME, config->replication_password, config->replication_slot_name, config->maximum_backup_rate, config->backupDirectory, NULL, /* no targetLSN */ config->pgSetup.ssl, keeper->state.current_node_id)) { /* can't happen at the moment */ return false; } if (!standby_follow_new_primary(postgres)) { log_error("Failed to change standby setup to follow new primary " "node " NODE_FORMAT ", see above for details", replicationSource->primaryNode.nodeId, replicationSource->primaryNode.name, replicationSource->primaryNode.host, replicationSource->primaryNode.port); return false; } /* now, in case we have an init state file around, remove it */ if (!unlink_file(config->pathnames.init)) { /* errors have already been logged */ return false; } /* * Finally, check that we're on the same timeline as the new primary when * assigned secondary as a goal state. This transition function is also * used when going from secondary to catchingup, as the primary might have * changed also in that situation. 
*/ if (keeper->state.assigned_role == SECONDARY_STATE) { return standby_check_timeline_with_upstream(postgres); } return true; } /* * fsm_init_from_standby creates a new node from existing nodes that are still * available but not setup to be a candidate for promotion. */ bool fsm_init_from_standby(Keeper *keeper) { KeeperConfig *config = &(keeper->config); LocalPostgresServer *postgres = &(keeper->postgres); NodeAddress upstreamNode = { 0 }; /* get the primary node to follow */ if (!keeper_get_most_advanced_standby(keeper, &upstreamNode)) { log_error("Failed to initialise from the most advanced standby node, " "see above for details"); return false; } if (!standby_init_replication_source(postgres, &upstreamNode, PG_AUTOCTL_REPLICA_USERNAME, config->replication_password, "", /* no replication slot */ config->maximum_backup_rate, config->backupDirectory, upstreamNode.lsn, config->pgSetup.ssl, keeper->state.current_node_id)) { /* can't happen at the moment */ return false; } return fsm_init_standby_from_upstream(keeper); } /* * fsm_drop_node is called to finish dropping a node on the client side. * * This stops postgres and updates the postgres state file to say that postgres * should be stopped. It also cleans up any existing init file. Not doing these * two things can confuse a possible future re-init of the node. */ bool fsm_drop_node(Keeper *keeper) { KeeperConfig *config = &(keeper->config); if (!fsm_stop_postgres(keeper)) { /* errors have already been logged */ return false; } return unlink_file(config->pathnames.init); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/ini_file.c000066400000000000000000000354451414244367200225120ustar00rootroot00000000000000/* * src/bin/pg_autoctl/ini_file.c * Functions to parse a configuration file using the .INI syntax. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include "ini.h" #include "ini_file.h" #include "log.h" #include "pgctl.h" #include "parson.h" #include "pgsetup.h" #include "string_utils.h" /* * Load a configuration file in the INI format. */ bool read_ini_file(const char *filename, IniOption *optionList) { char *fileContents = NULL; long fileSize = 0L; /* read the current postgresql.conf contents */ if (!read_file(filename, &fileContents, &fileSize)) { return false; } return parse_ini_buffer(filename, fileContents, optionList); } /* * parse_ini_buffer parses the content of a config.ini file. */ bool parse_ini_buffer(const char *filename, char *fileContents, IniOption *optionList) { IniOption *option; /* parse the content of the file as per INI syntax rules */ ini_t *ini = ini_load(fileContents, NULL); free(fileContents); /* * Now that the INI file is loaded into a generic structure, run through it * to find given opts and set. */ for (option = optionList; option->type != INI_END_T; option++) { int optionIndex; char *val; int sectionIndex = ini_find_section(ini, option->section, 0); if (sectionIndex == INI_NOT_FOUND) { if (option->required) { log_error("Failed to find section %s in \"%s\"", option->section, filename); ini_destroy(ini); return false; } optionIndex = INI_NOT_FOUND; } else { optionIndex = ini_find_property(ini, sectionIndex, option->name, 0); } /* * When we didn't find an option, we have three cases to consider: * 1. it's required, error out * 2. it's a compatibility option, skip it * 3. 
use the default value instead */ if (optionIndex == INI_NOT_FOUND) { if (option->required) { log_error("Failed to find option %s.%s in \"%s\"", option->section, option->name, filename); ini_destroy(ini); return false; } else if (option->compat) { /* skip compatibility options that are not found */ continue; } else { switch (option->type) { case INI_INT_T: { *(option->intValue) = option->intDefault; break; } case INI_STRING_T: case INI_STRBUF_T: { ini_set_option_value(option, option->strDefault); break; } default: /* should never happen, or it's a development bug */ log_fatal("Unknown option type %d", option->type); ini_destroy(ini); return false; } } } else { val = (char *) ini_property_value(ini, sectionIndex, optionIndex); log_trace("%s.%s = %s", option->section, option->name, val); if (val != NULL) { if (!ini_set_option_value(option, val)) { /* we logged about it already */ ini_destroy(ini); return false; } } } } ini_destroy(ini); return true; } /* * ini_validate_options walks through an optionList and installs default values * when necessary, and returns false if any required option is missing and * doesn't have a default provided. 
*/ bool ini_validate_options(IniOption *optionList) { IniOption *option; for (option = optionList; option->type != INI_END_T; option++) { char optionName[BUFSIZE]; int n = sformat(optionName, BUFSIZE, "%s.%s", option->section, option->name); if (option->optName) { sformat(optionName + n, BUFSIZE - n, " (--%s)", option->optName); } switch (option->type) { case INI_INT_T: { if (*(option->intValue) == -1 && option->intDefault != -1) { *(option->intValue) = option->intDefault; } if (option->required && *(option->intValue) == -1) { log_error("Option %s is required and has not been set", optionName); return false; } break; } case INI_STRING_T: { if (*(option->strValue) == NULL && option->strDefault != NULL) { ini_set_option_value(option, option->strDefault); } if (option->required && *(option->strValue) == NULL) { log_error("Option %ss is required and has not been set", optionName); return false; } break; } case INI_STRBUF_T: { if (IS_EMPTY_STRING_BUFFER(option->strBufValue) && option->strDefault != NULL) { ini_set_option_value(option, option->strDefault); } if (option->required && IS_EMPTY_STRING_BUFFER(option->strBufValue)) { log_error("Option %s is required and has not been set", optionName); return false; } break; } default: /* should never happen, or it's a development bug */ log_fatal("Unknown option type %d", option->type); return false; } } return true; } /* * ini_set_option_value saves given value to option, parsing the value string * as its type require. */ bool ini_set_option_value(IniOption *option, const char *value) { if (option == NULL) { return false; } switch (option->type) { case INI_STRING_T: { if (value == NULL) { *(option->strValue) = NULL; } else { *(option->strValue) = strdup(value); } break; } case INI_STRBUF_T: { /* * When given a String Buffer str[SIZE], then we are given strbuf * as the address where to host the data directly. 
*/ if (value == NULL) { /* null are handled as bytes of '\0' in string buffers */ bzero((void *) option->strBufValue, option->strBufferSize); } else { strlcpy((char *) option->strBufValue, value, option->strBufferSize); } break; } case INI_INT_T: { if (value) { int nb; if (!stringToInt(value, &nb)) { log_error("Failed to parse %s.%s's value \"%s\" as a number", option->section, option->name, value); return false; } *(option->intValue) = nb; } break; } default: { /* developer error, should never happen */ log_fatal("Unknown option type %d", option->type); return false; } } return true; } /* * Format a single option as a string value. */ bool ini_option_to_string(IniOption *option, char *dest, size_t size) { switch (option->type) { case INI_STRING_T: { /* option->strValue is a char **, both pointers could be NULL */ if (option->strValue == NULL || *(option->strValue) == NULL) { return false; } strlcpy(dest, *(option->strValue), size); return true; } case INI_STRBUF_T: { strlcpy(dest, (char *) option->strBufValue, size); return true; } case INI_INT_T: { sformat(dest, size, "%d", *(option->intValue)); return true; } default: { log_fatal("Unknown option type %d", option->type); return false; } } return false; } #define streq(x, y) ((x != NULL) && (y != NULL) && ( \ strcmp(x, y) == 0)) /* * write_ini_to_stream writes in-memory INI structure to given STREAM in the * INI format specifications. 
*/ bool write_ini_to_stream(FILE *stream, IniOption *optionList) { char *currentSection = NULL; IniOption *option; for (option = optionList; option->type != INI_END_T; option++) { /* we read "compatibility" options but never write them back */ if (option->compat) { continue; } /* we might need to open a new section */ if (!streq(currentSection, option->section)) { if (currentSection != NULL) { fformat(stream, "\n"); } currentSection = (char *) option->section; fformat(stream, "[%s]\n", currentSection); } switch (option->type) { case INI_INT_T: { fformat(stream, "%s = %d\n", option->name, *(option->intValue)); break; } case INI_STRING_T: { char *value = *(option->strValue); if (value) { fformat(stream, "%s = %s\n", option->name, value); } else if (option->required) { log_error("Option %s.%s is required but is not set", option->section, option->name); return false; } break; } case INI_STRBUF_T: { /* here we have a string buffer, which is its own address */ char *value = (char *) option->strBufValue; if (value[0] != '\0') { fformat(stream, "%s = %s\n", option->name, value); } else if (option->required) { log_error("Option %s.%s is required but is not set", option->section, option->name); return false; } break; } default: { /* developper error, should never happen */ log_fatal("Unknown option type %d", option->type); break; } } } fflush(stream); return true; } /* * ini_to_json populates the given JSON value with the contents of the INI * file. Sections become JSON objects, options the keys to the section objects. 
*/ bool ini_to_json(JSON_Object *jsRoot, IniOption *optionList) { char *currentSection = NULL; JSON_Value *currentSectionJs = NULL; JSON_Object *currentSectionJsObj = NULL; IniOption *option = NULL; for (option = optionList; option->type != INI_END_T; option++) { /* we read "compatibility" options but never write them back */ if (option->compat) { continue; } /* we might need to open a new section */ if (!streq(currentSection, option->section)) { if (currentSection != NULL) { json_object_set_value(jsRoot, currentSection, currentSectionJs); } currentSectionJs = json_value_init_object(); currentSectionJsObj = json_value_get_object(currentSectionJs); currentSection = (char *) option->section; } switch (option->type) { case INI_INT_T: { json_object_set_number(currentSectionJsObj, option->name, (double) *(option->intValue)); break; } case INI_STRING_T: { char *value = *(option->strValue); if (value) { json_object_set_string(currentSectionJsObj, option->name, value); } else if (option->required) { log_error("Option %s.%s is required but is not set", option->section, option->name); return false; } break; } case INI_STRBUF_T: { /* here we have a string buffer, which is its own address */ char *value = (char *) option->strBufValue; if (value[0] != '\0') { json_object_set_string(currentSectionJsObj, option->name, value); } else if (option->required) { log_error("Option %s.%s is required but is not set", option->section, option->name); return false; } break; } default: { /* developper error, should never happen */ log_fatal("Unknown option type %d", option->type); break; } } } if (currentSection != NULL) { json_object_set_value(jsRoot, currentSection, currentSectionJs); } return true; } /* * lookup_ini_option implements an option lookup given a section name and an * option name. 
*/ IniOption * lookup_ini_option(IniOption *optionList, const char *section, const char *name) { IniOption *option; /* now lookup section/option names in opts */ for (option = optionList; option->type != INI_END_T; option++) { if (streq(option->section, section) && streq(option->name, name)) { return option; } } return NULL; } /* * Lookup an option value given a "path" of section.option. */ IniOption * lookup_ini_path_value(IniOption *optionList, const char *path) { char *section_name, *option_name, *ptr; /* * Split path into section/option. */ ptr = strchr(path, '.'); if (ptr == NULL) { log_error("Failed to find a dot separator in option path \"%s\"", path); return NULL; } section_name = strdup(path); /* don't scribble on path */ option_name = section_name + (ptr - path) + 1; /* apply same offset */ *(option_name - 1) = '\0'; /* split string at the dot */ IniOption *option = lookup_ini_option(optionList, section_name, option_name); if (option == NULL) { log_error("Failed to find configuration option for path \"%s\"", path); } free(section_name); return option; } /* * ini_merge merges the options that have been set in overrideOptionList into * the options in dstOptionList, ignoring default values. */ bool ini_merge(IniOption *dstOptionList, IniOption *overrideOptionList) { IniOption *option; for (option = overrideOptionList; option->type != INI_END_T; option++) { IniOption *dstOption = lookup_ini_option(dstOptionList, option->section, option->name); if (dstOption == NULL) { /* developper error, why do we have incompatible INI options? 
*/ log_error("BUG: ini_merge: lookup failed in dstOptionList(%s, %s)", option->section, option->name); return false; } switch (option->type) { case INI_INT_T: { if (*(option->intValue) != -1 && *(option->intValue) != 0) { *(dstOption->intValue) = *(option->intValue); } break; } case INI_STRING_T: { if (*(option->strValue) != NULL) { *(dstOption->strValue) = strdup(*(option->strValue)); } break; } case INI_STRBUF_T: { if (!IS_EMPTY_STRING_BUFFER(option->strBufValue)) { strlcpy((char *) dstOption->strBufValue, (char *) option->strBufValue, dstOption->strBufferSize); } break; } default: /* should never happen, or it's a development bug */ log_fatal("Unknown option type %d", option->type); return false; } } return true; } /* * ini_get_setting reads given INI filename and maps its content using an * optionList that instructs which options to read and what default values to * use. Then ini_get_setting looks up the given path (section.option) and sets * the given value string. */ bool ini_get_setting(const char *filename, IniOption *optionList, const char *path, char *value, size_t size) { log_debug("Reading configuration from \"%s\"", filename); if (!read_ini_file(filename, optionList)) { log_error("Failed to parse configuration file \"%s\"", filename); return false; } IniOption *option = lookup_ini_path_value(optionList, path); if (option) { return ini_option_to_string(option, value, size); } return false; } /* * ini_set_option sets the INI value to the given value. */ bool ini_set_option(IniOption *optionList, const char *path, char *value) { IniOption *option = lookup_ini_path_value(optionList, path); if (option && ini_set_option_value(option, value)) { log_debug("ini_set_option %s.%s = %s", option->section, option->name, value); return true; } return false; } /* * ini_set_setting sets the INI filename option identified by path to the given * value. optionList is used to know how to read the values in the file and * also contains the default values. 
*/ bool ini_set_setting(const char *filename, IniOption *optionList, const char *path, char *value) { log_debug("Reading configuration from %s", filename); if (!read_ini_file(filename, optionList)) { log_error("Failed to parse configuration file \"%s\"", filename); return false; } return ini_set_option(optionList, path, value); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/ini_file.h000066400000000000000000000102671414244367200225120ustar00rootroot00000000000000/* * src/bin/pg_autoctl/ini_file.h * Functions to parse a configuration file using the .INI syntax. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef INI_FILE_H #define INI_FILE_H #include #include #include "parson.h" #define INI_STRING_T 1 /* char *target */ #define INI_STRBUF_T 2 /* char target[size] */ #define INI_INT_T 3 /* int target */ #define INI_END_T 4 /* * IniOption represent a key/value as written in the INI format: * * [section] * name = "values" * int = 10 * * The IniOption structure is used both for specifying what we expect to read * in the INI file: required, strdefault, and intdefault, and what has been * actually read from it: strval/intval. * * Given the previous contents and this structure as input: * * { * {INI_STRING_T, "section", "name", true, "default", -1, -1, &str, NULL}, * {INI_INT_T, "section", "int", true, NULL, 1, -1, NULL, &int}, * {INI_END_T, NULL, NULL, false, NULL, -1, -1, NULL, NULL} * } * * Then after reading the ini file with `read_ini_file' then *str = "values" * and *int = 10. 
*/ typedef struct IniOption { int type; const char *section; const char *name; const char *optName; /* command line option name */ bool required; bool compat; /* compatibility: read but don't write */ char *strDefault; /* default value when type is string */ int intDefault; /* default value when type is int */ int strBufferSize; /* size of the BUFFER when INI_STRBUF_T */ char **strValue; /* pointer to a string pointer (typically malloc-ed) */ char *strBufValue; /* pointer to a string buffer (on the stack) */ int *intValue; /* pointer to an integer */ } IniOption; #define make_int_option(section, name, optName, required, value) \ { INI_INT_T, section, name, optName, required, false, \ NULL, -1, -1, NULL, NULL, value } #define make_int_option_default(section, name, optName, \ required, value, default) \ { INI_INT_T, section, name, optName, required, false, \ NULL, default, -1, NULL, NULL, value } #define make_string_option(section, name, optName, required, value) \ { INI_STRING_T, section, name, optName, required, false, \ NULL, -1, -1, value, NULL, NULL } #define make_string_option_default(section, name, optName, required, \ value, default) \ { INI_STRING_T, section, name, optName, required, false, \ default, -1, -1, value, NULL, NULL } #define make_strbuf_option(section, name, optName, required, size, value) \ { INI_STRBUF_T, section, name, optName, required, false, \ NULL, -1, size, NULL, value, NULL } #define make_strbuf_compat_option(section, name, size, value) \ { INI_STRBUF_T, section, name, NULL, false, true, \ NULL, -1, size, NULL, value, NULL } #define make_strbuf_option_default(section, name, optName, required, \ size, value, default) \ { INI_STRBUF_T, section, name, optName, required, false, \ default, -1, size, NULL, value, NULL } #define INI_OPTION_LAST \ { INI_END_T, NULL, NULL, NULL, false, false, NULL, -1, -1, NULL, NULL, NULL } bool read_ini_file(const char *filename, IniOption *opts); bool parse_ini_buffer(const char *filename, char *fileContents, 
IniOption *optionList); bool ini_validate_options(IniOption *optionList); bool ini_set_option_value(IniOption *option, const char *value); bool ini_option_to_string(IniOption *option, char *dest, size_t size); bool write_ini_to_stream(FILE *stream, IniOption *optionList); bool ini_to_json(JSON_Object *jsRoot, IniOption *optionList); IniOption * lookup_ini_option(IniOption *optionList, const char *section, const char *name); IniOption * lookup_ini_path_value(IniOption *optionList, const char *path); bool ini_merge(IniOption *dstOptionList, IniOption *overrideOptionList); bool ini_set_option(IniOption *optionList, const char *path, char *value); bool ini_get_setting(const char *filename, IniOption *optionList, const char *path, char *value, size_t size); bool ini_set_setting(const char *filename, IniOption *optionList, const char *path, char *value); #endif /* INI_FILE_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/ini_implementation.c000066400000000000000000000007261414244367200246120ustar00rootroot00000000000000/* * src/bin/pg_autoctl/ini_implementation.c * The file containing library code used to parse files with .INI syntax * * The main reason this is in a separate file is so you can exclude a file * during static analysis. This way we exclude vendored in library code, * but not our code using it. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. */ #define INI_IMPLEMENTATION #include "ini.h" pg_auto_failover-1.6.3/src/bin/pg_autoctl/ipaddr.c000066400000000000000000000541721414244367200221750ustar00rootroot00000000000000/* * src/bin/pg_autoctl/ipaddr.c * Find local ip used as source ip in ip packets, using getsockname and a udp * connection. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "postgres_fe.h" #include "defaults.h" #include "env_utils.h" #include "file_utils.h" #include "ipaddr.h" #include "log.h" #include "pgsetup.h" #include "pgsql.h" #include "string_utils.h" static unsigned int countSetBits(unsigned int n); static unsigned int countSetBitsv6(unsigned char *addr); static bool ipv4eq(struct sockaddr_in *a, struct sockaddr_in *b); static bool ipv6eq(struct sockaddr_in6 *a, struct sockaddr_in6 *b); static bool fetchIPAddressFromInterfaceList(char *localIpAddress, int size); static bool ipaddr_sockaddr_to_string(struct addrinfo *ai, char *ipaddr, size_t size); static bool ipaddr_getsockname(int sock, char *ipaddr, size_t size); static bool GetAddrInfo(const char *restrict node, const char *restrict service, const struct addrinfo *restrict hints, struct addrinfo **restrict res); /* * Connect to given serviceName and servicePort in TCP in order to determine * which local IP address has been used to connect. That local IP address is * then the one we use for the default --hostname value, when not provided. * * On a keeper, we use the monitor hostname as the serviceName. On the monitor, * we use DEFAULT_INTERFACE_LOOKUP_SERVICE_NAME to discover the local default * outbound IP address. 
*/ bool fetchLocalIPAddress(char *localIpAddress, int size, const char *serviceName, int servicePort, int logLevel, bool *mayRetry) { struct addrinfo *lookup; struct addrinfo *ai; struct addrinfo hints; bool couldConnect = false; int sock; *mayRetry = false; /* prepare getaddrinfo hints for name resolution or IP address parsing */ memset(&hints, 0, sizeof(hints)); hints.ai_family = PF_UNSPEC; /* accept any family as supported by OS */ hints.ai_socktype = SOCK_STREAM; /* we only want TCP sockets */ hints.ai_protocol = IPPROTO_TCP; /* we only want TCP sockets */ if (!GetAddrInfo(serviceName, intToString(servicePort).strValue, &hints, &lookup)) { /* errors have already been logged */ return false; } for (ai = lookup; ai; ai = ai->ai_next) { char addr[BUFSIZE] = { 0 }; if (!ipaddr_sockaddr_to_string(ai, addr, sizeof(addr))) { /* errors have already been logged */ return false; } sock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); if (sock < 0) { log_warn("Failed to create a socket: %m"); return false; } /* connect timeout can be quite long by default */ log_info("Connecting to %s (port %d)", addr, servicePort); int err = connect(sock, ai->ai_addr, ai->ai_addrlen); if (err < 0) { log_level(logLevel, "Failed to connect to %s: %m", addr); } else { /* found a getaddrinfo() result we could use to connect */ couldConnect = true; break; } } freeaddrinfo(lookup); if (!couldConnect) { if (env_found_empty("PG_REGRESS_SOCK_DIR")) { /* * In test environment, in case of no internet access, just use the * address of the non-loopback network interface. 
*/ return fetchIPAddressFromInterfaceList(localIpAddress, size); } else { *mayRetry = true; if (strcmp(DEFAULT_INTERFACE_LOOKUP_SERVICE_NAME, serviceName) == 0) { log_level(logLevel, "Failed to connect to \"%s\" on port %d " "to discover this machine hostname, " "please use --hostname", serviceName, servicePort); } else { log_level(logLevel, "Failed to connect to any of the IP addresses for " "monitor hostname \"%s\" and port %d", serviceName, servicePort); } return false; } } if (!ipaddr_getsockname(sock, localIpAddress, size)) { /* errors have already been logged */ close(sock); return false; } close(sock); return true; } /* * fetchLocalCIDR loops over the local interfaces on the host and finds the one * for which the IP address is the same as the given localIpAddress parameter. * Then using the netmask information from the network interface, * fetchLocalCIDR computes the local CIDR to use in HBA in order to allow * authentication of all servers in the local network. */ bool fetchLocalCIDR(const char *localIpAddress, char *localCIDR, int size) { char network[INET6_ADDRSTRLEN]; struct ifaddrs *ifaddr, *ifa; int prefix = 0; bool found = false; if (getifaddrs(&ifaddr) == -1) { log_warn("Failed to get the list of local network inferfaces: %m"); return false; } for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) { char netmask[INET6_ADDRSTRLEN] = { 0 }; char address[INET6_ADDRSTRLEN] = { 0 }; /* * Some interfaces might have an empty ifa_addr, such as when using the * PPTP protocol. With a NULL ifa_addr we can't inquire about the IP * address and its netmask to compute any CIDR notation, so we skip the * entry. 
*/ if (ifa->ifa_addr == NULL) { log_debug("Skipping interface \"%s\" with NULL ifa_addr", ifa->ifa_name); continue; } switch (ifa->ifa_addr->sa_family) { case AF_INET: { struct sockaddr_in *netmask4 = (struct sockaddr_in *) ifa->ifa_netmask; struct sockaddr_in *address4 = (struct sockaddr_in *) ifa->ifa_addr; struct in_addr s_network; if (inet_ntop(AF_INET, (void *) &netmask4->sin_addr, netmask, INET_ADDRSTRLEN) == NULL) { /* just skip that entry then */ log_trace("Failed to determine local network CIDR: %m"); continue; } if (inet_ntop(AF_INET, (void *) &address4->sin_addr, address, INET_ADDRSTRLEN) == NULL) { /* just skip that entry then */ log_trace("Failed to determine local network CIDR: %m"); continue; } s_network.s_addr = address4->sin_addr.s_addr & netmask4->sin_addr.s_addr; prefix = countSetBits(netmask4->sin_addr.s_addr); if (inet_ntop(AF_INET, (void *) &s_network, network, INET_ADDRSTRLEN) == NULL) { /* just skip that entry then */ log_trace("Failed to determine local network CIDR: %m"); continue; } break; } case AF_INET6: { int i = 0; struct sockaddr_in6 *netmask6 = (struct sockaddr_in6 *) ifa->ifa_netmask; struct sockaddr_in6 *address6 = (struct sockaddr_in6 *) ifa->ifa_addr; struct in6_addr s_network; if (inet_ntop(AF_INET6, (void *) &netmask6->sin6_addr, netmask, INET6_ADDRSTRLEN) == NULL) { /* just skip that entry then */ log_trace("Failed to determine local network CIDR: %m"); continue; } if (inet_ntop(AF_INET6, (void *) &address6->sin6_addr, address, INET6_ADDRSTRLEN) == NULL) { /* just skip that entry then */ log_trace("Failed to determine local network CIDR: %m"); continue; } for (i = 0; i < sizeof(struct in6_addr); i++) { s_network.s6_addr[i] = address6->sin6_addr.s6_addr[i] & netmask6->sin6_addr.s6_addr[i]; } prefix = countSetBitsv6(netmask6->sin6_addr.s6_addr); if (inet_ntop(AF_INET6, &s_network, network, INET6_ADDRSTRLEN) == NULL) { /* just skip that entry then */ log_trace("Failed to determine local network CIDR: %m"); continue; } break; } 
default: continue; } if (strcmp(address, localIpAddress) == 0) { found = true; break; } } freeifaddrs(ifaddr); if (!found) { return false; } sformat(localCIDR, size, "%s/%d", network, prefix); return true; } /* * countSetBits return how many bits are set (to 1) in an integer. When given a * netmask, that's the CIDR prefix. */ static unsigned int countSetBits(unsigned int n) { unsigned int count = 0; while (n) { count += n & 1; n >>= 1; } return count; } /* * countSetBitsv6 returns how many bits are set (to 1) in an IPv6 address, an * array of 16 unsigned char values. When given a netmask, that's the * prefixlen. */ static unsigned int countSetBitsv6(unsigned char *addr) { int i = 0; unsigned int count = 0; for (i = 0; i < 16; i++) { unsigned char n = addr[i]; while (n) { count += n & 1; n >>= 1; } } return count; } /* * Fetches the IP address of the first non-loopback interface with an ip4 * address. */ static bool fetchIPAddressFromInterfaceList(char *localIpAddress, int size) { bool found = false; struct ifaddrs *ifaddrList = NULL, *ifaddr = NULL; if (getifaddrs(&ifaddr) == -1) { log_error("Failed to get the list of local network inferfaces: %m"); return false; } for (ifaddr = ifaddrList; ifaddr != NULL; ifaddr = ifaddr->ifa_next) { if (ifaddr->ifa_flags & IFF_LOOPBACK) { log_trace("Skipping loopback interface \"%s\"", ifaddr->ifa_name); continue; } /* * Some interfaces might have an empty ifa_addr, such as when using the * PPTP protocol. With a NULL ifa_addr we can't inquire about the IP * address and its netmask to compute any CIDR notation, so we skip the * entry. */ if (ifaddr->ifa_addr == NULL) { log_debug("Skipping interface \"%s\" with NULL ifa_addr", ifaddr->ifa_name); continue; } /* * We only support IPv4 here, also this function is only called in test * environment where we run in a docker container with a network * namespace in which we use only IPv4, so that's ok. 
*/ if (ifaddr->ifa_addr->sa_family == AF_INET) { struct sockaddr_in *ip = (struct sockaddr_in *) ifaddr->ifa_addr; if (inet_ntop(AF_INET, (void *) &(ip->sin_addr), localIpAddress, size) == NULL) { /* skip that address, silently */ log_trace("Failed to determine local network CIDR: %m"); continue; } found = true; break; } } freeifaddrs(ifaddrList); return found; } /* * From /Users/dim/dev/PostgreSQL/postgresql/src/backend/libpq/hba.c */ static bool ipv4eq(struct sockaddr_in *a, struct sockaddr_in *b) { return (a->sin_addr.s_addr == b->sin_addr.s_addr); } /* * From /Users/dim/dev/PostgreSQL/postgresql/src/backend/libpq/hba.c */ static bool ipv6eq(struct sockaddr_in6 *a, struct sockaddr_in6 *b) { int i; for (i = 0; i < 16; i++) { if (a->sin6_addr.s6_addr[i] != b->sin6_addr.s6_addr[i]) { return false; } } return true; } /* * findHostnameLocalAddress does a reverse DNS lookup given a hostname * (--hostname), and if the DNS lookup fails or doesn't return any local IP * address, then returns false. */ bool findHostnameLocalAddress(const char *hostname, char *localIpAddress, int size) { struct addrinfo *dns_lookup_addr; struct addrinfo *dns_addr; struct ifaddrs *ifaddrList, *ifaddr; if (!GetAddrInfo(hostname, NULL, 0, &dns_lookup_addr)) { /* errors have already been logged */ return false; } /* * Loop over DNS results for the given hostname. Filter out loopback * devices, and for each IP address given by the look-up, check if we * have a corresponding local interface bound to the IP address. */ if (getifaddrs(&ifaddrList) == -1) { log_warn("Failed to get the list of local network inferfaces: %m"); return false; } /* * Compare both addresses list (dns lookup and list of interface * addresses) in a nested loop fashion: lists are not sorted, and we * expect something like a dozen entry per list anyway. 
*/ for (dns_addr = dns_lookup_addr; dns_addr != NULL; dns_addr = dns_addr->ai_next) { for (ifaddr = ifaddrList; ifaddr != NULL; ifaddr = ifaddr->ifa_next) { /* * Some interfaces might have an empty ifa_addr, such as when using * the PPTP protocol. With a NULL ifa_addr we can't inquire about * the IP address and its netmask to compute any CIDR notation, so * we skip the entry. */ if (ifaddr->ifa_addr == NULL) { log_debug("Skipping interface \"%s\" with NULL ifa_addr", ifaddr->ifa_name); continue; } if (ifaddr->ifa_addr->sa_family == AF_INET && dns_addr->ai_family == AF_INET) { struct sockaddr_in *ip = (struct sockaddr_in *) ifaddr->ifa_addr; if (ipv4eq(ip, (struct sockaddr_in *) dns_addr->ai_addr)) { /* * Found an IP address in the DNS answer that * matches one of the interfaces IP addresses on * the machine. */ freeaddrinfo(dns_lookup_addr); if (inet_ntop(AF_INET, (void *) &(ip->sin_addr), localIpAddress, size) == NULL) { log_warn("Failed to determine local ip address: %m"); freeifaddrs(ifaddrList); return false; } freeifaddrs(ifaddrList); return true; } } else if (ifaddr->ifa_addr->sa_family == AF_INET6 && dns_addr->ai_family == AF_INET6) { struct sockaddr_in6 *ip = (struct sockaddr_in6 *) ifaddr->ifa_addr; if (ipv6eq(ip, (struct sockaddr_in6 *) dns_addr->ai_addr)) { /* * Found an IP address in the DNS answer that * matches one of the interfaces IP addresses on * the machine. */ freeaddrinfo(dns_lookup_addr); if (inet_ntop(AF_INET6, (void *) &(ip->sin6_addr), localIpAddress, size) == NULL) { /* check size >= INET6_ADDRSTRLEN */ log_warn("Failed to determine local ip address: %m"); freeifaddrs(ifaddrList); return false; } freeifaddrs(ifaddrList); return true; } } } } freeifaddrs(ifaddrList); freeaddrinfo(dns_lookup_addr); return false; } /* * ip_address_type parses the hostname and determines whether it is an IPv4 * address, IPv6 address, or DNS name. * * To edit pg HBA file, when given an IP address (rather than a hostname), we * need to compute the CIDR mask. 
In the case of ipv4, that's /32, in the case * of ipv6, that's /128. The `ip_address_type' function discovers which type of * IP address we are dealing with. */ IPType ip_address_type(const char *hostname) { struct in_addr ipv4; struct in6_addr ipv6; if (hostname == NULL) { return IPTYPE_NONE; } else if (inet_pton(AF_INET, hostname, &ipv4) == 1) { log_trace("hostname \"%s\" is ipv4", hostname); return IPTYPE_V4; } else if (inet_pton(AF_INET6, hostname, &ipv6) == 1) { log_trace("hostname \"%s\" is ipv6", hostname); return IPTYPE_V6; } return IPTYPE_NONE; } /* * findHostnameFromLocalIpAddress does a reverse DNS lookup from a given IP * address, and returns the first hostname of the DNS response. */ bool findHostnameFromLocalIpAddress(char *localIpAddress, char *hostname, int size) { char hbuf[NI_MAXHOST]; struct addrinfo *lookup, *ai; /* parse ipv4 or ipv6 address using getaddrinfo() */ if (!GetAddrInfo(localIpAddress, NULL, 0, &lookup)) { /* errors have already been logged */ return false; } /* now reverse lookup (NI_NAMEREQD) the address with getnameinfo() */ for (ai = lookup; ai; ai = ai->ai_next) { int ret = getnameinfo(ai->ai_addr, ai->ai_addrlen, hbuf, sizeof(hbuf), NULL, 0, NI_NAMEREQD); if (ret != 0) { log_warn("Failed to resolve hostname from address \"%s\": %s", localIpAddress, gai_strerror(ret)); return false; } sformat(hostname, size, "%s", hbuf); /* stop at the first hostname found */ break; } freeaddrinfo(lookup); return true; } /* * resolveHostnameForwardAndReverse returns true when we could do a forward DNS * lookup for the hostname and one of the IP addresses from the lookup resolves * back to the hostname when doing a reverse-DNS lookup from it. * * When Postgres runs the DNS checks in the HBA implementation, the client IP * address is looked-up in a reverse DNS query, and that name is compared to * the hostname in the HBA file. 
Then, a forward DNS query is performed on the * hostname, and one of the IP addresses returned must match with the client IP * address. * * client ip -- reverse dns lookup --> hostname * hostname -- forward dns lookup --> { ... client ip ... } * * At this point we don't have a client IP address. That said, the Postgres * check will always fail if we fail to get our hostname back from at least one * of the IP addresses that our hostname forward-DNS query returns. */ bool resolveHostnameForwardAndReverse(const char *hostname, char *ipaddr, int size, bool *foundHostnameFromAddress) { struct addrinfo *lookup, *ai; *foundHostnameFromAddress = false; if (!GetAddrInfo(hostname, NULL, 0, &lookup)) { /* errors have already been logged */ return false; } /* when everything fails, we return a proper empty string buffer */ bzero((void *) ipaddr, size); /* loop over the forward DNS results for hostname */ for (ai = lookup; ai; ai = ai->ai_next) { char candidateIPAddr[BUFSIZE] = { 0 }; char hbuf[NI_MAXHOST] = { 0 }; if (!ipaddr_sockaddr_to_string(ai, candidateIPAddr, BUFSIZE)) { /* errors have already been logged */ continue; } /* keep the first IP address of the list */ if (IS_EMPTY_STRING_BUFFER(ipaddr)) { strlcpy(ipaddr, candidateIPAddr, size); } log_debug("%s has address %s", hostname, candidateIPAddr); /* now reverse lookup (NI_NAMEREQD) the address with getnameinfo() */ int ret = getnameinfo(ai->ai_addr, ai->ai_addrlen, hbuf, sizeof(hbuf), NULL, 0, NI_NAMEREQD); if (ret != 0) { log_debug("Failed to resolve hostname from address \"%s\": %s", ipaddr, gai_strerror(ret)); continue; } log_debug("reverse lookup for \"%s\" gives \"%s\" first", candidateIPAddr, hbuf); /* compare reverse-DNS lookup result with our hostname */ if (strcmp(hbuf, hostname) == 0) { *foundHostnameFromAddress = true; break; } } freeaddrinfo(lookup); return true; } /* * ipaddr_sockaddr_to_string converts a binary socket address to its string * representation using inet_ntop(3). 
*/ static bool ipaddr_sockaddr_to_string(struct addrinfo *ai, char *ipaddr, size_t size) { if (ai->ai_family == AF_INET) { struct sockaddr_in *ip = (struct sockaddr_in *) ai->ai_addr; if (inet_ntop(AF_INET, (void *) &(ip->sin_addr), ipaddr, size) == NULL) { log_debug("Failed to determine local ip address: %m"); return false; } } else if (ai->ai_family == AF_INET6) { struct sockaddr_in6 *ip = (struct sockaddr_in6 *) ai->ai_addr; if (inet_ntop(AF_INET6, (void *) &(ip->sin6_addr), ipaddr, size) == NULL) { log_debug("Failed to determine local ip address: %m"); return false; } } else { /* Highly unexpected */ log_debug("Non supported ai_family %d", ai->ai_family); return false; } return true; } /* * ipaddr_getsockname gets the IP address "name" from a connected socket. */ static bool ipaddr_getsockname(int sock, char *ipaddr, size_t size) { struct sockaddr_storage address = { 0 }; socklen_t sockaddrlen = sizeof(address); int err = getsockname(sock, (struct sockaddr *) (&address), &sockaddrlen); if (err < 0) { log_warn("Failed to get IP address from socket: %m"); return false; } if (address.ss_family == AF_INET) { struct sockaddr_in *ip = (struct sockaddr_in *) &address; if (inet_ntop(AF_INET, (void *) &(ip->sin_addr), ipaddr, size) == NULL) { log_debug("Failed to determine local ip address: %m"); return false; } } else if (address.ss_family == AF_INET6) { struct sockaddr_in6 *ip = (struct sockaddr_in6 *) &address; if (inet_ntop(AF_INET6, (void *) &(ip->sin6_addr), ipaddr, size) == NULL) { log_debug("Failed to determine local ip address: %m"); return false; } } else { log_debug("Non supported ss_family %d", address.ss_family); return false; } return true; } /* * ipaddrGetLocalHostname uses gethostname(3) to get the current machine * hostname. We only use the result from gethostname(3) when in turn we can * resolve the result to an IP address that is present on the local machine. 
* * Failing to match the hostname to a local IP address, we then use the default * lookup service name and port instead (we would then connect to a google * provided DNS service to see what is the default network interface/source * address to connect to a remote endpoint; to avoid any of that process just * using pg_autoctl with the --hostname option). */ bool ipaddrGetLocalHostname(char *hostname, size_t size) { char localIpAddress[BUFSIZE] = { 0 }; char hostnameCandidate[_POSIX_HOST_NAME_MAX] = { 0 }; if (gethostname(hostnameCandidate, sizeof(hostnameCandidate)) == -1) { log_warn("Failed to get local hostname: %m"); return false; } log_debug("ipaddrGetLocalHostname: \"%s\"", hostnameCandidate); /* do a lookup of the host name and see that we get a local address back */ if (!findHostnameLocalAddress(hostnameCandidate, localIpAddress, BUFSIZE)) { log_warn("Failed to get a local IP address for hostname \"%s\"", hostnameCandidate); return false; } strlcpy(hostname, hostnameCandidate, size); return true; } /* * GetAddrInfo calls getaddrinfo and implement a retry policy in case we get a * transient failure from the system. And for kubernetes compatibility, we also * retry when the plain EAI_FAIL error code is returned, because DNS entries in * this environments are dynamic. */ static bool GetAddrInfo(const char *restrict node, const char *restrict service, const struct addrinfo *restrict hints, struct addrinfo **restrict res) { bool success = false; ConnectionRetryPolicy retryPolicy = { 0 }; (void) pgsql_set_interactive_retry_policy(&retryPolicy); while (!pgsql_retry_policy_expired(&retryPolicy)) { int error = getaddrinfo(node, service, hints, res); /* * Given docker/kubernetes environments, we treat permanent DNS * failures (EAI_FAIL) as a retryable condition, same as EAI_AGAIN. 
*/ if (error != 0 && error != EAI_AGAIN && error != EAI_FAIL) { log_warn("Failed to resolve DNS name \"%s\": %s", node, gai_strerror(error)); return false; } else if (error != 0) { log_debug("Failed to resolve DNS name \"%s\": %s", node, gai_strerror(error)); } success = (error == 0); if (success) { break; } int sleepTimeMs = pgsql_compute_connection_retry_sleep_time(&retryPolicy); /* we have milliseconds, pg_usleep() wants microseconds */ (void) pg_usleep(sleepTimeMs * 1000); } return success; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/ipaddr.h000066400000000000000000000020331414244367200221670ustar00rootroot00000000000000/* * src/bin/pg_autoctl/ipaddr.h * Find local ip used as source ip in ip packets, using getsockname and a udp * connection. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef __IPADDRH__ #define __IPADDRH__ #include typedef enum { IPTYPE_V4, IPTYPE_V6, IPTYPE_NONE } IPType; IPType ip_address_type(const char *hostname); bool fetchLocalIPAddress(char *localIpAddress, int size, const char *serviceName, int servicePort, int logLevel, bool *mayRetry); bool fetchLocalCIDR(const char *localIpAddress, char *localCIDR, int size); bool findHostnameLocalAddress(const char *hostname, char *localIpAddress, int size); bool findHostnameFromLocalIpAddress(char *localIpAddress, char *hostname, int size); bool resolveHostnameForwardAndReverse(const char *hostname, char *ipaddr, int size, bool *foundHostnameFromAddress); bool ipaddrGetLocalHostname(char *hostname, size_t size); #endif /* __IPADDRH__ */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/keeper.c000066400000000000000000002530041414244367200222000ustar00rootroot00000000000000/* * src/bin/pg_autoctl/keeper.c * Keeper state functions * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include #include "parson.h" #include "cli_common.h" #include "cli_root.h" #include "env_utils.h" #include "file_utils.h" #include "fsm.h" #include "keeper.h" #include "keeper_config.h" #include "keeper_pg_init.h" #include "parsing.h" #include "pghba.h" #include "pgsetup.h" #include "primary_standby.h" #include "signals.h" #include "state.h" #include "runprogram.h" static bool keeper_state_check_postgres(Keeper *keeper, PostgresControlData *control); static void diff_nodesArray(NodeAddressArray *previousNodesArray, NodeAddressArray *currentNodesArray, NodeAddressArray *diffNodesArray); /* * keeper_init initializes the keeper logic according to the given keeper * configuration. It also reads the state file from disk. The state file * must be generated before calling keeper_init. */ bool keeper_init(Keeper *keeper, KeeperConfig *config) { PostgresSetup *pgSetup = &(config->pgSetup); keeper->config = *config; local_postgres_init(&keeper->postgres, pgSetup); if (!config->monitorDisabled) { if (!monitor_init(&keeper->monitor, config->monitor_pguri)) { return false; } } if (!keeper_load_state(keeper)) { /* errors logged in keeper_state_read */ return false; } return true; } /* * keeper_load_state loads the current state of the keeper from the * configured state file. */ bool keeper_load_state(Keeper *keeper) { KeeperStateData *keeperState = &(keeper->state); KeeperConfig *config = &(keeper->config); return keeper_state_read(keeperState, config->pathnames.state); } /* * keeper_store_state stores the current state of the keeper in the configured * state file. */ bool keeper_store_state(Keeper *keeper) { KeeperStateData *keeperState = &(keeper->state); KeeperConfig *config = &(keeper->config); return keeper_state_write(keeperState, config->pathnames.state); } /* * keeper_update_state updates the keeper state and immediately writes * it to disk. 
*/ bool keeper_update_state(Keeper *keeper, int64_t node_id, int group_id, NodeState state, bool update_last_monitor_contact) { KeeperStateData *keeperState = &(keeper->state); uint64_t now = time(NULL); if (update_last_monitor_contact) { keeperState->last_monitor_contact = now; } /* * See state.h for details about why this test. We could migrate the state * nodeId to an int64_t, but that's a TODO item still at this point. It * would require being able to read the old state format on-disk and * convert automatically to the new one in-memory. */ if (node_id >= LONG_MAX) { log_fatal("Current node id does not fit in a 32 bits integer."); log_info("Please report a bug to pg_auto_failover by opening " "an issue on Github project at " "https://github.com/citusdata/pg_auto_failover."); return false; } keeperState->current_node_id = node_id; keeperState->current_group = group_id; keeperState->assigned_role = state; if (!keeper_store_state(keeper)) { /* keeper_state_write logs errors */ return false; } log_keeper_state(keeperState); return true; } /* * keeper_should_ensure_current_state returns true when pg_autoctl should * ensure that Postgres is running, or not running, depending on the current * FSM state, before calling the transition function to the next state. * * At the moment, the only cases when we DON'T want to ensure the current state * are when either the current state or the goal state are one of the following: * * - DRAINING * - DEMOTED * - DEMOTE TIMEOUT * * That's because we would then stop Postgres first when going from DEMOTED to * SINGLE, or ensure Postgres is running when going from PRIMARY to DEMOTED. * This last example is a split-brain hazard, too. 
*/ bool keeper_should_ensure_current_state_before_transition(Keeper *keeper) { KeeperStateData *keeperState = &(keeper->state); if (keeperState->assigned_role == keeperState->current_role) { /* this function should not be called in that case */ log_debug("BUG: keeper_should_ensure_current_state_before_transition " "called with assigned role == current role == %s", NodeStateToString(keeperState->assigned_role)); return false; } if (keeperState->assigned_role == DRAINING_STATE || keeperState->assigned_role == DEMOTE_TIMEOUT_STATE || keeperState->assigned_role == DEMOTED_STATE) { /* don't ensure Postgres is running before shutting it down */ return false; } if (keeperState->current_role == DRAINING_STATE || keeperState->current_role == DEMOTE_TIMEOUT_STATE || keeperState->current_role == DEMOTED_STATE) { /* don't ensure Postgres is down before starting it again */ return false; } if (keeperState->current_role == SECONDARY_STATE && keeperState->assigned_role != SECONDARY_STATE) { /* * We might have a different primary server to reconnect to, or be * asked to report lsn, etc. Ensuring the secondary state does not * sound productive there. */ return false; } /* in all other cases, yes please ensure the current state */ return true; } /* * keeper_ensure_current_state ensures that the current keeper's state is met * with the current PostgreSQL status, at minimum that PostgreSQL is running * when it's expected to be, etc. */ bool keeper_ensure_current_state(Keeper *keeper) { KeeperStateData *keeperState = &(keeper->state); LocalPostgresServer *postgres = &(keeper->postgres); log_debug("Ensuring current state: %s", NodeStateToString(keeperState->current_role)); switch (keeperState->current_role) { /* * When in primary state, publishing that PostgreSQL is down might * trigger a failover. This is the best solution only when we tried * everything else. So first, retry starting PostgreSQL a couple more * times. 
* * See configuration parameters: * * timeout.postgresql_fails_to_start_timeout (default 20s) * timeout.postgresql_fails_to_start_retries (default 3 times) */ case SINGLE_STATE: case PRIMARY_STATE: case WAIT_PRIMARY_STATE: case JOIN_PRIMARY_STATE: case APPLY_SETTINGS_STATE: { if (!keeper_ensure_postgres_is_running(keeper, true)) { /* errors have already been logged */ return false; } /* when a standby has been removed, remove its replication slot */ return keeper_create_and_drop_replication_slots(keeper); } /* * In the following states, we don't want to maintain local replication * slots, either because we're a primary and the replication protocol * is taking care of that, or because we're in the middle of changing * the replication upstream node. */ case PREP_PROMOTION_STATE: case STOP_REPLICATION_STATE: { return keeper_ensure_postgres_is_running(keeper, false); } case SECONDARY_STATE: case REPORT_LSN_STATE: { bool updateRetries = false; if (!keeper_ensure_postgres_is_running(keeper, updateRetries)) { /* errors have already been logged */ return false; } /* now ensure progress is made on the replication slots */ return keeper_maintain_replication_slots(keeper); } /* * We don't maintain replication slots in CATCHINGUP state. We might * not be in a position to pg_replication_slot_advance() the slot to * the position required by the other standby nodes. 
Typically we would * get a Postgres error such as the following: * * cannot advance replication slot to 0/5000060, minimum is 0/6000028 */ case CATCHINGUP_STATE: { bool updateRetries = false; return keeper_ensure_postgres_is_running(keeper, updateRetries); } case DEMOTED_STATE: case DEMOTE_TIMEOUT_STATE: case DRAINING_STATE: { if (postgres->pgIsRunning) { log_warn("PostgreSQL is running while in state \"%s\", " "stopping PostgreSQL.", NodeStateToString(keeperState->current_role)); return ensure_postgres_service_is_stopped(postgres); } return true; } case MAINTENANCE_STATE: default: /* nothing to be done here */ return true; } /* should never happen */ return false; } /* * reportPgIsRunning returns the boolean that we should use to report * pgIsRunning to the monitor. When the local PostgreSQL isn't running, we * continue reporting that it is for some time, depending on the following * configuration parameters: * * timeout.postgresql_restart_failure_timeout (default 20s) * timeout.postgresql_restart_failure_max_retries (default 3 times) */ bool ReportPgIsRunning(Keeper *keeper) { KeeperStateData *keeperState = &(keeper->state); KeeperConfig *config = &(keeper->config); LocalPostgresServer *postgres = &(keeper->postgres); int retries = config->postgresql_restart_failure_max_retries; int timeout = config->postgresql_restart_failure_timeout; uint64_t now = time(NULL); if (keeperState->current_role != PRIMARY_STATE) { /* * Only when in the PRIMARY_STATE is the monitor going to consider a * failover to another node. That's when we should be careful about * having attempted all we could before resigning. * * When we're not in PRIMARY_STATE, then it's ok to immediately report * that PostgreSQL is not running, for immediate decision making on the * monitor's side. */ return postgres->pgIsRunning; } /* * Now we know the current state is PRIMARY_STATE. If PostgreSQL is * running, then we simply report that, easy. 
*/ if (postgres->pgIsRunning) { return postgres->pgIsRunning; } else if (postgres->pgFirstStartFailureTs == 0) { /* * Oh, that's quite strange. It means we just fell in a code path where * pgIsRunning is set to false, and didn't call * ensure_local_postgres_is_running() to restart it. */ log_debug("ReportPgIsRunning: PostgreSQL is not running, " "and has not been restarted."); return postgres->pgIsRunning; } else if ((now - postgres->pgFirstStartFailureTs) > timeout || postgres->pgStartRetries >= retries) { /* * If we fail to restart PostgreSQL 3 times in a row within the last 20 * seconds (default values), then report the failure to the monitor for * immediate action (failover, depending on the secondary health & * reporting). */ log_error("Failed to restart PostgreSQL %d times in the " "last %" PRIu64 "s, reporting PostgreSQL not running to " "the pg_auto_failover monitor.", postgres->pgStartRetries, now - postgres->pgFirstStartFailureTs); return false; } else { /* * Don't tell the monitor yet, pretend PostgreSQL is running: we might * be able to get the service back running, it's too early for a * failover to be our best option yet. */ log_warn("PostgreSQL failed to start %d/%d times before " "reporting to the monitor, trying again", postgres->pgStartRetries, retries); return true; } /* we never reach this point. */ } /* * keeper_update_pg_state updates our internal reflection of the PostgreSQL * state. * * It returns true when we could successfully update the PostgreSQL state and * everything makes sense, and false when either we failed to update the state, * or when there's a serious problem with PostgreSQL and our expections are not * met. 
Examples of returning false include: * - Postgres is running on a different port than configured * - Postgres system identifier has changed from our keeper state * - We failed to obtain the replication state from pg_stat_replication */ bool keeper_update_pg_state(Keeper *keeper, int logLevel) { KeeperStateData *keeperState = &(keeper->state); KeeperConfig *config = &(keeper->config); PostgresSetup *pgSetup = &(keeper->postgres.postgresSetup); LocalPostgresServer *postgres = &(keeper->postgres); PGSQL *pgsql = &(postgres->sqlClient); bool pgIsNotRunningIsOk = true; log_debug("Update local PostgreSQL state"); /* reinitialize the replication state values each time we update */ postgres->pgIsRunning = false; memset(postgres->pgsrSyncState, 0, PGSR_SYNC_STATE_MAXLENGTH); strlcpy(postgres->currentLSN, "0/0", sizeof(postgres->currentLSN)); /* when running with --disable-monitor, we might get here early */ if (keeperState->current_role == INIT_STATE) { return true; } *pgSetup = config->pgSetup; /* * When PostgreSQL is running, do some extra checks that are going to be * helpful to drive the keeper's FSM decision making. */ if (pg_setup_is_ready(pgSetup, pgIsNotRunningIsOk)) { char connInfo[MAXCONNINFO]; if (pgSetup->pidFile.port != config->pgSetup.pgport) { log_fatal("PostgreSQL is expected to run on port %d, " "found to be running on port %d", config->pgSetup.pgport, pgSetup->pidFile.port); return false; } /* we know now that Postgres is running (and ready) */ postgres->pgIsRunning = true; /* * Reinitialize connection string in case host changed or was first * discovered. */ pg_setup_get_local_connection_string(pgSetup, connInfo); pgsql_init(pgsql, connInfo, PGSQL_CONN_LOCAL); /* * Update our Postgres metadata now. * * First, update our cache of file path locations for Postgres * configuration files (including HBA), in case it's been moved to * somewhere else. 
This could happen when using the debian/ubuntu * pg_createcluster command on an already existing cluster, for * instance. * * Also update our view of pg_is_in_recovery, the replication sync * state when we are a primary with a standby currently using our * replication slot, our current LSN position, and the control data * values (pg_control_version, catalog_version_no, and * system_identifier). */ if (!pgsql_get_postgres_metadata(pgsql, &pgSetup->is_in_recovery, postgres->pgsrSyncState, postgres->currentLSN, &(pgSetup->control))) { log_level(logLevel, "Failed to update the local Postgres metadata"); return false; } if (!keeper_state_check_postgres(keeper, &(pgSetup->control))) { log_level(logLevel, "Failed to update the local Postgres metadata, " "see above for details"); return false; } /* update the state from the metadata we just obtained */ keeperState->pg_control_version = pgSetup->control.pg_control_version; keeperState->catalog_version_no = pgSetup->control.catalog_version_no; keeperState->system_identifier = pgSetup->control.system_identifier; } else { /* Postgres is not running. */ postgres->pgIsRunning = false; /* * Cache invalidation: keep the current values we have for the Postgres * characteristics, when we already have them, or fetch them anew using * pg_controldata. */ if (keeperState->pg_control_version != 0) { pgSetup->control.pg_control_version = keeperState->pg_control_version; pgSetup->control.catalog_version_no = keeperState->catalog_version_no; pgSetup->control.system_identifier = keeperState->system_identifier; } else { /* Postgres is not running and we have yet to call pg_controldata */ const bool missingPgdataIsOk = false; if (!pg_controldata(pgSetup, missingPgdataIsOk)) { /* errors have already been logged */ return false; } } } /* * In some states, PostgreSQL isn't expected to be running, or not expected * to have a streaming replication to monitor at all. 
*/ switch (keeperState->current_role) { case WAIT_PRIMARY_STATE: { /* we don't expect to have a streaming replica */ return postgres->pgIsRunning; } case PRIMARY_STATE: { /* * We expect to be able to read the current LSN, as always when * Postgres is running, and we also expect replication to be in * place when in PRIMARY state. * * On the primary, we use pg_stat_replication.sync_state to have an * idea of how the replication is going. The query we use in * pgsql_get_postgres_metadata should always return a non-empty * string when we are a PRIMARY and our standby is connected. */ if (IS_EMPTY_STRING_BUFFER(postgres->pgsrSyncState)) { log_level(logLevel, "Failed to fetch current replication properties " "from standby node: no standby connected in " "pg_stat_replication."); log_level(logLevel, "HINT: check pg_autoctl and Postgres logs on " "standby nodes"); } return postgres->pgIsRunning && !IS_EMPTY_STRING_BUFFER(postgres->currentLSN) && !IS_EMPTY_STRING_BUFFER(postgres->pgsrSyncState); } case SECONDARY_STATE: case CATCHINGUP_STATE: { /* pg_stat_replication.sync_state is only available upstream */ bool success = postgres->pgIsRunning; if (!success) { log_level(logLevel, "Postgres is %s and we are in state %s", postgres->pgIsRunning ? "running" : "not running", NodeStateToString(keeperState->current_role)); } return success; } default: { /* we don't need to check replication state in those states */ break; } } return true; } /* * keeper_state_check_postgres checks that the Postgres control data main * properties are still as we expect them to be. At the moment we don't support * Postgres minor and major upgrades, and we can't support the system * identifier ever changing. 
*/ static bool keeper_state_check_postgres(Keeper *keeper, PostgresControlData *control) { KeeperStateData *keeperState = &(keeper->state); /* * We got new control data from either running pg_controldata or connecting * to the local Postgres instance and running our * pgsql_get_postgres_metadata() SQL query. In either case we now need to * update our Keeper State with the control data values. */ if (keeperState->system_identifier != control->system_identifier && keeperState->system_identifier != 0) { /* * This is a physical replication deal breaker, so it's mighty * confusing to get that here. In the least, the keeper should get * initialized from scratch again, but basically, we don't know what we * are doing anymore. */ log_error("Unknown PostgreSQL system identifier: %" PRIu64 ", " "expected %" PRIu64, keeperState->system_identifier, control->system_identifier); return false; } if (keeperState->pg_control_version != control->pg_control_version && keeperState->pg_control_version != 0) { /* Postgres minor upgrade happened */ log_warn("PostgreSQL version changed from %u to %u", keeperState->pg_control_version, control->pg_control_version); } if (keeperState->catalog_version_no != control->catalog_version_no && keeperState->catalog_version_no != 0) { /* Postgres major upgrade happened */ log_warn("PostgreSQL catalog version changed from %u to %u", keeperState->catalog_version_no, control->catalog_version_no); } return true; } /* * keeper_restart_postgres asks the Postgres controller process to stop and * then to restart Postgres. * * TODO: At the moment we just ensure postgres is stopped, and when that's the * case, ensure it's running again. It would arguably be more efficient to send * the explicit order to restart Postgres on the Postgres controller process * though. 
*/ bool keeper_restart_postgres(Keeper *keeper) { LocalPostgresServer *postgres = &(keeper->postgres); log_info("Restarting Postgres at \"%s\"", postgres->postgresSetup.pgdata); if (ensure_postgres_service_is_stopped(postgres)) { bool updateRetries = false; return keeper_ensure_postgres_is_running(keeper, updateRetries); } return false; } /* * keeper_ensure_postgres_is_running ensures that Postgres is running. */ bool keeper_ensure_postgres_is_running(Keeper *keeper, bool updateRetries) { PostgresSetup *pgSetup = &(keeper->postgres.postgresSetup); LocalPostgresServer *postgres = &(keeper->postgres); if (postgres->pgIsRunning) { if (updateRetries) { /* reset PostgreSQL restart failures tracking */ postgres->pgFirstStartFailureTs = 0; postgres->pgStartRetries = 0; } return true; } else if (ensure_postgres_service_is_running(postgres)) { log_warn("PostgreSQL was not running, restarted with pid %d", pgSetup->pidFile.pid); return true; } else { log_error("Failed to restart PostgreSQL, " "see PostgreSQL logs for instance at \"%s\".", pgSetup->pgdata); return false; } } /* * keeper_create_self_signed_cert creates SSL self-signed certificates if * needed within the current configuration, and then makes sure we update our * keeper configuration both in-memory and on-disk with the new normalized * filenames of the certificate files created. 
 */
bool
keeper_create_self_signed_cert(Keeper *keeper)
{
	KeeperConfig *config = &(keeper->config);
	LocalPostgresServer *postgres = &(keeper->postgres);
	PostgresSetup *pgSetup = &(postgres->postgresSetup);

	/* only create the files when asked to, and when they don't both exist */
	if (pgSetup->ssl.createSelfSignedCert &&
		!(file_exists(pgSetup->ssl.serverKey) &&
		  file_exists(pgSetup->ssl.serverCert)))
	{
		if (!pg_create_self_signed_cert(pgSetup, config->hostname))
		{
			log_error("Failed to create SSL self-signed certificate, "
					  "see above for details");
			return false;
		}
	}

	/* ensure the SSL setup is synced with the keeper config */
	config->pgSetup.ssl = pgSetup->ssl;

	/* update our configuration with ssl server.{key,cert} */
	if (!keeper_config_write_file(config))
	{
		/* errors have already been logged */
		return false;
	}

	return true;
}


/*
 * keeper_ensure_configuration updates the Postgres settings to match the
 * pg_autoctl configuration file, if necessary.
 *
 * This includes making sure that the SSL server.{key,cert} files are used in
 * the Postgres configuration, and on a secondary server, that means updating
 * the primary_conninfo connection string to make sure we use the proper
 * sslmode that is setup.
 *
 * This could change anytime with `pg_autoctl enable|disable ssl`. We cache the
 * primary node information in the LocalPostgresServer with the other
 * replicationSource parameters, and the monitor has the responsibility to
 * instruct us when this cache needs to be invalidated (new primary, etc).
 */
bool
keeper_ensure_configuration(Keeper *keeper, bool postgresNotRunningIsOk)
{
	KeeperConfig *config = &(keeper->config);
	KeeperStateData *state = &(keeper->state);
	LocalPostgresServer *postgres = &(keeper->postgres);
	PostgresSetup *pgSetup = &(postgres->postgresSetup);

	/*
	 * We just reloaded our configuration file from disk. Use the pgSetup from
	 * the new configuration to re-init our local postgres instance
	 * information, including a maybe different SSL setup.
	 */
	postgres->postgresSetup = config->pgSetup;

	if (!keeper_config_update(config,
							  state->current_node_id,
							  state->current_group))
	{
		log_error("Failed to update configuration");
		return false;
	}

	if (!local_postgres_update(postgres, postgresNotRunningIsOk))
	{
		log_error("Failed to reload configuration, see above for details");
		return false;
	}

	/*
	 * We might have to deploy a new Postgres configuration, from new SSL
	 * options being found in our pg_autoctl configuration file or for other
	 * reasons.
	 */
	if (!postgres_add_default_settings(postgres, config->hostname))
	{
		log_warn("Failed to edit Postgres configuration after "
				 "reloading pg_autoctl configuration, "
				 "see above for details");
		return false;
	}

	/*
	 * In pg_auto_failover before version 1.3 we would use pg_basebackup with
	 * the --write-recovery-conf option. Starting with Postgres 12, this option
	 * would cause pg_basebackup to edit postgresql.auto.conf rather than
	 * recovery.conf... meaning that our own setup would not have any effect.
	 *
	 * Now is a good time to clean-up, at reload, and either on a primary or a
	 * secondary, because those parameters should not remain set on a primary
	 * either.
	 *
	 * At start-up, we call reload_configuration() before having contacted the
	 * monitor, so Postgres is not running yet. When Postgres is not running we
	 * can't ALTER SYSTEM to clean-up the primary_conninfo and
	 * primary_slot_name, so we skip that step.
	 *
	 * At start-up we don't need to reload the configuration by calling the SQL
	 * function pg_reload_conf() because Postgres is not running yet, it will
	 * start with the new setup already.
	 */
	if (pg_setup_is_running(pgSetup))
	{
		/* ALTER SYSTEM RESET is only available from Postgres 12 onward */
		if (state->pg_control_version >= 1200)
		{
			/* errors are logged already, and non-fatal to this function */
			(void) pgsql_reset_primary_conninfo(&(postgres->sqlClient));
		}

		if (!pgsql_reload_conf(&(postgres->sqlClient)))
		{
			log_warn("Failed to reload Postgres configuration after "
					 "reloading pg_autoctl configuration, "
					 "see above for details");
			return false;
		}
	}

	if (!config->monitorDisabled)
	{
		if (!monitor_init(&(keeper->monitor), config->monitor_pguri))
		{
			/* we tested already in keeper_config_accept_new, but... */
			log_warn("Failed to contact the monitor because its "
					 "URL is invalid, see above for details");
			return false;
		}
	}

	/*
	 * On a standby server we might have to produce a new recovery settings
	 * file (either recovery.conf or postgresql-auto-failover-standby.conf) and
	 * then restart Postgres.
	 */
	if (state->current_role == CATCHINGUP_STATE ||
		state->current_role == SECONDARY_STATE ||
		state->current_role == MAINTENANCE_STATE)
	{
		ReplicationSource *upstream = &(postgres->replicationSource);

		/* either recovery.conf or AUTOCTL_STANDBY_CONF_FILENAME */
		char *relativeConfPathName =
			state->pg_control_version < 1200
			? "recovery.conf"
			: AUTOCTL_STANDBY_CONF_FILENAME;

		char upstreamConfPath[MAXPGPATH] = { 0 };

		char *currentConfContents = NULL;
		long currentConfSize = 0L;

		char *newConfContents = NULL;
		long newConfSize = 0L;

		/* do we have the primaryNode already? */
		if (IS_EMPTY_STRING_BUFFER(upstream->primaryNode.host))
		{
			if (!keeper_get_primary(keeper, &(upstream->primaryNode)))
			{
				log_error("Failed to update primary_conninfo, "
						  "see above for details");
				return false;
			}
		}

		/*
		 * Read the contents of the standby configuration file now, so that we
		 * only restart Postgres when it has been changed with the next step.
		 */
		join_path_components(upstreamConfPath,
							 pgSetup->pgdata,
							 relativeConfPathName);

		/* to check if replicationSettingsHaveChanged, read current file */
		if (file_exists(upstreamConfPath))
		{
			if (!read_file(upstreamConfPath,
						   &currentConfContents,
						   &currentConfSize))
			{
				/* errors have already been logged */
				return false;
			}
		}

		/* prepare a replicationSource from the primary and our SSL setup */
		if (!standby_init_replication_source(postgres,
											 NULL, /* primaryNode is done */
											 PG_AUTOCTL_REPLICA_USERNAME,
											 config->replication_password,
											 config->replication_slot_name,
											 config->maximum_backup_rate,
											 config->backupDirectory,
											 NULL, /* no targetLSN */
											 config->pgSetup.ssl,
											 state->current_node_id))
		{
			/* can't happen at the moment */
			free(currentConfContents);
			return false;
		}

		/* now setup the replication configuration (primary_conninfo etc) */
		if (!pg_setup_standby_mode(state->pg_control_version,
								   pgSetup->pgdata,
								   pgSetup->pg_ctl,
								   upstream))
		{
			log_error("Failed to setup Postgres as a standby after primary "
					  "connection settings change");
			free(currentConfContents);
			return false;
		}

		/* restart Postgres only when the configuration file has changed */
		if (!read_file(upstreamConfPath, &newConfContents, &newConfSize))
		{
			/* errors have already been logged */
			free(currentConfContents);
			return false;
		}

		/* a missing previous file counts as "changed" */
		bool replicationSettingsHaveChanged =
			currentConfContents == NULL ||
			strcmp(newConfContents, currentConfContents) != 0;

		free(currentConfContents);
		free(newConfContents);

		if (replicationSettingsHaveChanged)
		{
			log_info("Replication settings at \"%s\" have changed, "
					 "restarting Postgres",
					 upstreamConfPath);

			/* a checkpoint makes the restart faster; failure is non-fatal */
			if (!pgsql_checkpoint(&(postgres->sqlClient)))
			{
				log_warn("Failed to CHECKPOINT before restart, "
						 "see above for details");
			}

			if (!keeper_restart_postgres(keeper))
			{
				log_error("Failed to restart Postgres to enable new "
						  "replication settings, see above for details");
				return false;
			}
		}
	}

	return true;
}


/*
 * keeper_create_and_drop_replication_slots drops replication slots that we
 * have on the local Postgres instance
 * when the node is not registered on the
 * monitor anymore (after a pgautofailover.remove_node() has been issued, maybe
 * with the command `pg_autoctl drop node [ --destroy ]`); and creates
 * replication slots for nodes that have been recently registered on the
 * monitor.
 */
bool
keeper_create_and_drop_replication_slots(Keeper *keeper)
{
	LocalPostgresServer *postgres = &(keeper->postgres);
	NodeAddressArray *otherNodesArray = &(keeper->otherNodes);

	log_trace("keeper_create_and_drop_replication_slots");

	if (!postgres_replication_slot_create_and_drop(postgres, otherNodesArray))
	{
		log_error("Failed to maintain replication slots on the local Postgres "
				  "instance, see above for details");
		return false;
	}

	return true;
}


/*
 * keeper_maintain_replication_slots loops over the other standby nodes and
 * advances their replication slots up to the current LSN value known by the
 * monitor.
 */
bool
keeper_maintain_replication_slots(Keeper *keeper)
{
	PostgresSetup *pgSetup = &(keeper->postgres.postgresSetup);
	LocalPostgresServer *postgres = &(keeper->postgres);

	/* do we bypass the whole operation? */
	bool bypass = false;
	bool forceCacheInvalidation = false;

	/*
	 * We would like to maintain replication slots on the standby nodes in a
	 * group by using the function pg_replication_slot_advance(). This ensures
	 * that every node keep a local copy of the WAL files that each other node
	 * might need.
	 *
	 * These WAL files might be necessary in the following two cases:
	 *
	 * - when a primary has been demoted and now rejoins as a secondary, then
	 *   it uses pg_rewind and needs to find the WAL it missed on the new
	 *   primary ; in that case we need the replication slot to have been
	 *   maintained before the failover.
	 *
	 * - when a failover happens with more than one standby, all the standby
	 *   nodes that are not promoted need to follow a new primary node, and for
	 *   that it's best that the new-primary already had a replication slot for
	 *   its new set of standby nodes.
	 *
	 * The pg_replication_slot_advance() function is new in Postgres 11, so we
	 * can't install replication slots on our standby nodes when using Postgres
	 * 10.
	 *
	 * In Postgres 11 and 12, the pg_replication_slot_advance() function has
	 * been buggy for quite some time and prevented WAL recycling on standby
	 * servers, see https://github.com/citusdata/pg_auto_failover/issues/283
	 * for the problem and
	 * https://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=b48df81
	 * for the solution.
	 *
	 * The bug fix appears in the minor releases 12.4 and 11.9. Before that, we
	 * disable the slot maintenance feature of pg_auto_failover.
	 */
	if (pgSetup->control.pg_control_version < 1100)
	{
		/* Postgres 10 does not have pg_replication_slot_advance() */
		bypass = true;
	}
	else
	{
		/*
		 * When running our test suite, we still use replication slots in all
		 * versions of Postgres 11 and 12, for testing purposes.
		 *
		 * We estimate that we are in the test suite when both of
		 * PG_AUTOCTL_DEBUG and PG_REGRESS_SOCK_DIR are set.
		 */
		if (env_exists(PG_AUTOCTL_DEBUG) && env_exists("PG_REGRESS_SOCK_DIR"))
		{
			bypass = false;
		}
		else
		{
			/* checks the running minor version against 11.9 / 12.4 */
			bool maintainSlots =
				pg_setup_standby_slot_supported(pgSetup, LOG_TRACE);

			bypass = !maintainSlots;
		}
	}

	/*
	 * Do we actually want to maintain replication slots on this standby node?
	 */
	if (bypass)
	{
		log_debug("Skipping replication slots on a secondary running %d",
				  pgSetup->control.pg_control_version);
		return true;
	}

	if (!keeper_refresh_other_nodes(keeper, forceCacheInvalidation))
	{
		log_error("Failed to maintain replication slots on the local Postgres "
				  "instance, due to failure to refresh list of other nodes, "
				  "see above for details");
		return false;
	}

	if (!postgres_replication_slot_maintain(postgres, &(keeper->otherNodes)))
	{
		log_error("Failed to maintain replication slots on the local Postgres "
				  "instance, see above for details");
		return false;
	}

	return true;
}


/*
 * keeper_node_active calls pgautofailover.node_active on the monitor.
 */
bool
keeper_node_active(Keeper *keeper, bool doInit,
				   MonitorAssignedState *assignedState)
{
	Monitor *monitor = &(keeper->monitor);
	KeeperConfig *config = &(keeper->config);
	KeeperStateData *keeperState = &(keeper->state);
	LocalPostgresServer *postgres = &(keeper->postgres);

	bool reportPgIsRunning = ReportPgIsRunning(keeper);

	/*
	 * First, connect to the monitor and check we're compatible with the
	 * extension there. An upgrade on the monitor might have happened in
	 * between loops here.
	 *
	 * Note that we don't need a very strong a guarantee about the version
	 * number of the monitor extension, as we have other places in the code
	 * that are protected against "surprises". The worst case would be a race
	 * condition where the extension check passes, and then the monitor is
	 * upgraded, and then we call node_active().
	 *
	 * - The extension on the monitor is protected against running a version
	 *   of the node_active (or any other) function that does not match with
	 *   the SQL level version.
	 *
	 * - Then, if we changed the API without changing the arguments, that
	 *   means we changed what we may return. We are protected against changes
	 *   in number of return values, so we're left with changes within the
	 *   columns themselves. Basically that's a new state that we don't know
	 *   how to handle. In that case we're going to fail to parse it, and at
	 *   next attempt we're going to catch up with the new version number.
	 *
	 * All in all, the worst case is going to be one extra call before we
	 * restart node active process, and an extra error message in the logs
	 * during the live upgrade of pg_auto_failover.
	 */
	MonitorExtensionVersion monitorVersion = { 0 };

	if (!keeper_check_monitor_extension_version(keeper, &monitorVersion))
	{
		/*
		 * We could fail here for two different reasons:
		 *
		 * - if we failed to connect to the monitor (network split, monitor is
		 *   in maintenance or being restarted, etc): in that case just return
		 *   false and have the main loop handle the situation
		 *
		 * - if we could connect to the monitor and then failed to check that
		 *   the version of the monitor is the one we expect, then we're not
		 *   compatible with this monitor and that's a different story.
		 */
		if (monitor->pgsql.status != PG_CONNECTION_OK)
		{
			return false;
		}

		/*
		 * Okay we're not compatible with the current version of the
		 * pgautofailover extension on the monitor. The most plausible scenario
		 * is that the monitor got updated: we're still running e.g. 1.4 and
		 * the monitor is running 1.5.
		 *
		 * In that case we exit, and because the keeper node-active service is
		 * RP_PERMANENT the supervisor is going to restart this process. The
		 * restart happens with fork() and exec(), so it uses the current
		 * version of pg_autoctl binary on disk, which has been updated to e.g.
		 * 1.5 too.
		 *
		 * TL;DR: just exit now, have the service restarted by the supervisor
		 * with the expected version of pg_autoctl that matches the monitor's
		 * extension version.
		 */
		KeeperVersion keeperVersion = { 0 };

		if (!keeper_pg_autoctl_get_version_from_disk(keeper, &keeperVersion))
		{
			/* errors have already been logged */
			return false;
		}

		/*
		 * Only call exit() when the on-disk pg_autoctl required extension
		 * version matches the current monitor extension version, ensuring that
		 * the restart is going to be effective.
		 */
		if (strcmp(monitorVersion.installedVersion,
				   keeperVersion.required_extension_version) == 0)
		{
			log_info("pg_autoctl version \"%s\" with compatibility with "
					 "monitor extension \"%s\" has been found on-disk, "
					 "exiting for a restart of the node-active process.",
					 keeperVersion.pg_autoctl_version,
					 keeperVersion.required_extension_version);

			exit(EXIT_CODE_MONITOR);
		}

		/*
		 * If the monitor is of a different version number than the one
		 * required by this instance of pg_autoctl, and then the on-disk
		 * pg_autoctl binary still reports the same extension version required,
		 * then issue an error now: we don't know how to use the monitor's
		 * protocol.
		 */
		log_warn("pg_autoctl version \"%s\" requires monitor extension "
				 "version \"%s\" and current version on the monitor is \"%s\"",
				 keeperVersion.pg_autoctl_version,
				 keeperVersion.required_extension_version,
				 monitorVersion.installedVersion);

		int pg_autoctl_version = 0;
		int monitor_version = 0;

		/* hint at an upgrade only when the monitor is strictly more recent */
		if (parse_pgaf_extension_version_string(
				monitorVersion.installedVersion,
				&monitor_version) &&

			parse_pgaf_extension_version_string(
				keeperVersion.required_extension_version,
				&pg_autoctl_version) &&

			pg_autoctl_version < monitor_version)
		{
			log_info("HINT: the monitor has been upgraded to the more recent "
					 "version \"%s\", "
					 "\"%s\" needs to be upgraded to the same version",
					 monitorVersion.installedVersion,
					 pg_autoctl_program);
		}

		/* refrain from using our version of the monitor API/protocol */
		return false;
	}

	if (doInit)
	{
		PostgresSetup *pgSetup = &(postgres->postgresSetup);
		uint64_t system_identifier = pgSetup->control.system_identifier;

		if (!monitor_set_group_system_identifier(monitor,
												 keeperState->current_group,
												 system_identifier))
		{
			/* errors have already been logged */
			return false;
		}
	}

	/* We used to output that in INFO every 5s, which is too much chatter */
	log_debug("Calling node_active for node %s/%d/%d with current state: "
			  "%s, "
			  "PostgreSQL %s running, "
			  "sync_state is \"%s\", "
			  "current lsn is \"%s\".",
			  config->formation,
			  keeperState->current_node_id,
			  keeperState->current_group,
			  NodeStateToString(keeperState->current_role),
			  reportPgIsRunning ? "is" : "is not",
			  postgres->pgsrSyncState,
			  postgres->currentLSN);

	/* ensure we use the correct retry policy with the monitor */
	(void) pgsql_set_main_loop_retry_policy(&(monitor->pgsql.retryPolicy));

	/*
	 * Report the current state to the monitor and get the assigned state.
	 */
	return monitor_node_active(monitor,
							   config->formation,
							   keeperState->current_node_id,
							   keeperState->current_group,
							   keeperState->current_role,
							   reportPgIsRunning,
							   postgres->postgresSetup.control.timeline_id,
							   postgres->currentLSN,
							   postgres->pgsrSyncState,
							   assignedState);
}


/*
 * keeper_ensure_node_has_been_dropped checks if the local node is being
 * dropped or has been dropped already from the monitor, and when a drop has
 * been engaged and is not finished, the function implements the remaining
 * steps of the DROP protocol.
 */
bool
keeper_ensure_node_has_been_dropped(Keeper *keeper, bool *dropped)
{
	Monitor *monitor = &(keeper->monitor);
	KeeperConfig *config = &(keeper->config);
	KeeperStateData *keeperState = &(keeper->state);

	*dropped = false;

	if (!keeper_state_read(keeperState, config->pathnames.state))
	{
		/* errors have already been logged */
		return false;
	}

	/* ensure we use the correct retry policy with the monitor */
	(void) pgsql_set_main_loop_retry_policy(&(monitor->pgsql.retryPolicy));

	/* check if the nodeid still exists on the monitor */
	NodeAddressArray nodesArray = { 0 };

	if (!monitor_find_node_by_nodeid(monitor,
									 config->formation,
									 config->groupId,
									 keeperState->current_node_id,
									 &nodesArray))
	{
		log_error("Failed to query monitor to see if node id %d "
				  "has been dropped already",
				  keeperState->current_node_id);
		return false;
	}

	log_debug("keeper_node_has_been_dropped: found %d node by id %d",
			  nodesArray.count,
			  keeperState->current_node_id);

	if (nodesArray.count == 0)
	{
		/* no node found with our nodeid, the drop has been successful */
		*dropped = true;

		/* if the monitor doesn't know about us, we're as good as DROPPED */
		uint64_t now = time(NULL);

		keeperState->last_monitor_contact = now;
		keeperState->current_role = DROPPED_STATE;
		keeperState->assigned_role = DROPPED_STATE;

		return keeper_store_state(keeper);
	}
	else if (nodesArray.count == 1)
	{
		bool doInit = false;
		MonitorAssignedState assignedState = { 0 };

		/* grab our assigned state from the monitor now */
		(void) keeper_update_pg_state(keeper, LOG_DEBUG);

		if (!keeper_node_active(keeper, doInit, &assignedState))
		{
			/* errors have already been logged */
			return false;
		}

		/* drop is complete: both current and assigned roles are DROPPED */
		if (keeperState->current_role == DROPPED_STATE &&
			assignedState.state == DROPPED_STATE)
		{
			*dropped = true;

			uint64_t now = time(NULL);

			keeperState->last_monitor_contact = now;
			keeperState->current_role = DROPPED_STATE;
			keeperState->assigned_role = assignedState.state;

			return keeper_store_state(keeper);
		}

		/* drop in progress: we have been assigned DROPPED, reach it now */
		else if (keeperState->current_role != DROPPED_STATE &&
				 assignedState.state == DROPPED_STATE)
		{
			log_info("Reaching assigned state \"%s\"",
					 NodeStateToString(assignedState.state));

			if (!keeper_fsm_step(keeper))
			{
				/* errors have already been logged */
				return false;
			}

			if (keeperState->current_role == DROPPED_STATE &&
				keeperState->current_role == keeperState->assigned_role)
			{
				*dropped = true;

				/*
				 * Call node_active one last time now: after being assigned
				 * DROPPED, we need to report we reached the state for the
				 * monitor to actually drop this node.
				 */
				(void) keeper_update_pg_state(keeper, LOG_DEBUG);

				if (!keeper_node_active(keeper, doInit, &assignedState))
				{
					/* errors have already been logged */
					return false;
				}
			}

			return true;
		}

		/* we did all the checks we're supposed to, dropped is false */
		return true;
	}
	else
	{
		log_error("BUG: monitor_find_node_by_nodeid returned %d nodes",
				  nodesArray.count);
		return false;
	}

	return false;
}


/*
 * keeper_check_monitor_extension_version checks that the monitor we connect to
 * has an extension version compatible with our expectations.
 */
bool
keeper_check_monitor_extension_version(Keeper *keeper,
									   MonitorExtensionVersion *version)
{
	Monitor *monitor = &(keeper->monitor);

	if (!monitor_get_extension_version(monitor, version))
	{
		/*
		 * Only output a FATAL error message when we could connect and then
		 * failed to get the monitor extension version that we expect.
		 * Connection failures are retried the usual way.
		 */
		if (monitor->pgsql.status == PG_CONNECTION_OK)
		{
			log_fatal("Failed to check version compatibility with the monitor "
					  "extension \"%s\", see above for details",
					  PG_AUTOCTL_MONITOR_EXTENSION_NAME);
		}
		return false;
	}

	/* from a member of the cluster, we don't try to upgrade the extension */
	if (strcmp(version->installedVersion, PG_AUTOCTL_EXTENSION_VERSION) != 0)
	{
		log_info("The monitor at \"%s\" has extension \"%s\" version \"%s\", "
				 "this pg_autoctl version requires version \"%s\".",
				 keeper->config.monitor_pguri,
				 PG_AUTOCTL_MONITOR_EXTENSION_NAME,
				 version->installedVersion,
				 PG_AUTOCTL_EXTENSION_VERSION);
		return false;
	}
	else
	{
		log_trace("The version of extension \"%s\" is \"%s\" on the monitor",
				  PG_AUTOCTL_MONITOR_EXTENSION_NAME,
				  version->installedVersion);
	}

	return true;
}


/*
 * keeper_init_fsm initializes the keeper's local FSM and does nothing more.
 *
 * It's only intended to be used when we are not using a monitor, which means
 * we're going to expose our FSM driving as an HTTP API, and sit there waiting
 * for orders from another software.
 *
 * The function is modeled to look like keeper_register_and_init with the
 * difference that we don't have a monitor to talk to.
 */
bool
keeper_init_fsm(Keeper *keeper)
{
	KeeperConfig *config = &(keeper->config);
	PostgresSetup *pgSetup = &(config->pgSetup);

	/* fake the initial state provided at monitor registration time */
	MonitorAssignedState assignedState = {
		.nodeId = monitorDisabledNodeId,
		.groupId = -1,
		.state = INIT_STATE
	};

	/*
	 * First try to create our state file. The keeper_state_create_file
	 * function may fail if we have no permission to write to the state file
	 * directory or the disk is full. In that case, we stop before having
	 * registered the local PostgreSQL node to the monitor.
	 */
	if (!keeper_state_create_file(config->pathnames.state))
	{
		log_fatal("Failed to create a state file prior to registering the "
				  "node with the monitor, see above for details");
		return false;
	}

	/* now that we have a state on-disk, finish init of the keeper instance */
	if (!keeper_init(keeper, config))
	{
		return false;
	}

	/* initialize FSM state */
	if (!keeper_update_state(keeper,
							 assignedState.nodeId,
							 assignedState.groupId,
							 assignedState.state,
							 false))
	{
		log_error("Failed to update keepers's state");

		/*
		 * Make sure we don't have a corrupted state file around, that could
		 * prevent trying to init again and cause strange errors.
		 */
		unlink_file(config->pathnames.state);

		return false;
	}

	/*
	 * Leave a track record that we're ok to initialize in PGDATA, so that in
	 * case of `pg_autoctl create` being interrupted, we may resume operations
	 * and accept to work on already running PostgreSQL primary instances.
	 */
	if (!keeper_init_state_create(&(keeper->initState),
								  pgSetup,
								  config->pathnames.init))
	{
		/* errors have already been logged */
		return false;
	}

	return true;
}


/*
 * keeper_register_and_init registers the local node to the pg_auto_failover
 * Monitor in the given initialState, and then create the state on-disk with
 * the assigned goal from the Monitor.
 */
bool
keeper_register_and_init(Keeper *keeper, NodeState initialState)
{
	KeeperConfig *config = &(keeper->config);
	PostgresSetup *pgSetup = &(config->pgSetup);
	KeeperStateInit *initState = &(keeper->initState);
	Monitor *monitor = &(keeper->monitor);

	MonitorAssignedState assignedState = { 0 };
	char expectedSlotName[BUFSIZE] = { 0 };

	ConnectionRetryPolicy retryPolicy = { 0 };

	(void) pgsql_set_monitor_interactive_retry_policy(&retryPolicy);

	/*
	 * First try to create our state file. The keeper_state_create_file
	 * function may fail if we have no permission to write to the state file
	 * directory or the disk is full. In that case, we stop before having
	 * registered the local PostgreSQL node to the monitor.
	 *
	 * When using pg_autoctl create postgres on-top of a previously dropped
	 * node, we already have a state file around and we're going to use some of
	 * its content.
	 */
	if (!file_exists(config->pathnames.state))
	{
		if (!keeper_state_create_file(config->pathnames.state))
		{
			log_fatal("Failed to create a state file prior to registering the "
					  "node with the monitor, see above for details");
			return false;
		}
	}

	/* now that we have a state on-disk, finish init of the keeper instance */
	if (!keeper_init(keeper, config))
	{
		return false;
	}

	/*
	 * We implement a specific retry policy for cases where we have a transient
	 * error on the monitor, such as OBJECT_IN_USE which indicates that another
	 * standby is concurrently being added to the same group.
	 *
	 * NOTE(review): if the retry policy expires without a successful
	 * registration, we fall through the loop with a zeroed assignedState —
	 * unlike keeper_register_again, there is no `registered` flag here;
	 * confirm whether pgsql_begin() failing on the next iteration is the only
	 * exit path in practice.
	 */
	(void) pgsql_set_init_retry_policy(&(keeper->monitor.pgsql.retryPolicy));

	while (!pgsql_retry_policy_expired(&retryPolicy))
	{
		bool mayRetry = false;

		/*
		 * When registering to the monitor, we get assigned a nodeId, that we
		 * keep preciously in our state file. We need to have a local version
		 * of the nodeId that is the same one as on the monitor.
		 *
		 * In particular, if we fail to update our local state file, we should
		 * cancel our registration, because there's no way we can re-discover
		 * our nodeId later.
		 *
		 * We register to the monitor in a SQL transaction that we only COMMIT
		 * after we have updated our local state file. If we fail to do so, we
		 * ROLLBACK the transaction, and thus we are not registered to the
		 * monitor and may try again. If we are disconnected halfway through
		 * the registration (process killed, crash, etc), then the server
		 * issues a ROLLBACK for us upon disconnection.
		 */
		if (!pgsql_begin(&(monitor->pgsql)))
		{
			log_error("Failed to open a SQL transaction to register this node");

			unlink_file(config->pathnames.state);
			return false;
		}

		if (monitor_register_node(monitor,
								  config->formation,
								  config->name,
								  config->hostname,
								  config->pgSetup.pgport,
								  config->pgSetup.control.system_identifier,
								  config->pgSetup.dbname,
								  keeper->state.current_node_id,
								  config->groupId,
								  initialState,
								  config->pgSetup.pgKind,
								  config->pgSetup.settings.candidatePriority,
								  config->pgSetup.settings.replicationQuorum,
								  config->pgSetup.citusClusterName,
								  &mayRetry,
								  &assignedState))
		{
			/* registration was successful, break out of the retry loop */
			break;
		}

		if (!mayRetry)
		{
			/* errors have already been logged, remove state file */
			goto rollback;
		}

		int sleepTimeMs =
			pgsql_compute_connection_retry_sleep_time(&retryPolicy);

		log_warn("Failed to register node %s:%d in group %d of "
				 "formation \"%s\" with initial state \"%s\" "
				 "because the monitor is already registering another "
				 "standby, retrying in %d ms",
				 config->hostname, config->pgSetup.pgport,
				 config->groupId, config->formation,
				 NodeStateToString(initialState),
				 sleepTimeMs);

		/*
		 * The current transaction is dead: we caught an ERROR from the
		 * call to pgautofailover.register_node().
		 */
		if (!pgsql_rollback(&(monitor->pgsql)))
		{
			log_error("Failed to ROLLBACK failed register_node transaction "
					  " on the monitor, see above for details.");
			pgsql_finish(&(monitor->pgsql));
			return false;
		}

		/* we have milliseconds, pg_usleep() wants microseconds */
		(void) pg_usleep(sleepTimeMs * 1000);
	}

	/* we might have been assigned a new name */
	strlcpy(config->name, assignedState.name, sizeof(config->name));

	/* initialize FSM state from monitor's answer */
	log_info("Writing keeper state file at \"%s\"", config->pathnames.state);

	if (!keeper_update_state(keeper,
							 assignedState.nodeId,
							 assignedState.groupId,
							 assignedState.state,
							 true))
	{
		log_error("Failed to update keepers's state");
		goto rollback;
	}

	/*
	 * Also update the groupId and replication slot name in the
	 * configuration file.
	 */
	(void) postgres_sprintf_replicationSlotName(assignedState.nodeId,
												expectedSlotName,
												sizeof(expectedSlotName));

	/* also update the groupId in the configuration file. */
	if (!keeper_config_update(config,
							  assignedState.nodeId,
							  assignedState.groupId))
	{
		log_error("Failed to update the configuration file with the groupId: %d",
				  assignedState.groupId);
		goto rollback;
	}

	/*
	 * If we dropped a primary using --force, it's possible that the postgres
	 * state file still says that postgres should be running. In that case
	 * postgres would probably be running now. The problem is that our
	 * fsm_init_primary transation errors out when a postgres is running during
	 * initialization. So if we were dropped and this is the first time
	 * create is run after that, then we first stop postgres and record this
	 * in our postgres state file.
	 */
	if (keeper->state.current_role == DROPPED_STATE &&
		!file_exists(keeper->config.pathnames.init))
	{
		log_info("Making sure postgres was stopped, when it was previously dropped");
		ensure_postgres_service_is_stopped(&keeper->postgres);
	}

	/*
	 * Leave a track record that we're ok to initialize in PGDATA, so that in
	 * case of `pg_autoctl create` being interrupted, we may resume operations
	 * and accept to work on already running PostgreSQL primary instances.
	 */
	if (!keeper_init_state_create(initState,
								  pgSetup,
								  keeper->config.pathnames.init))
	{
		/* errors have already been logged */
		goto rollback;
	}

	if (!pgsql_commit(&(monitor->pgsql)))
	{
		log_error("Failed to COMMIT register_node transaction on the "
				  "monitor, see above for details");

		/* we can't send a ROLLBACK when a COMMIT failed */
		unlink_file(config->pathnames.state);

		pgsql_finish(&(monitor->pgsql));
		return false;
	}

	pgsql_finish(&(monitor->pgsql));

	return true;

rollback:

	/*
	 * Make sure we don't have a corrupted state file around, that could
	 * prevent trying to init again and cause strange errors.
	 */
	unlink_file(config->pathnames.state);

	if (!pgsql_rollback(&(monitor->pgsql)))
	{
		log_error("Failed to ROLLBACK failed register_node transaction "
				  " on the monitor, see above for details.");
	}

	pgsql_finish(&(monitor->pgsql));
	return false;
}


/*
 * keeper_register_again registers the given node again to a given monitor URI,
 * possibly new. This function has been designed to be used from the "enable
 * monitor" command, in such a scenario:
 *
 *   $ pg_autoctl disable monitor --force
 *   $ pg_autoctl enable monitor --monitor postgresql://...
 *
 * The idea is that we have lost the monitor, and we want to re-register nodes
 * to the new empty monitor, without having to stop pg_autoctl nor Postgres.
 */
bool
keeper_register_again(Keeper *keeper)
{
	Monitor *monitor = &(keeper->monitor);
	KeeperConfig *config = &(keeper->config);
	PostgresSetup *pgSetup = &(config->pgSetup);

	MonitorAssignedState assignedState = { 0 };
	ConnectionRetryPolicy retryPolicy = { 0 };

	bool registered = false;

	(void) pgsql_set_monitor_interactive_retry_policy(&retryPolicy);

	/* fetch local metadata for the registration (system_identifier) */
	if (!pgsql_get_postgres_metadata(&(keeper->postgres.sqlClient),
									 &pgSetup->is_in_recovery,
									 keeper->postgres.pgsrSyncState,
									 keeper->postgres.currentLSN,
									 &(pgSetup->control)))
	{
		log_error("Failed to get the local Postgres metadata");
		return false;
	}

	/* a node in recovery re-registers as a standby, otherwise as primary */
	NodeState initialState =
		pgSetup->is_in_recovery ? WAIT_STANDBY_STATE : SINGLE_STATE;

	/*
	 * Now register to the new monitor from this "client-side" process, and
	 * then signal the background pg_autoctl service for this node (if any) to
	 * reload its configuration so that it starts calling node_active() to the
	 * new monitor.
	 */
	(void) pgsql_set_init_retry_policy(&(monitor->pgsql.retryPolicy));

	while (!pgsql_retry_policy_expired(&retryPolicy))
	{
		bool mayRetry = false;

		if (monitor_register_node(monitor,
								  config->formation,
								  config->name,
								  config->hostname,
								  config->pgSetup.pgport,
								  config->pgSetup.control.system_identifier,
								  config->pgSetup.dbname,
								  keeper->state.current_node_id,
								  config->groupId,
								  initialState,
								  config->pgSetup.pgKind,
								  config->pgSetup.settings.candidatePriority,
								  config->pgSetup.settings.replicationQuorum,
								  DEFAULT_CITUS_CLUSTER_NAME,
								  &mayRetry,
								  &assignedState))
		{
			/* registration was successful, break out of the retry loop */
			log_info("Successfully registered to the monitor with nodeId %" PRId64,
					 assignedState.nodeId);
			registered = true;
			break;
		}

		if (!mayRetry)
		{
			/* game over */
			break;
		}

		int sleepTimeMs =
			pgsql_compute_connection_retry_sleep_time(&retryPolicy);

		log_warn("Failed to register node %s:%d in group %d of "
				 "formation \"%s\" with initial state \"%s\" "
				 "because the monitor is already registering another "
				 "standby, retrying in %d ms",
				 config->hostname, config->pgSetup.pgport,
				 config->groupId, config->formation,
				 NodeStateToString(initialState),
				 sleepTimeMs);

		/* we have milliseconds, pg_usleep() wants microseconds */
		(void) pg_usleep(sleepTimeMs * 1000);
	}

	if (!registered)
	{
		log_error("Failed to register to the monitor");
		return false;
	}

	/*
	 * If we have just registered the primary node as SINGLE, then we're good,
	 * we may continue as before.
	 */
	if (assignedState.state == SINGLE_STATE)
	{
		/* now we have registered with a new nodeId, record that */
		if (!keeper_update_state(keeper,
								 assignedState.nodeId,
								 assignedState.groupId,
								 assignedState.state,
								 true))
		{
			log_error("Failed to update keepers's state");
			return false;
		}

		return true;
	}

	/*
	 * We are now registered as a WAIT_STANDBY node.
	 *
	 * The local state file might still have it that we are a SECONDARY node
	 * though, and is running with the monitor still disabled.
	 *
	 * Let's move to CATCHINGUP on the monitor and then assign that to the
	 * local state file, so that when we signal the background running process
	 * and it connects to the monitor, it continues without an interruption and
	 * without a pg_basebackup either.
	 *
	 * Wait until the primary has moved and we're being assigned CATCHINGUP.
	 */
	int errors = 0, tries = 0;

	do {
		/* attempt to make progress every 300ms */
		pg_usleep(300 * 1000);

		if (!pgsql_get_postgres_metadata(&(keeper->postgres.sqlClient),
										 &pgSetup->is_in_recovery,
										 keeper->postgres.pgsrSyncState,
										 keeper->postgres.currentLSN,
										 &(pgSetup->control)))
		{
			log_error("Failed to get the local Postgres metadata");
			return false;
		}

		int currentTLI = keeper->postgres.postgresSetup.control.timeline_id;

		if (!monitor_node_active(monitor,
								 config->formation,
								 assignedState.nodeId,
								 assignedState.groupId,
								 assignedState.state,
								 ReportPgIsRunning(keeper),
								 currentTLI,
								 keeper->postgres.currentLSN,
								 keeper->postgres.pgsrSyncState,
								 &assignedState))
		{
			++errors;

			log_warn("Failed to contact the monitor at \"%s\"",
					 keeper->config.monitor_pguri);

			/* give up after 5 consecutive-loop failures */
			if (errors > 5)
			{
				log_error("Failed to contact the monitor to publish our "
						  "current state \"%s\".",
						  NodeStateToString(assignedState.state));
				return false;
			}
		}

		++tries;

		if (tries == 3)
		{
			log_info("Still waiting for the monitor to drive us to state \"%s\"",
					 NodeStateToString(CATCHINGUP_STATE));
			log_warn("Please make sure that the primary node is currently "
					 "running `pg_autoctl run` and contacting the monitor.");
		}
	} while (assignedState.state != CATCHINGUP_STATE);

	/* now we have registered with a new nodeId, record that */
	if (!keeper_update_state(keeper,
							 assignedState.nodeId,
							 assignedState.groupId,
							 assignedState.state,
							 true))
	{
		log_error("Failed to update keepers's state");
		return false;
	}

	return true;
}


/*
 * keeper_state_as_json prepares the current keeper state as a JSON object and
 * copy the string to the given pre-allocated memory area, of given size.
*/ bool keeper_state_as_json(Keeper *keeper, char *json, int size) { JSON_Value *js = json_value_init_object(); JSON_Value *jsPostgres = json_value_init_object(); JSON_Value *jsKeeperState = json_value_init_object(); JSON_Object *jsRoot = json_value_get_object(js); pg_setup_as_json(&(keeper->postgres.postgresSetup), jsPostgres); keeperStateAsJSON(&(keeper->state), jsKeeperState); json_object_set_value(jsRoot, "postgres", jsPostgres); json_object_set_value(jsRoot, "state", jsKeeperState); char *serialized_string = json_serialize_to_string_pretty(js); int len = strlcpy(json, serialized_string, size); json_free_serialized_string(serialized_string); json_value_free(js); /* strlcpy returns how many bytes where necessary */ return len < size; } /* * keeper_update_group_hba updates updates the HBA file to ensure we have two * entries per other node in the group, allowing for both replication * connections and connections to the --dbname. */ bool keeper_update_group_hba(Keeper *keeper, NodeAddressArray *diffNodesArray) { LocalPostgresServer *postgres = &(keeper->postgres); PostgresSetup *postgresSetup = &(postgres->postgresSetup); PGSQL *pgsql = &(postgres->sqlClient); char hbaFilePath[MAXPGPATH] = { 0 }; char *authMethod = pg_setup_get_auth_method(postgresSetup); /* early exit when we're alone in the group */ if (diffNodesArray->count == 0) { return true; } /* early exit when we have not created $PGDATA yet */ if (!pg_setup_pgdata_exists(postgresSetup)) { return true; } sformat(hbaFilePath, MAXPGPATH, "%s/pg_hba.conf", postgresSetup->pgdata); if (!pghba_ensure_host_rules_exist(hbaFilePath, diffNodesArray, postgresSetup->ssl.active, postgresSetup->dbname, PG_AUTOCTL_REPLICA_USERNAME, authMethod, keeper->config.pgSetup.hbaLevel)) { log_error("Failed to edit HBA file \"%s\" to update rules to current " "list of nodes registered on the monitor", hbaFilePath); return false; } /* * Only reload if Postgres is known to be running. 
If it's not running, we * edited the HBA and it's going to take effect at next restart of * Postgres, so we're good here. */ if (keeper->config.pgSetup.hbaLevel >= HBA_EDIT_MINIMAL && pg_setup_is_running(postgresSetup)) { if (!pgsql_reload_conf(pgsql)) { log_error("Failed to reload the postgres configuration after adding " "the standby user to pg_hba"); return false; } } return true; } /* * keeper_refresh_other_nodes call pgautofailover.get_other_nodes on the * monitor and refreshes the keeper otherNodes array with fresh information, * including each node's LSN position. * * When forceCacheInvalidation is true, instead of trusting our previous value * for the keeper otherNodes array, keeper_refresh_other_nodes() instead runs * through the whole monitor.get_other_nodes() result and updates HBA rules for * all entries there. That's necessary after a pg_basebackup for instance. * which will copy over the origin's pg_hba.conf. */ bool keeper_refresh_other_nodes(Keeper *keeper, bool forceCacheInvalidation) { Monitor *monitor = &(keeper->monitor); KeeperConfig *config = &(keeper->config); NodeAddressArray newNodesArray = { 0 }; int64_t nodeId = keeper->state.current_node_id; log_trace("keeper_refresh_other_nodes"); if (config->monitorDisabled) { if (!keeper_read_nodes_from_file(keeper, &newNodesArray)) { log_error("Failed to get other nodes, see above for details"); return false; } } else { if (!monitor_get_other_nodes(monitor, nodeId, ANY_STATE, &newNodesArray)) { log_error("Failed to get_other_nodes() on the monitor"); return false; } } /* * In case of success, copy the current nodes array to the keeper's cache. */ bool success = keeper_call_refresh_hooks(keeper, &newNodesArray, forceCacheInvalidation); if (success) { keeper->otherNodes = newNodesArray; } return success; } /* * keeper_call_refresh_hooks loops over the KeeperNodesArrayRefreshArray and * calls each hook in turn. It returns true when all the hooks have returned * true. 
*/ bool keeper_call_refresh_hooks(Keeper *keeper, NodeAddressArray *newNodesArray, bool forceCacheInvalidation) { bool success = true; for (int index = 0; KeeperRefreshHooks[index]; index++) { KeeperNodesArrayRefreshFunction hookFun = KeeperRefreshHooks[index]; bool ret = (*hookFun)(keeper, newNodesArray, forceCacheInvalidation); success = success && ret; } return success; } /* * keeper_refresh_hba is a KeeperNodesArrayRefreshFunction that adds new * entries in the Postgres HBA file for new nodes that have been added to our * group. */ bool keeper_refresh_hba(Keeper *keeper, NodeAddressArray *newNodesArray, bool forceCacheInvalidation) { NodeAddressArray *otherNodesArray = &(keeper->otherNodes); NodeAddressArray diffNodesArray = { 0 }; /* compute nodes that need an HBA change (new ones, new hostnames) */ if (forceCacheInvalidation) { diffNodesArray = *newNodesArray; } else { (void) diff_nodesArray(otherNodesArray, newNodesArray, &diffNodesArray); } /* * When we're alone in the group, and also when there's no change, then we * are done here already. */ if (newNodesArray->count == 0 || diffNodesArray.count == 0) { /* refresh the keeper's cache with the current other nodes array */ keeper->otherNodes = *newNodesArray; return true; } log_info("Fetched current list of %d other nodes from the monitor " "to update HBA rules, including %d changes.", newNodesArray->count, diffNodesArray.count); /* * We have a new list of other nodes, update the HBA file. We only update * the nodes that we didn't know before, or that have a new host property. */ if (!keeper_update_group_hba(keeper, &diffNodesArray)) { log_error("Failed to update the HBA entries for the new " "elements in the our formation \"%s\" and group %d", keeper->config.formation, keeper->state.current_group); return false; } return true; } /* * diff_nodesArray computes the array of nodes entries that should be added in * the HBA file in the given pre-allocated diffNodesArray parameter. 
The diff * is computed from the keeper's otherNodesArray on the previous round, and the * one we just got from the monitor. */ static void diff_nodesArray(NodeAddressArray *previousNodesArray, NodeAddressArray *currentNodesArray, NodeAddressArray *diffNodesArray) { int prevIndex = 0; int currIndex = 0; int diffIndex = 0; if (previousNodesArray->count == 0) { /* all the entries are new and we want them in diffNodesArray */ *diffNodesArray = *currentNodesArray; return; } /* we only care about the nodes in the current nodes array */ for (currIndex = 0; currIndex < currentNodesArray->count; currIndex++) { NodeAddress *currNode = &(currentNodesArray->nodes[currIndex]); NodeAddress *prevNode = &(previousNodesArray->nodes[prevIndex]); /* remember, the input arrays are sorted on nodeId */ if (currNode->nodeId < prevNode->nodeId) { diffNodesArray->count++; diffNodesArray->nodes[diffIndex++] = *currNode; } else if (currNode->nodeId == prevNode->nodeId) { /* * We still have to update our HBA file when the host of a node * that we already have has changed on the monitor. */ if (!streq(currNode->host, prevNode->host)) { log_debug("Node %" PRId64 " has a new hostname \"%s\"", currNode->nodeId, currNode->host); diffNodesArray->count++; diffNodesArray->nodes[diffIndex++] = *currNode; } /* * In any case, if we have more elements in previousNodesArray, * advance our position there. */ if (prevIndex < previousNodesArray->count) { prevIndex++; } } else if (currNode->nodeId > prevNode->nodeId) { /* * All the remaining entries of currentNodesArray are new. * * We might have entries in previousNodesArray that are not found * in currentNodesArray anymore, but we don't know how to clean-up * the HBA file entries at the moment anyway, so we just skip them. 
*/ diffNodesArray->count++; diffNodesArray->nodes[diffIndex++] = *currNode; break; } else { log_error("BUG in diff_nodesArray!"); return; } } } /* * keeper_set_node_metadata sets a new nodename for the current pg_autoctl node * on the monitor. This node might be in an environment where you might get a * new IP at reboot, such as in Kubernetes. */ bool keeper_set_node_metadata(Keeper *keeper, KeeperConfig *oldConfig) { KeeperConfig *config = &(keeper->config); KeeperStateData keeperState = { 0 }; if (!keeper_state_read(&keeperState, keeper->config.pathnames.state)) { /* errors have already been logged */ return false; } int64_t nodeId = keeperState.current_node_id; if (streq(oldConfig->name, config->name) && streq(oldConfig->hostname, config->hostname) && oldConfig->pgSetup.pgport == config->pgSetup.pgport) { log_trace("keeper_set_node_metadata: no changes"); return true; } if (!monitor_update_node_metadata(&(keeper->monitor), nodeId, keeper->config.name, keeper->config.hostname, keeper->config.pgSetup.pgport)) { /* errors have already been logged */ return false; } if (!keeper_config_write_file(&(keeper->config))) { log_warn("This node nodename has been updated with nodename \"%s\", " "hostname \"%s\" and pgport %d on the monitor " "but could not be update in the local configuration file!", keeper->config.name, keeper->config.hostname, keeper->config.pgSetup.pgport); return false; } if (strneq(oldConfig->name, config->name)) { log_info("Node name is now \"%s\", used to be \"%s\"", config->name, oldConfig->name); } if (strneq(oldConfig->hostname, config->hostname)) { log_info("Node hostname is now \"%s\", used to be \"%s\"", config->hostname, oldConfig->hostname); } if (oldConfig->pgSetup.pgport != config->pgSetup.pgport) { log_info("Node pgport is now %d, used to be %d", config->pgSetup.pgport, oldConfig->pgSetup.pgport); } return true; } /* * When upgrading from 1.3 to 1.4 the monitor assigns a new name to pg_autoctl * nodes, which did not use to have a name 
before. In that case, and then * pg_autoctl run has been used without options, our name might be empty here. * We then need to fetch it from the monitor. */ bool keeper_update_nodename_from_monitor(Keeper *keeper) { Monitor *monitor = &(keeper->monitor); KeeperConfig *config = &(keeper->config); char *formation = config->formation; int groupId, nodeId; NodeAddressArray nodesArray = { 0 }; if (!IS_EMPTY_STRING_BUFFER(config->name)) { return true; } /* ensure the keeper state have been loaded already */ if (!keeper_load_state(keeper)) { /* errors have already been logged */ return false; } groupId = keeper->state.current_group; nodeId = keeper->state.current_node_id; log_info("Getting nodes from the monitor for group %d in formation \"%s\"", groupId, formation); if (!monitor_get_nodes(monitor, formation, groupId, &nodesArray)) { /* errors have already been logged */ return false; } /* * We could also add a WHERE clause to the SQL query in monitor_get_nodes, * but we don't expect that many nodes anyway. */ for (int index = 0; index < nodesArray.count; index++) { NodeAddress *node = &(nodesArray.nodes[index]); if (node->nodeId == nodeId) { log_info("Node name on the monitor is now \"%s\"", node->name); strlcpy(config->name, node->name, _POSIX_HOST_NAME_MAX); if (!keeper_config_write_file(config)) { /* errors have already been logged */ return false; } break; } } return true; } /* * keeper_config_accept_new returns true when we can accept to RELOAD our * current config into the new one that's been editing. 
*/ bool keeper_config_accept_new(Keeper *keeper, KeeperConfig *newConfig) { /* make a copy of the current values before changing them */ KeeperConfig oldConfig = keeper->config; KeeperConfig *config = &(keeper->config); bool monitorUpdateNeeded = false; /* some elements are not supposed to change on a reload */ if (strneq(newConfig->pgSetup.pgdata, config->pgSetup.pgdata)) { log_error("Attempt to change postgresql.pgdata from \"%s\" to \"%s\"", config->pgSetup.pgdata, newConfig->pgSetup.pgdata); return false; } /* * Changing the monitor URI. Well it might just be about using a new IP * address, e.g. switching to IPv6, or maybe the monitor has moved to * another hostname. * * We don't check if we are still registered on the new monitor, only that * we can connect. The node_active calls are going to fail it we then * aren't registered anymore. */ if (strneq(newConfig->monitor_pguri, config->monitor_pguri)) { Monitor monitor = { 0 }; if (PG_AUTOCTL_MONITOR_IS_DISABLED(newConfig)) { config->monitorDisabled = true; strlcpy(config->monitor_pguri, PG_AUTOCTL_MONITOR_DISABLED, sizeof(config->monitor_pguri)); log_info("Reloading configuration: the monitor has been disabled"); } else { if (!monitor_init(&monitor, newConfig->monitor_pguri)) { log_fatal("Failed to contact the monitor because " "its URL is invalid, see above for details"); return false; } log_info("Reloading configuration: monitor uri is now \"%s\"; " "used to be \"%s\"", newConfig->monitor_pguri, config->monitor_pguri); config->monitorDisabled = false; strlcpy(config->monitor_pguri, newConfig->monitor_pguri, sizeof(config->monitor_pguri)); } } /* * We don't support changing formation, group, or hostname mid-flight: we * might have to register again to the monitor to make that work, and in * that case an admin should certainly be doing some offline steps, maybe * even having to `pg_autoctl create` all over again. 
*/ if (strneq(newConfig->formation, config->formation)) { log_warn("pg_autoctl doesn't know how to change formation at run-time, " "continuing with formation \"%s\".", config->formation); } /* * Changing the node name is okay, we need to sync the update to the * monitor though. */ if (strneq(newConfig->name, config->name)) { monitorUpdateNeeded = true; log_info("Reloading configuration: node name is now \"%s\"; " "used to be \"%s\"", newConfig->name, config->name); strlcpy(config->name, newConfig->name, _POSIX_HOST_NAME_MAX); } /* * Changing the hostname seems ok, our registration is checked against * formation/groupId/nodeId anyway. The hostname is used so that other * nodes in the network may contact us. Again, it might be a change of * public IP address, e.g. switching to IPv6. * * Changing the hostname in the local configuration file requires also an * update of the metadata on the monitor. */ if (strneq(newConfig->hostname, config->hostname)) { monitorUpdateNeeded = true; log_info("Reloading configuration: hostname is now \"%s\"; " "used to be \"%s\"", newConfig->hostname, config->hostname); strlcpy(config->hostname, newConfig->hostname, _POSIX_HOST_NAME_MAX); } if (monitorUpdateNeeded) { log_info("Node name or hostname have changed, updating the " "metadata on the monitor"); if (!keeper_set_node_metadata(keeper, &oldConfig)) { log_error("Failed to update name and hostname on the monitor, " "see above for details"); return false; } } /* * Changing the replication password? Sure. */ if (strneq(newConfig->replication_password, config->replication_password)) { log_info("Reloading configuration: replication password has changed"); /* note: strneq checks args are not NULL, it's safe to proceed */ strlcpy(config->replication_password, newConfig->replication_password, MAXCONNINFO); } /* * Changing replication.maximum_backup_rate. 
*/ if (strneq(newConfig->maximum_backup_rate, config->maximum_backup_rate)) { log_info("Reloading configuration: " "replication.maximum_backup_rate is now \"%s\"; " "used to be \"%s\"", newConfig->maximum_backup_rate, config->maximum_backup_rate); strlcpy(config->maximum_backup_rate, newConfig->maximum_backup_rate, MAXIMUM_BACKUP_RATE_LEN); } /* * The backupDirectory can be changed online too. */ if (strneq(newConfig->backupDirectory, config->backupDirectory)) { log_info("Reloading configuration: " "replication.backup_directory is now \"%s\"; " "used to be \"%s\"", newConfig->backupDirectory, config->backupDirectory); strlcpy(config->backupDirectory, newConfig->backupDirectory, MAXPGPATH); } /* * And now the timeouts. Of course we support changing them at run-time. */ if (newConfig->network_partition_timeout != config->network_partition_timeout) { log_info("Reloading configuration: timeout.network_partition_timeout " "is now %d; used to be %d", newConfig->network_partition_timeout, config->network_partition_timeout); config->network_partition_timeout = newConfig->network_partition_timeout; } if (newConfig->prepare_promotion_catchup != config->prepare_promotion_catchup) { log_info("Reloading configuration: timeout.prepare_promotion_catchup " "is now %d; used to be %d", newConfig->prepare_promotion_catchup, config->prepare_promotion_catchup); config->prepare_promotion_catchup = newConfig->prepare_promotion_catchup; } if (newConfig->prepare_promotion_walreceiver != config->prepare_promotion_walreceiver) { log_info( "Reloading configuration: timeout.prepare_promotion_walreceiver " "is now %d; used to be %d", newConfig->prepare_promotion_walreceiver, config->prepare_promotion_walreceiver); config->prepare_promotion_walreceiver = newConfig->prepare_promotion_walreceiver; } if (newConfig->postgresql_restart_failure_timeout != config->postgresql_restart_failure_timeout) { log_info( "Reloading configuration: timeout.postgresql_restart_failure_timeout " "is now %d; used to 
be %d", newConfig->postgresql_restart_failure_timeout, config->postgresql_restart_failure_timeout); config->postgresql_restart_failure_timeout = newConfig->postgresql_restart_failure_timeout; } if (newConfig->postgresql_restart_failure_max_retries != config->postgresql_restart_failure_max_retries) { log_info( "Reloading configuration: retries.postgresql_restart_failure_max_retries " "is now %d; used to be %d", newConfig->postgresql_restart_failure_max_retries, config->postgresql_restart_failure_max_retries); config->postgresql_restart_failure_max_retries = newConfig->postgresql_restart_failure_max_retries; } /* we can change any SSL related setup options at runtime */ return config_accept_new_ssloptions(&(config->pgSetup), &(newConfig->pgSetup)); } /* * reload_configuration reads the supposedly new configuration file and * integrates accepted new values into the current setup. */ bool keeper_reload_configuration(Keeper *keeper, bool firstLoop, bool doInit) { KeeperConfig *config = &(keeper->config); bool postgresNotRunningIsOk = firstLoop; /* * This function implements changes that we want to see before calling the * monitor for the first time, when called as part as the firstLoop. The * function is called again at the end of the loop, once the monitor has * been called, and we're happy to decline then: the job has already been * done in full the first time. */ if (firstLoop && !doInit) { return true; } if (file_exists(config->pathnames.config)) { KeeperConfig newConfig = { 0 }; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; /* * Set the same configuration and state file as the current config. 
*/ strlcpy(newConfig.pathnames.config, config->pathnames.config, MAXPGPATH); strlcpy(newConfig.pathnames.state, config->pathnames.state, MAXPGPATH); /* disconnect to the current monitor if we're connected */ (void) pgsql_finish(&(keeper->monitor.pgsql)); (void) pgsql_finish(&(keeper->monitor.notificationClient)); if (keeper_config_read_file(&newConfig, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk) && keeper_config_accept_new(keeper, &newConfig)) { /* * The keeper->config changed, not the keeper->postgres, but the * main loop takes care of updating it at each loop anyway, so we * don't have to take care of that now. */ log_info("Reloaded the new configuration from \"%s\"", config->pathnames.config); /* * The new configuration might impact the Postgres setup, such as * when changing the SSL file paths. */ if (!keeper_ensure_configuration(keeper, postgresNotRunningIsOk)) { log_warn("Failed to reload pg_autoctl configuration, " "see above for details"); } } else { log_warn("Failed to read configuration file \"%s\", " "continuing with the same configuration.", config->pathnames.config); } } else { log_warn("Configuration file \"%s\" does not exist, " "continuing with the same configuration.", config->pathnames.config); } return true; } /* * keeper_call_reload_hooks loops over the KeeperReloadHooks * reloadFunctionArray and calls each hook in turn. */ void keeper_call_reload_hooks(Keeper *keeper, bool firstLoop, bool doInit) { for (int index = 0; KeeperReloadHooks[index]; index++) { /* at the moment we ignore the return values from the reload hooks */ (void) (*KeeperReloadHooks[index])(keeper, firstLoop, doInit); } /* we're done reloading now. */ asked_to_reload = 0; } /* * keeper_read_nodes_from_file read the keeper->config.pathnames.nodes file (a * JSON Array of Nodes with id, name, host, port, lsn, and is_primary) and * fills in the internal keeper otherNodes array. Use that function when the * monitor is disabled. 
*/ bool keeper_read_nodes_from_file(Keeper *keeper, NodeAddressArray *nodesArray) { KeeperConfig *config = &(keeper->config); KeeperStateData *state = &(keeper->state); char *contents = NULL; long size = 0L; /* refrain from reading the nodes list when in the INIT state */ if (state->current_role == INIT_STATE) { return true; } /* if the file does not exist, we're done */ if (!file_exists(config->pathnames.nodes)) { log_debug("Nodes files \"%s\" does not exist, done processing", config->pathnames.nodes); return true; } if (!read_file_if_exists(config->pathnames.nodes, &contents, &size)) { log_error("Failed to read nodes array from file \"%s\"", config->pathnames.nodes); return false; } /* now parse the nodes JSON file */ if (!parseNodesArray(contents, nodesArray, state->current_node_id)) { log_debug("Failed to parse JSON nodes array:\n%s", contents); log_error("Failed to parse nodes array from file \"%s\"", config->pathnames.nodes); return false; } return true; } /* * keeper_get_primary fetches the current primary Node in the group, either by * connecting to the monitor and using the pgautofailover.get_primary() API * there, or by scanning through the keeper->otherNodes array for the first * node with isPrimary true. * * In both cases, there might not be a primary node identified at the moment, * in which case we return false. 
*/ bool keeper_get_primary(Keeper *keeper, NodeAddress *primaryNode) { KeeperConfig *config = &(keeper->config); if (!config->monitorDisabled) { Monitor *monitor = &(keeper->monitor); if (!monitor_get_primary(monitor, config->formation, keeper->state.current_group, primaryNode)) { log_error("Failed to get the primary node from the monitor, " "see above for details"); return false; } return true; } else { for (int i = 0; i < keeper->otherNodes.count; i++) { NodeAddress *node = &(keeper->otherNodes.nodes[i]); if (node->isPrimary) { /* copy the node address details into primaryNode */ *primaryNode = *node; return true; } } log_error("Failed to get the primary node from the current list " "of other nodes, refresh the list with the command: " "pg_autoctl do fsm nodes set"); return false; } return false; } /* * keeper_get_most_advanced_standby fetches the current most advanded standby * node in the group, either by connecting to the monitor and using the * pgautofailover.get_most_advanced_standby() API there, or by scanning through * the keeper->otherNodes array. 
*/ bool keeper_get_most_advanced_standby(Keeper *keeper, NodeAddress *upstreamNode) { KeeperConfig *config = &(keeper->config); int groupId = keeper->state.current_group; if (!config->monitorDisabled) { Monitor *monitor = &(keeper->monitor); if (!monitor_get_most_advanced_standby(monitor, config->formation, groupId, upstreamNode)) { log_error("Failed to get the most advanced standby node " "from the monitor, see above for details"); return false; } return true; } else { NodeAddress *mostAdvandedStandbyNode = NULL; uint64_t mostAdvandedLSN = 0; for (int i = 0; i < keeper->otherNodes.count; i++) { NodeAddress *node = &(keeper->otherNodes.nodes[i]); uint64_t nodeLSN = 0; if (!parseLSN(node->lsn, &nodeLSN)) { log_error("Failed to parse node %" PRId64 " \"%s\" LSN position \"%s\"", node->nodeId, node->name, node->lsn); return false; } if (mostAdvandedStandbyNode == NULL || nodeLSN > mostAdvandedLSN) { mostAdvandedStandbyNode = node; mostAdvandedLSN = nodeLSN; } } if (mostAdvandedStandbyNode == NULL) { log_error("Failed to get the most avdanced standby node " "from the current list of other nodes, " "refresh the list with the command: " "pg_autoctl do fsm nodes set"); return false; } *upstreamNode = *mostAdvandedStandbyNode; return true; } return false; } /* * keeper_pg_autoctl_get_version_from_disk calls pg_autoctl version --json and * parses the output to fill-in the keeper version. 
*/ bool keeper_pg_autoctl_get_version_from_disk(Keeper *keeper, KeeperVersion *version) { char buffer[BUFSIZE]; Program program = run_program(pg_autoctl_argv0, "version", "--json", NULL); log_debug("%s version --json", pg_autoctl_argv0); if (program.returnCode != 0) { log_error("%s version --json exited with code %d", pg_autoctl_argv0, program.returnCode); free_program(&program); return false; } /* make a local copy of the program output, for JSON parsing */ strlcpy(buffer, program.stdOut, sizeof(buffer)); free_program(&program); JSON_Value *json = json_parse_string(buffer); if (json == NULL || json_type(json) != JSONObject) { log_error("Failed to parse pg_autoctl version --json"); json_value_free(json); return false; } JSON_Object *jsObj = json_value_get_object(json); char *str = (char *) json_object_get_string(jsObj, "pg_autoctl"); if (str != NULL) { strlcpy(version->pg_autoctl_version, str, sizeof(version->pg_autoctl_version)); } else { log_error("Failed to validate pg_autoctl version --json"); json_value_free(json); return false; } str = (char *) json_object_get_string(jsObj, "pgautofailover"); if (str != NULL) { strlcpy(version->required_extension_version, str, sizeof(version->required_extension_version)); } else { log_error("Failed to validate pg_autoctl version --json"); json_value_free(json); return false; } json_value_free(json); return true; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/keeper.h000066400000000000000000000107721414244367200222100ustar00rootroot00000000000000/* * src/bin/pg_autoctl/keeper.h * Main data structures for the pg_autoctl service state. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #ifndef KEEPER_H #define KEEPER_H #include "commandline.h" #include "keeper_config.h" #include "log.h" #include "monitor.h" #include "primary_standby.h" #include "state.h" /* the keeper manages a postgres server according to the given configuration */ typedef struct Keeper { KeeperConfig config; LocalPostgresServer postgres; KeeperStateData state; Monitor monitor; /* * When running without monitor, we need a place to stash the otherNodes * information. This is necessary in some transitions. */ NodeAddressArray otherNodes; /* Only useful during the initialization of the Keeper */ KeeperStateInit initState; } Keeper; typedef struct KeeperVersion { char pg_autoctl_version[BUFSIZE]; char required_extension_version[BUFSIZE]; } KeeperVersion; bool keeper_init(Keeper *keeper, KeeperConfig *config); bool keeper_init_fsm(Keeper *keeper); bool keeper_register_and_init(Keeper *keeper, NodeState initialState); bool keeper_register_again(Keeper *keeper); bool keeper_load_state(Keeper *keeper); bool keeper_store_state(Keeper *keeper); bool keeper_update_state(Keeper *keeper, int64_t node_id, int group_id, NodeState state, bool update_last_monitor_contact); bool keeper_start_postgres(Keeper *keeper); bool keeper_restart_postgres(Keeper *keeper); bool keeper_should_ensure_current_state_before_transition(Keeper *keeper); bool keeper_ensure_postgres_is_running(Keeper *keeper, bool updateRetries); bool keeper_create_and_drop_replication_slots(Keeper *keeper); bool keeper_maintain_replication_slots(Keeper *keeper); bool keeper_ensure_current_state(Keeper *keeper); bool keeper_create_self_signed_cert(Keeper *keeper); bool keeper_ensure_configuration(Keeper *keeper, bool postgresNotRunningIsOk); bool keeper_update_pg_state(Keeper *keeper, int logLevel); bool keeper_node_active(Keeper *keeper, bool doInit, MonitorAssignedState *assignedState); bool keeper_ensure_node_has_been_dropped(Keeper *keeper, bool *dropped); bool ReportPgIsRunning(Keeper *keeper); bool keeper_remove(Keeper 
*keeper, KeeperConfig *config); bool keeper_check_monitor_extension_version(Keeper *keeper, MonitorExtensionVersion *version); bool keeper_state_as_json(Keeper *keeper, char *json, int size); bool keeper_update_group_hba(Keeper *keeper, NodeAddressArray *diffNodesArray); bool keeper_refresh_other_nodes(Keeper *keeper, bool forceCacheInvalidation); bool keeper_set_node_metadata(Keeper *keeper, KeeperConfig *oldConfig); bool keeper_update_nodename_from_monitor(Keeper *keeper); bool keeper_config_accept_new(Keeper *keeper, KeeperConfig *newConfig); /* * When receiving a SIGHUP signal, the keeper knows how to reload its current * in-memory configuration from the on-disk configuration file, and then apply * changes. For this we use an array of functions that we call in order each * time we are asked to reload. * * Because it's possible to edit the configuration file while pg_autoctl is not * running, we also call the ReloadHook functions when entering our main loop * the first time. */ typedef bool (*KeeperReloadFunction)(Keeper *keeper, bool firstLoop, bool doInit); /* * When updating the list of other nodes (a NodesArray) after calling * node_active, the keeper needs to implement specific actions such as editing * the HBA rules to allow new nodes to connect. 
*/ typedef bool (*KeeperNodesArrayRefreshFunction)(Keeper *keeper, NodeAddressArray *newNodesArray, bool forceCacheInvalidation); /* src/bin/pg_autoctl/service_keeper.c */ extern KeeperReloadFunction *KeeperReloadHooks; extern KeeperNodesArrayRefreshFunction *KeeperRefreshHooks; void keeper_call_reload_hooks(Keeper *keeper, bool firstLoop, bool doInit); bool keeper_reload_configuration(Keeper *keeper, bool firstLoop, bool doInit); bool keeper_call_refresh_hooks(Keeper *keeper, NodeAddressArray *newNodesArray, bool forceCacheInvalidation); bool keeper_refresh_hba(Keeper *keeper, NodeAddressArray *newNodesArray, bool forceCacheInvalidation); bool keeper_read_nodes_from_file(Keeper *keeper, NodeAddressArray *nodesArray); bool keeper_get_primary(Keeper *keeper, NodeAddress *primaryNode); bool keeper_get_most_advanced_standby(Keeper *keeper, NodeAddress *primaryNode); bool keeper_pg_autoctl_get_version_from_disk(Keeper *keeper, KeeperVersion *version); #endif /* KEEPER_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/keeper_config.c000066400000000000000000000634401414244367200235300ustar00rootroot00000000000000/* * src/bin/pg_autoctl/keeper_config.c * Keeper configuration functions * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include "postgres_fe.h" #include "defaults.h" #include "ini_file.h" #include "keeper.h" #include "keeper_config.h" #include "log.h" #include "parsing.h" #include "pgctl.h" #define OPTION_AUTOCTL_ROLE(config) \ make_strbuf_option_default("pg_autoctl", "role", NULL, true, NAMEDATALEN, \ config->role, KEEPER_ROLE) #define OPTION_AUTOCTL_MONITOR(config) \ make_strbuf_option("pg_autoctl", "monitor", "monitor", false, MAXCONNINFO, \ config->monitor_pguri) #define OPTION_AUTOCTL_FORMATION(config) \ make_strbuf_option_default("pg_autoctl", "formation", "formation", \ true, NAMEDATALEN, \ config->formation, FORMATION_DEFAULT) #define OPTION_AUTOCTL_GROUPID(config) \ make_int_option("pg_autoctl", "group", "group", false, &(config->groupId)) #define OPTION_AUTOCTL_NAME(config) \ make_strbuf_option_default("pg_autoctl", "name", "name", \ false, _POSIX_HOST_NAME_MAX, \ config->name, "") /* * --hostname used to be --nodename, and we need to support transition from the * old to the new name. For that, we read the pg_autoctl.nodename config * setting and change it on the fly to hostname instead. * * As a result HOSTNAME is marked not required and NODENAME is marked compat. 
*/ #define OPTION_AUTOCTL_HOSTNAME(config) \ make_strbuf_option("pg_autoctl", "hostname", "hostname", \ false, _POSIX_HOST_NAME_MAX, config->hostname) #define OPTION_AUTOCTL_NODENAME(config) \ make_strbuf_compat_option("pg_autoctl", "nodename", \ _POSIX_HOST_NAME_MAX, config->hostname) #define OPTION_AUTOCTL_NODEKIND(config) \ make_strbuf_option("pg_autoctl", "nodekind", NULL, false, NAMEDATALEN, \ config->nodeKind) #define OPTION_POSTGRESQL_PGDATA(config) \ make_strbuf_option("postgresql", "pgdata", "pgdata", true, MAXPGPATH, \ config->pgSetup.pgdata) #define OPTION_POSTGRESQL_PG_CTL(config) \ make_strbuf_option("postgresql", "pg_ctl", "pgctl", false, MAXPGPATH, \ config->pgSetup.pg_ctl) #define OPTION_POSTGRESQL_USERNAME(config) \ make_strbuf_option("postgresql", "username", "username", \ false, NAMEDATALEN, \ config->pgSetup.username) #define OPTION_POSTGRESQL_DBNAME(config) \ make_strbuf_option("postgresql", "dbname", "dbname", false, NAMEDATALEN, \ config->pgSetup.dbname) #define OPTION_POSTGRESQL_HOST(config) \ make_strbuf_option("postgresql", "host", "pghost", \ false, _POSIX_HOST_NAME_MAX, \ config->pgSetup.pghost) #define OPTION_POSTGRESQL_PORT(config) \ make_int_option("postgresql", "port", "pgport", \ true, &(config->pgSetup.pgport)) #define OPTION_POSTGRESQL_PROXY_PORT(config) \ make_int_option("postgresql", "proxyport", "proxyport", \ false, &(config->pgSetup.proxyport)) #define OPTION_POSTGRESQL_LISTEN_ADDRESSES(config) \ make_strbuf_option("postgresql", "listen_addresses", "listen", \ false, MAXPGPATH, config->pgSetup.listen_addresses) #define OPTION_POSTGRESQL_AUTH_METHOD(config) \ make_strbuf_option("postgresql", "auth_method", "auth", \ false, MAXPGPATH, config->pgSetup.authMethod) #define OPTION_POSTGRESQL_HBA_LEVEL(config) \ make_strbuf_option("postgresql", "hba_level", NULL, \ false, MAXPGPATH, config->pgSetup.hbaLevelStr) #define OPTION_SSL_ACTIVE(config) \ make_int_option_default("ssl", "active", NULL, \ false, &(config->pgSetup.ssl.active), 
0) #define OPTION_SSL_MODE(config) \ make_strbuf_option("ssl", "sslmode", "ssl-mode", \ false, SSL_MODE_STRLEN, config->pgSetup.ssl.sslModeStr) #define OPTION_SSL_CA_FILE(config) \ make_strbuf_option("ssl", "ca_file", "ssl-ca-file", \ false, MAXPGPATH, config->pgSetup.ssl.caFile) #define OPTION_SSL_CRL_FILE(config) \ make_strbuf_option("ssl", "crl_file", "ssl-crl-file", \ false, MAXPGPATH, config->pgSetup.ssl.crlFile) #define OPTION_SSL_SERVER_CERT(config) \ make_strbuf_option("ssl", "cert_file", "server-cert", \ false, MAXPGPATH, config->pgSetup.ssl.serverCert) #define OPTION_SSL_SERVER_KEY(config) \ make_strbuf_option("ssl", "key_file", "server-key", \ false, MAXPGPATH, config->pgSetup.ssl.serverKey) #define OPTION_REPLICATION_PASSWORD(config) \ make_strbuf_option_default("replication", "password", NULL, \ false, MAXCONNINFO, \ config->replication_password, \ REPLICATION_PASSWORD_DEFAULT) #define OPTION_REPLICATION_MAXIMUM_BACKUP_RATE(config) \ make_strbuf_option_default("replication", "maximum_backup_rate", NULL, \ false, MAXIMUM_BACKUP_RATE_LEN, \ config->maximum_backup_rate, \ MAXIMUM_BACKUP_RATE) #define OPTION_REPLICATION_BACKUP_DIR(config) \ make_strbuf_option("replication", "backup_directory", NULL, \ false, MAXPGPATH, config->backupDirectory) #define OPTION_TIMEOUT_NETWORK_PARTITION(config) \ make_int_option_default("timeout", "network_partition_timeout", \ NULL, false, \ &(config->network_partition_timeout), \ NETWORK_PARTITION_TIMEOUT) #define OPTION_TIMEOUT_PREPARE_PROMOTION_CATCHUP(config) \ make_int_option_default("timeout", "prepare_promotion_catchup", \ NULL, \ false, \ &(config->prepare_promotion_catchup), \ PREPARE_PROMOTION_CATCHUP_TIMEOUT) #define OPTION_TIMEOUT_PREPARE_PROMOTION_WALRECEIVER(config) \ make_int_option_default("timeout", "prepare_promotion_walreceiver", \ NULL, \ false, \ &(config->prepare_promotion_walreceiver), \ PREPARE_PROMOTION_WALRECEIVER_TIMEOUT) #define OPTION_TIMEOUT_POSTGRESQL_RESTART_FAILURE_TIMEOUT(config) \ 
make_int_option_default("timeout", "postgresql_restart_failure_timeout", \ NULL, \ false, \ &(config->postgresql_restart_failure_timeout), \ POSTGRESQL_FAILS_TO_START_TIMEOUT) #define OPTION_TIMEOUT_POSTGRESQL_RESTART_FAILURE_MAX_RETRIES(config) \ make_int_option_default("timeout", "postgresql_restart_failure_max_retries", \ NULL, \ false, \ &(config->postgresql_restart_failure_max_retries), \ POSTGRESQL_FAILS_TO_START_RETRIES) #define OPTION_TIMEOUT_LISTEN_NOTIFICATIONS(config) \ make_int_option_default("timeout", "listen_notifications_timeout", \ NULL, false, \ &(config->listen_notifications_timeout), \ PG_AUTOCTL_LISTEN_NOTIFICATIONS_TIMEOUT) #define OPTION_CITUS_ROLE(config) \ make_strbuf_option_default("citus", "role", NULL, false, NAMEDATALEN, \ config->citusRoleStr, DEFAULT_CITUS_ROLE) #define OPTION_CITUS_CLUSTER_NAME(config) \ make_strbuf_option("citus", "cluster_name", "citus-cluster", \ false, NAMEDATALEN, config->pgSetup.citusClusterName) #define SET_INI_OPTIONS_ARRAY(config) \ { \ OPTION_AUTOCTL_ROLE(config), \ OPTION_AUTOCTL_MONITOR(config), \ OPTION_AUTOCTL_FORMATION(config), \ OPTION_AUTOCTL_GROUPID(config), \ OPTION_AUTOCTL_NAME(config), \ OPTION_AUTOCTL_HOSTNAME(config), \ OPTION_AUTOCTL_NODENAME(config), \ OPTION_AUTOCTL_NODEKIND(config), \ OPTION_POSTGRESQL_PGDATA(config), \ OPTION_POSTGRESQL_PG_CTL(config), \ OPTION_POSTGRESQL_USERNAME(config), \ OPTION_POSTGRESQL_DBNAME(config), \ OPTION_POSTGRESQL_HOST(config), \ OPTION_POSTGRESQL_PORT(config), \ OPTION_POSTGRESQL_PROXY_PORT(config), \ OPTION_POSTGRESQL_LISTEN_ADDRESSES(config), \ OPTION_POSTGRESQL_AUTH_METHOD(config), \ OPTION_POSTGRESQL_HBA_LEVEL(config), \ OPTION_SSL_ACTIVE(config), \ OPTION_SSL_MODE(config), \ OPTION_SSL_CA_FILE(config), \ OPTION_SSL_CRL_FILE(config), \ OPTION_SSL_SERVER_CERT(config), \ OPTION_SSL_SERVER_KEY(config), \ OPTION_REPLICATION_MAXIMUM_BACKUP_RATE(config), \ OPTION_REPLICATION_BACKUP_DIR(config), \ OPTION_REPLICATION_PASSWORD(config), \ 
OPTION_TIMEOUT_NETWORK_PARTITION(config), \ OPTION_TIMEOUT_PREPARE_PROMOTION_CATCHUP(config), \ OPTION_TIMEOUT_PREPARE_PROMOTION_WALRECEIVER(config), \ OPTION_TIMEOUT_POSTGRESQL_RESTART_FAILURE_TIMEOUT(config), \ OPTION_TIMEOUT_POSTGRESQL_RESTART_FAILURE_MAX_RETRIES(config), \ OPTION_TIMEOUT_LISTEN_NOTIFICATIONS(config), \ \ OPTION_CITUS_ROLE(config), \ OPTION_CITUS_CLUSTER_NAME(config), \ INI_OPTION_LAST \ } static bool keeper_config_init_nodekind(KeeperConfig *config); static bool keeper_config_init_hbalevel(KeeperConfig *config); static bool keeper_config_set_backup_directory(KeeperConfig *config, int64_t nodeId); /* * keeper_config_set_pathnames_from_pgdata sets the config pathnames from its * pgSetup.pgdata field, which must have already been set when calling this * function. */ bool keeper_config_set_pathnames_from_pgdata(ConfigFilePaths *pathnames, const char *pgdata) { if (IS_EMPTY_STRING_BUFFER(pgdata)) { /* developer error */ log_error("BUG: keeper_config_set_pathnames_from_pgdata: empty pgdata"); return false; } if (!SetConfigFilePath(pathnames, pgdata)) { log_fatal("Failed to set configuration filename from PGDATA \"%s\"," " see above for details.", pgdata); return false; } if (!SetStateFilePath(pathnames, pgdata)) { log_fatal("Failed to set state filename from PGDATA \"%s\"," " see above for details.", pgdata); return false; } if (!SetNodesFilePath(pathnames, pgdata)) { log_fatal("Failed to set pid filename from PGDATA \"%s\"," " see above for details.", pgdata); return false; } if (!SetPidFilePath(pathnames, pgdata)) { log_fatal("Failed to set pid filename from PGDATA \"%s\"," " see above for details.", pgdata); return false; } return true; } /* * keeper_config_init initializes a KeeperConfig with the default values. 
*/ void keeper_config_init(KeeperConfig *config, bool missingPgdataIsOk, bool pgIsNotRunningIsOk) { PostgresSetup pgSetup = { 0 }; IniOption keeperOptions[] = SET_INI_OPTIONS_ARRAY(config); log_trace("keeper_config_init"); if (!ini_validate_options(keeperOptions)) { log_error("Please review your setup options per above messages"); exit(EXIT_CODE_BAD_CONFIG); } if (!keeper_config_init_nodekind(config)) { /* errors have already been logged. */ log_error("Please review your setup options per above messages"); exit(EXIT_CODE_BAD_CONFIG); } if (!keeper_config_init_hbalevel(config)) { log_error("Failed to initialize postgresql.hba_level"); log_error("Please review your setup options per above messages"); exit(EXIT_CODE_BAD_CONFIG); } if (!pg_setup_init(&pgSetup, &(config->pgSetup), missingPgdataIsOk, pgIsNotRunningIsOk)) { log_error("Please fix your PostgreSQL setup per above messages"); exit(EXIT_CODE_BAD_CONFIG); } /* * Keep the whole set of values discovered in pg_setup_init from the * configuration file */ config->pgSetup = pgSetup; /* * Compute the backupDirectory from pgdata, or check the one given in the * configuration file already. */ if (!keeper_config_set_backup_directory(config, -1)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_CONFIG); } /* set our configuration and state file pathnames */ if (!SetConfigFilePath(&(config->pathnames), config->pgSetup.pgdata)) { log_error("Failed to initialize Keeper's config, see above"); exit(EXIT_CODE_BAD_CONFIG); } if (!SetStateFilePath(&(config->pathnames), config->pgSetup.pgdata)) { log_error("Failed to initialize Keeper's config, see above"); exit(EXIT_CODE_BAD_CONFIG); } } /* * keeper_config_read_file overrides values in given KeeperConfig with whatever * values are read from given configuration filename. 
*/ bool keeper_config_read_file(KeeperConfig *config, bool missingPgdataIsOk, bool pgIsNotRunningIsOk, bool monitorDisabledIsOk) { if (!keeper_config_read_file_skip_pgsetup(config, monitorDisabledIsOk)) { /* errors have already been logged. */ return false; } return keeper_config_pgsetup_init(config, missingPgdataIsOk, pgIsNotRunningIsOk); } /* * keeper_config_read_file_skip_pgsetup overrides values in given KeeperConfig * with whatever values are read from given configuration filename. */ bool keeper_config_read_file_skip_pgsetup(KeeperConfig *config, bool monitorDisabledIsOk) { const char *filename = config->pathnames.config; IniOption keeperOptions[] = SET_INI_OPTIONS_ARRAY(config); log_debug("Reading configuration from %s", filename); if (!read_ini_file(filename, keeperOptions)) { log_error("Failed to parse configuration file \"%s\"", filename); return false; } /* * We have changed the --nodename option to being named --hostname, and * same in the configuration file: pg_autoctl.nodename is now * pg_autoctl.hostname. * * We can read either names from the configuration file and will then write * the current option name (pg_autoctl.hostname), but we can't have either * one be required anymore. * * Implement the "require" property here by making sure one of those names * have been used to populate the monitor config structure. */ if (IS_EMPTY_STRING_BUFFER(config->hostname)) { log_error("Failed to read either pg_autoctl.hostname or its older " "name pg_autoctl.nodename from the \"%s\" configuration file", filename); return false; } /* take care of the special value for disabled monitor setup */ if (PG_AUTOCTL_MONITOR_IS_DISABLED(config)) { config->monitorDisabled = true; if (!monitorDisabledIsOk) { log_error("Monitor is disabled in the configuration"); return false; } } /* * Turn the configuration string for hbaLevel into our enum value. 
*/ if (!keeper_config_init_hbalevel(config)) { log_error("Failed to initialize postgresql.hba_level"); return false; } /* set the ENUM value for hbaLevel */ config->pgSetup.hbaLevel = pgsetup_parse_hba_level(config->pgSetup.hbaLevelStr); /* * Required for grandfathering old clusters that don't have sslmode * explicitely set */ if (IS_EMPTY_STRING_BUFFER(config->pgSetup.ssl.sslModeStr)) { strlcpy(config->pgSetup.ssl.sslModeStr, "prefer", SSL_MODE_STRLEN); } /* set the ENUM value for sslMode */ config->pgSetup.ssl.sslMode = pgsetup_parse_sslmode(config->pgSetup.ssl.sslModeStr); /* now when that is provided, read the Citus Role and convert to enum */ if (IS_EMPTY_STRING_BUFFER(config->citusRoleStr)) { config->citusRole = CITUS_ROLE_PRIMARY; } else { if (strcmp(config->citusRoleStr, "primary") == 0) { config->citusRole = CITUS_ROLE_PRIMARY; } else if (strcmp(config->citusRoleStr, "secondary") == 0) { config->citusRole = CITUS_ROLE_SECONDARY; } else { log_error("Failed to parse citus.role \"%s\": expected either " "\"primary\" or \"secondary\"", config->citusRoleStr); return false; } } if (!keeper_config_init_nodekind(config)) { /* errors have already been logged. */ return false; } return true; } /* * keeper_config_pgsetup_init overrides values in given KeeperConfig with * whatever values are read from given configuration filename. */ bool keeper_config_pgsetup_init(KeeperConfig *config, bool missingPgdataIsOk, bool pgIsNotRunningIsOk) { PostgresSetup pgSetup = { 0 }; log_trace("keeper_config_pgsetup_init"); if (!pg_setup_init(&pgSetup, &config->pgSetup, missingPgdataIsOk, pgIsNotRunningIsOk)) { return false; } /* * Keep the whole set of values discovered in pg_setup_init from the * configuration file */ config->pgSetup = pgSetup; return true; } /* * keeper_config_write_file writes the current values in given KeeperConfig to * filename. 
*/ bool keeper_config_write_file(KeeperConfig *config) { const char *filePath = config->pathnames.config; log_trace("keeper_config_write_file \"%s\"", filePath); FILE *fileStream = fopen_with_umask(filePath, "w", FOPEN_FLAGS_W, 0644); if (fileStream == NULL) { /* errors have already been logged */ return false; } bool success = keeper_config_write(fileStream, config); if (fclose(fileStream) == EOF) { log_error("Failed to write file \"%s\"", filePath); return false; } return success; } /* * keeper_config_write write the current config to given STREAM. */ bool keeper_config_write(FILE *stream, KeeperConfig *config) { IniOption keeperOptions[] = SET_INI_OPTIONS_ARRAY(config); return write_ini_to_stream(stream, keeperOptions); } /* * keeper_config_to_json populates given jsRoot object with the INI * configuration sections as JSON objects, and the options as keys to those * objects. */ bool keeper_config_to_json(KeeperConfig *config, JSON_Value *js) { JSON_Object *jsRoot = json_value_get_object(js); IniOption keeperOptions[] = SET_INI_OPTIONS_ARRAY(config); return ini_to_json(jsRoot, keeperOptions); } /* * keeper_config_log_settings outputs a DEBUG line per each config parameter in * the given KeeperConfig. 
*/ void keeper_config_log_settings(KeeperConfig config) { log_debug("pg_autoctl.monitor: %s", config.monitor_pguri); log_debug("pg_autoctl.formation: %s", config.formation); log_debug("postgresql.hostname: %s", config.hostname); log_debug("postgresql.nodekind: %s", config.nodeKind); log_debug("postgresql.pgdata: %s", config.pgSetup.pgdata); log_debug("postgresql.pg_ctl: %s", config.pgSetup.pg_ctl); log_debug("postgresql.version: %s", config.pgSetup.pg_version); log_debug("postgresql.username: %s", config.pgSetup.username); log_debug("postgresql.dbname: %s", config.pgSetup.dbname); log_debug("postgresql.host: %s", config.pgSetup.pghost); log_debug("postgresql.port: %d", config.pgSetup.pgport); log_debug("replication.replication_password: %s", config.replication_password); log_debug("replication.maximum_backup_rate: %s", config.maximum_backup_rate); } /* * keeper_config_get_setting returns the current value of the given option * "path" (thats a section.option string). The value is returned in the * pre-allocated value buffer of size size. */ bool keeper_config_get_setting(KeeperConfig *config, const char *path, char *value, size_t size) { const char *filename = config->pathnames.config; IniOption keeperOptions[] = SET_INI_OPTIONS_ARRAY(config); return ini_get_setting(filename, keeperOptions, path, value, size); } /* * keeper_config_set_setting sets the setting identified by "path" * (section.option) to the given value. The value is passed in as a string, * which is going to be parsed if necessary. 
*/ bool keeper_config_set_setting(KeeperConfig *config, const char *path, char *value) { const char *filename = config->pathnames.config; IniOption keeperOptions[] = SET_INI_OPTIONS_ARRAY(config); log_trace("keeper_config_set_setting: %s = %s", path, value); if (ini_set_setting(filename, keeperOptions, path, value)) { PostgresSetup pgSetup = { 0 }; bool missing_pgdata_is_ok = true; bool pg_is_not_running_is_ok = true; /* * Before merging given options, validate them as much as we can. * The ini level functions validate the syntax (strings, integers, * etc), not that the values themselves then make sense. */ if (pg_setup_init(&pgSetup, &config->pgSetup, missing_pgdata_is_ok, pg_is_not_running_is_ok)) { config->pgSetup = pgSetup; return true; } } return false; } /* * keeper_config_merge_options merges any option setup in options into config. * Its main use is to override configuration file settings with command line * options. */ bool keeper_config_merge_options(KeeperConfig *config, KeeperConfig *options) { IniOption keeperConfigOptions[] = SET_INI_OPTIONS_ARRAY(config); IniOption keeperOptionsOptions[] = SET_INI_OPTIONS_ARRAY(options); log_trace("keeper_config_merge_options"); if (ini_merge(keeperConfigOptions, keeperOptionsOptions)) { PostgresSetup pgSetup = { 0 }; bool missing_pgdata_is_ok = true; bool pg_is_not_running_is_ok = true; /* * Before merging given options, validate them as much as we can. The * ini level functions validate the syntax (strings, integers, etc), * not that the values themselves then make sense. 
*/ if (!pg_setup_init(&pgSetup, &config->pgSetup, missing_pgdata_is_ok, pg_is_not_running_is_ok)) { return false; } /* * Keep the whole set of values discovered in pg_setup_init from the * configuration file */ config->pgSetup = pgSetup; return keeper_config_write_file(config); } return false; } /* * keeper_config_update updates the configuration of the keeper once we are * registered and know our nodeId and group: then we can also set our * replication slot name and our backup directory using the nodeId. */ bool keeper_config_update(KeeperConfig *config, int64_t nodeId, int groupId) { config->groupId = groupId; (void) postgres_sprintf_replicationSlotName( nodeId, config->replication_slot_name, sizeof(config->replication_slot_name)); /* * Compute the backupDirectory from pgdata, or check the one given in the * configuration file already. */ if (!keeper_config_set_backup_directory(config, nodeId)) { /* errors have already been logged */ return false; } log_debug("keeper_config_update: backup directory = %s", config->backupDirectory); return keeper_config_write_file(config); } /* * keeper_config_init_nodekind initializes the config->nodeKind and * config->pgSetup.pgKind values from the configuration file or command line * options. * * We didn't implement the PgInstanceKind datatype in our INI primitives, so we * need to now to check the configuration values and then transform * config->nodeKind into config->pgSetup.pgKind. */ static bool keeper_config_init_nodekind(KeeperConfig *config) { if (IS_EMPTY_STRING_BUFFER(config->nodeKind)) { /* * If the configuration file lacks the pg_autoctl.nodekind key, it * means we're going to use the default: "standalone". */ strlcpy(config->nodeKind, "standalone", NAMEDATALEN); config->pgSetup.pgKind = NODE_KIND_STANDALONE; } else { config->pgSetup.pgKind = nodeKindFromString(config->nodeKind); /* * Now, NODE_KIND_UNKNOWN signals we failed to recognize selected node * kind, which is an error. 
*/ if (config->pgSetup.pgKind == NODE_KIND_UNKNOWN) { /* we already logged about it */ return false; } } return true; } /* * keeper_config_init_hbalevel initializes the config->pgSetup.hbaLevel and * hbaLevelStr when no command line option switch has been used that places a * value (see --auth, --skip-pg-hba, and --pg-hba-lan). */ static bool keeper_config_init_hbalevel(KeeperConfig *config) { /* * Turn the configuration string for hbaLevel into our enum value. */ if (IS_EMPTY_STRING_BUFFER(config->pgSetup.hbaLevelStr)) { strlcpy(config->pgSetup.hbaLevelStr, "minimal", NAMEDATALEN); } /* set the ENUM value for hbaLevel */ config->pgSetup.hbaLevel = pgsetup_parse_hba_level(config->pgSetup.hbaLevelStr); return true; } /* * keeper_config_set_backup_directory sets the pg_basebackup target directory * to ${PGDATA}/../backup/${hostname} by default. Adding the local hostname * makes it possible to run several instances of Postgres and pg_autoctl on the * same host, which is nice for development and testing scenarios. * * That said, when testing and maybe in other situations, it is custom to have * all the nodes sit on the same machine, and all be "localhost". To avoid any * double-usage of the backup directory, as soon as we have a nodeId we use * ${PGDATA/../backup/node_${nodeId} instead. */ static bool keeper_config_set_backup_directory(KeeperConfig *config, int64_t nodeId) { char *pgdata = config->pgSetup.pgdata; char subdirs[MAXPGPATH] = { 0 }; char backupDirectory[MAXPGPATH] = { 0 }; char absoluteBackupDirectory[PATH_MAX]; /* build the default hostname based backup directory path */ sformat(subdirs, MAXPGPATH, "backup/%s", config->hostname); path_in_same_directory(pgdata, subdirs, backupDirectory); /* * If the user didn't provide a backupDirectory and we're not registered * yet, just use the default value with the hostname. Don't even check it * now. 
*/ if (IS_EMPTY_STRING_BUFFER(config->backupDirectory) && nodeId <= 0) { strlcpy(config->backupDirectory, backupDirectory, MAXPGPATH); return true; } /* if we didn't have a backup directory yet, set one */ if (IS_EMPTY_STRING_BUFFER(config->backupDirectory) || strcmp(backupDirectory, config->backupDirectory) == 0) { /* we might be able to use the nodeId, better than the hostname */ if (nodeId > 0) { sformat(subdirs, MAXPGPATH, "backup/node_%" PRId64, nodeId); path_in_same_directory(pgdata, subdirs, backupDirectory); } strlcpy(config->backupDirectory, backupDirectory, MAXPGPATH); } /* * The best way to make sure we are allowed to create the backup directory * is to just go ahead and create it now. */ log_debug("mkdir -p \"%s\"", config->backupDirectory); if (!ensure_empty_dir(config->backupDirectory, 0700)) { log_fatal("Failed to create the backup directory \"%s\", " "see above for details", config->backupDirectory); return false; } /* Now get the realpath() of the directory we just created */ if (!realpath(config->backupDirectory, absoluteBackupDirectory)) { /* non-fatal error, just keep the computed or given directory path */ log_warn("Failed to get the realpath of backup directory \"%s\": %m", config->backupDirectory); return true; } if (strcmp(config->backupDirectory, absoluteBackupDirectory) != 0) { strlcpy(config->backupDirectory, absoluteBackupDirectory, MAXPGPATH); } return true; } /* * keeper_config_update_with_absolute_pgdata verifies that the pgdata path is * an absolute one If not, the config->pgSetup is updated and we rewrite the * config file */ bool keeper_config_update_with_absolute_pgdata(KeeperConfig *config) { PostgresSetup pgSetup = config->pgSetup; if (pg_setup_set_absolute_pgdata(&pgSetup)) { strlcpy(config->pgSetup.pgdata, pgSetup.pgdata, MAXPGPATH); if (!keeper_config_write_file(config)) { /* errors have already been logged */ return false; } } return true; } 
pg_auto_failover-1.6.3/src/bin/pg_autoctl/keeper_config.h000066400000000000000000000060021414244367200235240ustar00rootroot00000000000000/* * src/bin/pg_autoctl/keeper_config.h * Keeper configuration data structure and function definitions * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef KEEPER_CONFIG_H #define KEEPER_CONFIG_H #include #include #include "config.h" #include "defaults.h" #include "pgctl.h" #include "pgsql.h" /* * We support "primary" and "secondary" roles in Citus, when Citus support is * enabled. */ typedef enum { CITUS_ROLE_UNKNOWN = 0, CITUS_ROLE_PRIMARY, CITUS_ROLE_SECONDARY } CitusRole; typedef struct KeeperConfig { /* in-memory configuration related variables */ ConfigFilePaths pathnames; /* who's in charge? pg_auto_failover monitor, or a control plane? */ bool monitorDisabled; /* pg_autoctl setup */ char role[NAMEDATALEN]; char monitor_pguri[MAXCONNINFO]; char formation[NAMEDATALEN]; int groupId; char name[_POSIX_HOST_NAME_MAX]; char hostname[_POSIX_HOST_NAME_MAX]; char nodeKind[NAMEDATALEN]; /* PostgreSQL setup */ PostgresSetup pgSetup; /* PostgreSQL replication / tooling setup */ char replication_slot_name[MAXCONNINFO]; char replication_password[MAXCONNINFO]; char maximum_backup_rate[MAXIMUM_BACKUP_RATE_LEN]; char backupDirectory[MAXPGPATH]; /* Citus specific options and settings */ char citusRoleStr[NAMEDATALEN]; CitusRole citusRole; /* pg_autoctl timeouts */ int network_partition_timeout; int prepare_promotion_catchup; int prepare_promotion_walreceiver; int postgresql_restart_failure_timeout; int postgresql_restart_failure_max_retries; int listen_notifications_timeout; } KeeperConfig; #define PG_AUTOCTL_MONITOR_IS_DISABLED(config) \ (strcmp(config->monitor_pguri, PG_AUTOCTL_MONITOR_DISABLED) == 0) bool keeper_config_set_pathnames_from_pgdata(ConfigFilePaths *pathnames, const char *pgdata); void keeper_config_init(KeeperConfig *config, bool missingPgdataIsOk, bool 
pgIsNotRunningIsOk); bool keeper_config_read_file(KeeperConfig *config, bool missingPgdataIsOk, bool pgIsNotRunningIsOk, bool monitorDisabledIsOk); bool keeper_config_read_file_skip_pgsetup(KeeperConfig *config, bool monitorDisabledIsOk); bool keeper_config_pgsetup_init(KeeperConfig *config, bool missingPgdataIsOk, bool pgIsNotRunningIsOk); bool keeper_config_write_file(KeeperConfig *config); bool keeper_config_write(FILE *stream, KeeperConfig *config); bool keeper_config_to_json(KeeperConfig *config, JSON_Value *js); void keeper_config_log_settings(KeeperConfig config); bool keeper_config_get_setting(KeeperConfig *config, const char *path, char *value, size_t size); bool keeper_config_set_setting(KeeperConfig *config, const char *path, char *value); bool keeper_config_merge_options(KeeperConfig *config, KeeperConfig *options); bool keeper_config_update(KeeperConfig *config, int64_t nodeId, int groupId); bool keeper_config_update_with_absolute_pgdata(KeeperConfig *config); #endif /* KEEPER_CONFIG_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/keeper_pg_init.c000066400000000000000000001073121414244367200237110ustar00rootroot00000000000000/* * src/bin/pg_autoctl/keeper_init.c * Keeper initialisation. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include "cli_common.h" #include "debian.h" #include "defaults.h" #include "env_utils.h" #include "fsm.h" #include "keeper.h" #include "keeper_config.h" #include "keeper_pg_init.h" #include "log.h" #include "monitor.h" #include "parsing.h" #include "pgctl.h" #include "pghba.h" #include "pgsetup.h" #include "pgsql.h" #include "service_keeper_init.h" #include "signals.h" #include "state.h" /* * We keep track of the fact that we had non-fatal warnings during `pg_autoctl * keeper init`: in that case the init step is considered successful, yet users * have extra actions to take care of. 
* * The only such case supported as of now is failure to `master_activate_node`. * In that case the `pg_autoctl create` job is done: we have registered the * node to the monitor and the coordinator. The operator should now take action * to make it possible to activate the node, and those actions require a * running PostgreSQL instance. */ bool keeperInitWarnings = false; static bool keeper_pg_init_and_register_primary(Keeper *keeper); static bool reach_initial_state(Keeper *keeper); static bool exit_if_dropped(Keeper *keeper); static bool wait_until_primary_is_ready(Keeper *config, MonitorAssignedState *assignedState); static bool wait_until_primary_has_created_our_replication_slot(Keeper *keeper, MonitorAssignedState * assignedState); static bool keeper_pg_init_node_active(Keeper *keeper); /* * keeper_pg_init initializes a pg_autoctl keeper and its local PostgreSQL. * * Depending on whether we have a monitor or not in the config (see * --without-monitor), then we call into keeper_pg_init_and_register or * keeper_pg_init_fsm. */ bool keeper_pg_init(Keeper *keeper) { KeeperConfig *config = &(keeper->config); log_trace("keeper_pg_init: monitor is %s", config->monitorDisabled ? "disabled" : "enabled"); return service_keeper_init(keeper); } /* * keeper_pg_init_and_register initializes a pg_autoctl keeper and its local * PostgreSQL instance. Registering a PostgreSQL instance to the monitor is a 3 * states story: * * - register as INIT, the monitor decides your role (primary or secondary), * and the keeper only does that when the local PostgreSQL instance does not * exist yet. * * - register as SINGLE, when a PostgreSQL instance exists and is not in * recovery. * * - register as INIT then being assigned WAIT_STANDBY, then the keeper should * busy loop (every 1s or something) until the Primary state is WAIT_STANDBY, * so that we can pg_basebackup and move through the CATCHINGUP state. 
* * In any case, the Keeper implements the first transition after registration * directly, within the `pg_autoctl create` command itself, not waiting until * the first loop when the keeper service starts. Once `pg_autoctl create` is * done, PostgreSQL is known to be running in the proper state. */ bool keeper_pg_init_and_register(Keeper *keeper) { KeeperConfig *config = &(keeper->config); /* * The initial state we may register in depend on the current PostgreSQL * instance that might exist or not at PGDATA. */ PostgresSetup *pgSetup = &(config->pgSetup); bool postgresInstanceExists = pg_setup_pgdata_exists(pgSetup); bool postgresInstanceIsRunning = pg_setup_is_running(pgSetup); PostgresRole postgresRole = pg_setup_role(pgSetup); bool postgresInstanceIsPrimary = postgresRole == POSTGRES_ROLE_PRIMARY; if (postgresInstanceExists) { if (!keeper_ensure_pg_configuration_files_in_pgdata(pgSetup)) { log_fatal("Failed to setup your Postgres instance " "the PostgreSQL way, see above for details"); return false; } } /* * If we don't have a state file, we consider that we're initializing from * scratch and can move on, nothing to do here. */ if (file_exists(config->pathnames.init)) { return keeper_pg_init_continue(keeper); } /* * If we have a state file, we're either running the same command again * (such as pg_autoctl create postgres --run ...) or maybe the user has * changed their mind after having done a pg_autoctl drop node. 
*/ if (file_exists(config->pathnames.state)) { bool dropped = false; /* initialize our local Postgres instance representation */ LocalPostgresServer *postgres = &(keeper->postgres); (void) local_postgres_init(postgres, pgSetup); if (!keeper_ensure_node_has_been_dropped(keeper, &dropped)) { log_fatal("Failed to determine if node %d with current state \"%s\"" " in formation \"%s\" and group %d" " has been dropped from the monitor, see above for details", keeper->state.current_node_id, NodeStateToString(keeper->state.current_role), keeper->config.formation, keeper->config.groupId); return false; } if (dropped) { log_info("This node had been dropped previously, now trying to " "register it again"); } /* * If the node has not been dropped previously, then the state file * indicates a second run of pg_autoctl create postgres command, and * when given --run we start the service normally. * * If dropped is true, the node has been dropped in the past and the * user is trying to cancel the pg_autoctl drop node command by doing a * pg_autoctl create postgres command again. Just continue then. */ if (!dropped) { if (createAndRun) { if (!keeper_init(keeper, config)) { return false; } } else { log_fatal("The state file \"%s\" exists and " "there's no init in progress", config->pathnames.state); log_info("HINT: use `pg_autoctl run` to start the service."); exit(EXIT_CODE_QUIT); } return createAndRun; } } /* * When the monitor is disabled, we're almost done. All that is left is * creating a state file with our nodeId as from the --node-id parameter. * The value is found in the global variable monitorDisabledNodeId. 
*/ if (config->monitorDisabled) { return keeper_init_fsm(keeper); } char scrubbedConnectionString[MAXCONNINFO] = { 0 }; if (!parse_and_scrub_connection_string(config->monitor_pguri, scrubbedConnectionString)) { log_error("Failed to parse the monitor connection string"); return false; } /* * If the local Postgres instance does not exist, we have two possible * choices: either we're the only one in our group, or we are joining a * group that already exists. * * The situation is decided by the Monitor, which implements transaction * semantics and safe concurrency approach, needed here in case other * keeper are concurrently registering other nodes. * * So our strategy is to ask the monitor to pick a state for us and then * implement whatever was decided. After all PGDATA does not exist yet so * we can decide to either pg_ctl initdb or pg_basebackup to create it. */ if (!postgresInstanceExists) { if (!keeper_register_and_init(keeper, INIT_STATE)) { log_error("Failed to register the existing local Postgres node " "\"%s:%d\" running at \"%s\"" "to the pg_auto_failover monitor at %s, " "see above for details", config->hostname, config->pgSetup.pgport, config->pgSetup.pgdata, scrubbedConnectionString); return false; } log_info("Successfully registered as \"%s\" to the monitor.", NodeStateToString(keeper->state.assigned_role)); return reach_initial_state(keeper); } /* * Ok so there's already a Postgres instance that exists in $PGDATA. * * If it's running and is a primary, we can register it as it is and expect * a SINGLE state from the monitor. * * If it's running and is not a primary, we don't know how to handle the * situation yet: the already existing secondary is using its own * replication slot and primary conninfo string (with username, password, * SSL setup, etc). 
*/ if (postgresInstanceIsRunning) { if (postgresInstanceIsPrimary) { log_info("Registering Postgres system %" PRIu64 " running on port %d with pid %d found at \"%s\"", pgSetup->control.system_identifier, pgSetup->pidFile.port, pgSetup->pidFile.pid, pgSetup->pgdata); return keeper_pg_init_and_register_primary(keeper); } else { log_error("pg_autoctl doesn't know how to register an already " "existing standby server at the moment"); return false; } } /* * Ok so there's a Postgres instance that exists in $PGDATA and it's not * running at the moment. We have run pg_controldata on the instance and we * do have its system_identifier. Using it to register, we have two cases: * * - either we are the first node in our group and all is good, we can * register the current PGDATA as a SINGLE, maybe promoting it to being a * primary, * * - or a primary node already is registered in our group, and we are going * to join it as a secondary: that is only possible when the * system_identifier of the other nodes in the group are all the same, * which the monitor checks for us in a way that registration fails when * that's not the case. */ if (postgresInstanceExists && !postgresInstanceIsRunning) { log_info("Registering Postgres system %" PRIu64 " found at \"%s\"", pgSetup->control.system_identifier, pgSetup->pgdata); if (!keeper_register_and_init(keeper, INIT_STATE)) { log_error("Failed to register the existing local Postgres node " "\"%s:%d\" running at \"%s\"" "to the pg_auto_failover monitor at %s, " "see above for details", config->hostname, config->pgSetup.pgport, config->pgSetup.pgdata, scrubbedConnectionString); return false; } log_info("Successfully registered as \"%s\" to the monitor.", NodeStateToString(keeper->state.assigned_role)); return reach_initial_state(keeper); } /* unknown case, the logic above is faulty, at least admit we're defeated */ log_error("Failed to recognise the current initialisation environment"); log_debug("pg exists: %s", postgresInstanceExists ? 
"yes" : "no"); log_debug("pg is primary: %s", postgresInstanceIsPrimary ? "yes" : "no"); return false; } /* * keeper_pg_init_and_register_primary registers a local Postgres instance that * is known to be a primary: Postgres is running and SELECT pg_is_in_recovery() * returns false. */ static bool keeper_pg_init_and_register_primary(Keeper *keeper) { KeeperConfig *config = &(keeper->config); PostgresSetup *pgSetup = &(config->pgSetup); char absolutePgdata[PATH_MAX]; char scrubbedConnectionString[MAXCONNINFO] = { 0 }; if (!parse_and_scrub_connection_string(config->monitor_pguri, scrubbedConnectionString)) { log_error("Failed to parse the monitor connection string"); return false; } log_info("A postgres directory already exists at \"%s\", registering " "as a single node", realpath(pgSetup->pgdata, absolutePgdata)); /* register to the monitor in the expected state directly */ if (!keeper_register_and_init(keeper, SINGLE_STATE)) { log_error("Failed to register the existing local Postgres node " "\"%s:%d\" running at \"%s\"" "to the pg_auto_failover monitor at %s, " "see above for details", config->hostname, config->pgSetup.pgport, config->pgSetup.pgdata, scrubbedConnectionString); } log_info("Successfully registered as \"%s\" to the monitor.", NodeStateToString(keeper->state.assigned_role)); return reach_initial_state(keeper); } /* * keeper_pg_init_continue attempts to continue a `pg_autoctl create` that * failed through in the middle. A particular case of interest is trying to * init with a stale file lying around. * * When we initialize and register to the monitor, we create two files: the * init file and the state file. When the init is done, we remove the init file * and never create it again. Which means that when the init file exists, we * know we were interrupted in the middle of the init step, after having * registered to the monitor: that's when we create the init file. 
*/ bool keeper_pg_init_continue(Keeper *keeper) { KeeperStateData *keeperState = &(keeper->state); KeeperStateInit *initState = &(keeper->initState); KeeperConfig *config = &(keeper->config); /* initialize our keeper state and read the state file */ if (!keeper_init(keeper, config)) { /* errors have already been logged */ return false; } /* also read the init state file */ if (!keeper_init_state_read(initState, config->pathnames.init)) { log_fatal("Failed to restart from previous keeper init attempt"); log_info("HINT: use `pg_autoctl drop node` to retry in a clean state"); return false; } log_info("Continuing from a previous `pg_autoctl create` failed attempt"); log_info("PostgreSQL state at registration time was: %s", PreInitPostgreInstanceStateToString(initState->pgInitState)); /* * TODO: verify the information in the state file against the information * in the monitor and decide if it's stale or not. */ /* * Also update the groupId and replication slot name in the configuration * file, from the keeper state file: we might not have reached a point * where the configuration changes have been saved to disk in the previous * attempt. */ if (!keeper_config_update(&(keeper->config), keeperState->current_node_id, keeperState->current_group)) { log_error("Failed to update the configuration file with the groupId %d " "and the nodeId %d", keeperState->current_group, keeperState->current_node_id); return false; } /* * If we have an init file and the state file looks good, then the * operation that failed was removing the init state file. */ if (keeper->state.current_role == keeper->state.assigned_role && (keeper->state.current_role == SINGLE_STATE || keeper->state.current_role == CATCHINGUP_STATE)) { return unlink_file(config->pathnames.init); } if (config->monitorDisabled) { return true; } else { return reach_initial_state(keeper); } } /* * reach_initial_state implements the first FSM transition. 
* * When asked by the monitor to reach the WAIT_STANDBY state, we know we are * going to then move forward to the CATCHINGUP state, and this is the * interesting transition here: we might fail to setup the Streaming * Replication. * * Being nice to the user, we're going to implement that extra step during the * `pg_autoctl create` command, so that we can detect and fix any error before * sarting as a service. */ static bool reach_initial_state(Keeper *keeper) { KeeperConfig *config = &(keeper->config); log_trace("reach_initial_state: %s to %s", NodeStateToString(keeper->state.current_role), NodeStateToString(keeper->state.assigned_role)); /* * To move from current_role to assigned_role, we call in the FSM. */ if (!keeper_fsm_reach_assigned_state(keeper)) { /* errors have already been logged */ return false; } /* * We have extra work to do after the FSM transition is done. * * The goal here is to be as user friendly as possible: make sure that when * the initialization is done, our pg_auto_failover situation is as * expected. So we go the extra mile here. */ switch (keeper->state.assigned_role) { case CATCHINGUP_STATE: { /* * Well we're good then, there's nothing else for us to do. * * This might happen when doing `pg_autoctl create` on an already * initialized cluster, or when running the command for the second * time after fixing a glitch in the setup or the environment. */ break; } case WAIT_STANDBY_STATE: { /* * Now the transition from INIT_STATE to WAIT_STANDBY_STATE consist * of doing nothing on the keeper's side: we are just waiting until * the primary has updated its HBA setup with our hostname. 
*/ MonitorAssignedState assignedState = { 0 }; /* busy loop until we are asked to be in CATCHINGUP_STATE */ if (!wait_until_primary_is_ready(keeper, &assignedState)) { /* the node might have been dropped early */ return exit_if_dropped(keeper); } /* * Now that we are asked to catch up, it means the primary is ready * for us to pg_basebackup, which allows the local instance to then * reach goal state SECONDARY: */ if (!keeper_fsm_reach_assigned_state(keeper)) { /* * One reason why we failed to reach the CATCHING-UP state is * that we've been DROPPED while doing the pg_basebackup or * some other step of that migration. Check about that now. */ return exit_if_dropped(keeper); } /* * Because we did contact the monitor, we need to update our * partial local cache of the monitor's state. That updates the * cache both in memory and on-disk. */ if (!keeper_update_state(keeper, assignedState.nodeId, assignedState.groupId, assignedState.state, true)) { log_error("Failed to update keepers's state"); return false; } /* * We insist on using the realpath(3) for PGDATA in the config, and * now is a good time to check this, because we just created the * directory. */ if (!keeper_config_update_with_absolute_pgdata(&(keeper->config))) { /* errors have already been logged */ return false; } break; } case SINGLE_STATE: { /* it's all done in the INIT ➜ SINGLE transition now. */ break; } case REPORT_LSN_STATE: { /* all the work is done in the INIT ➜ REPORT_LSN transition */ break; } default: /* we don't support any other state at initialization time */ log_error("reach_initial_state: don't know how to read state %s", NodeStateToString(keeper->state.assigned_role)); return false; } /* * The initialization is done, publish the new current state to the * monitor. 
*/ if (!keeper_pg_init_node_active(keeper)) { /* errors have been logged already */ return false; } /* everything went fine, get rid of the init state file */ return unlink_file(config->pathnames.init); } /* * exit_if_dropped checks if the node has been dropped during its * initialization phase, and if that's the case, finished the DROP protocol and * exits with a specific exit code. */ static bool exit_if_dropped(Keeper *keeper) { bool dropped = false; if (!keeper_ensure_node_has_been_dropped(keeper, &dropped)) { log_fatal( "Failed to determine if node %d with current state \"%s\" " " in formation \"%s\" and group %d " "has been dropped from the monitor, see above for details", keeper->state.current_node_id, NodeStateToString(keeper->state.current_role), keeper->config.formation, keeper->config.groupId); return false; } if (dropped) { log_fatal("This node has been dropped from the monitor"); exit(EXIT_CODE_DROPPED); } return false; } /* * wait_until_primary_is_ready calls monitor_node_active every second until the * monitor tells us that we can move from our current state * (WAIT_STANDBY_STATE) to CATCHINGUP_STATE, which only happens when the * primary successfully prepared for Streaming Replication. 
*/ static bool wait_until_primary_is_ready(Keeper *keeper, MonitorAssignedState *assignedState) { bool pgIsRunning = false; int currentTLI = 1; char currrentLSN[PG_LSN_MAXLENGTH] = "0/0"; char *pgsrSyncState = ""; int errors = 0, tries = 0; bool firstLoop = true; /* wait until the primary is ready for us to pg_basebackup */ do { bool groupStateHasChanged = false; if (firstLoop) { firstLoop = false; } else { Monitor *monitor = &(keeper->monitor); KeeperStateData *keeperState = &(keeper->state); int timeoutMs = PG_AUTOCTL_KEEPER_SLEEP_TIME * 1000; (void) pgsql_prepare_to_wait(&(monitor->notificationClient)); (void) monitor_wait_for_state_change(monitor, keeper->config.formation, keeperState->current_group, keeperState->current_node_id, timeoutMs, &groupStateHasChanged); /* when no state change has been notified, close the connection */ if (!groupStateHasChanged && monitor->notificationClient.connectionStatementType == PGSQL_CONNECTION_MULTI_STATEMENT) { pgsql_finish(&(monitor->notificationClient)); } } if (!monitor_node_active(&(keeper->monitor), keeper->config.formation, keeper->state.current_node_id, keeper->state.current_group, keeper->state.current_role, pgIsRunning, currentTLI, currrentLSN, pgsrSyncState, assignedState)) { ++errors; log_warn("Failed to contact the monitor at \"%s\"", keeper->config.monitor_pguri); if (errors > 5) { log_error("Failed to contact the monitor 5 times in a row now, " "so we stop trying. 
You can do `pg_autoctl create` " "to retry and finish the local setup"); return false; } } /* if state has changed, we didn't wait for a full timeout */ if (!groupStateHasChanged) { ++tries; } /* if the node has been dropped while trying to init, exit early */ if (assignedState->state == DROPPED_STATE) { return false; } if (tries == 3) { log_info("Still waiting for the monitor to drive us to state \"%s\"", NodeStateToString(CATCHINGUP_STATE)); log_warn("Please make sure that the primary node is currently " "running `pg_autoctl run` and contacting the monitor."); } log_trace("wait_until_primary_is_ready: %s", NodeStateToString(assignedState->state)); } while (assignedState->state != CATCHINGUP_STATE); /* * Update our state with the result from the monitor now. */ if (!keeper_update_state(keeper, assignedState->nodeId, assignedState->groupId, assignedState->state, true)) { log_error("Failed to update keepers's state"); return false; } /* Now make sure the replication slot has been created on the primary */ return wait_until_primary_has_created_our_replication_slot(keeper, assignedState); } /* * wait_until_primary_has_created_our_replication_slot loops over querying the * primary server until it has created our replication slot. * * When assigned CATCHINGUP_STATE, in some cases the primary might not be ready * yet. That might happen when all the other standby nodes are in maintenance * and the primary is already in the WAIT_PRIMARY state. 
*/ static bool wait_until_primary_has_created_our_replication_slot(Keeper *keeper, MonitorAssignedState *assignedState) { int errors = 0, tries = 0; bool firstLoop = true; KeeperConfig *config = &(keeper->config); LocalPostgresServer *postgres = &(keeper->postgres); ReplicationSource *upstream = &(postgres->replicationSource); NodeAddress primaryNode = { 0 }; bool hasReplicationSlot = false; if (!keeper_get_primary(keeper, &primaryNode)) { /* errors have already been logged */ return false; } if (!standby_init_replication_source(postgres, &primaryNode, PG_AUTOCTL_REPLICA_USERNAME, config->replication_password, config->replication_slot_name, config->maximum_backup_rate, config->backupDirectory, NULL, /* no targetLSN */ config->pgSetup.ssl, assignedState->nodeId)) { /* can't happen at the moment */ return false; } do { if (asked_to_stop || asked_to_stop_fast || asked_to_quit) { return false; } if (firstLoop) { firstLoop = false; } else { sleep(PG_AUTOCTL_KEEPER_SLEEP_TIME); } if (!upstream_has_replication_slot(upstream, &(config->pgSetup), &hasReplicationSlot)) { ++errors; log_warn("Failed to contact the primary node " NODE_FORMAT, primaryNode.nodeId, primaryNode.name, primaryNode.host, primaryNode.port); if (errors > 5) { log_error("Failed to contact the primary 5 times in a row now, " "so we stop trying. 
You can do `pg_autoctl create` " "to retry and finish the local setup"); return false; } } ++tries; if (!hasReplicationSlot && tries == 3) { log_info("Still waiting for the to create our replication slot"); log_warn("Please make sure that the primary node is currently " "running `pg_autoctl run` and contacting the monitor."); } } while (!hasReplicationSlot); return true; } /* * create_database_and_extension does the following: * * - ensures PostgreSQL is running * - create the proper role with login * - to be able to fetch pg_hba.conf location and edit it for pg_autoctl * - then createdb pgSetup.dbname, which might not be postgres * - and restart PostgreSQL with the new setup, to make it active/current * - finally when pgKind is Citus, create the citus extension * * When pgKind is Citus, the setup we install in step 2 contains the * shared_preload_libraries = 'citus' entry, so we can proceed with create * extension citus after the restart. */ bool create_database_and_extension(Keeper *keeper) { KeeperConfig *config = &(keeper->config); PostgresSetup *pgSetup = &(config->pgSetup); LocalPostgresServer *postgres = &(keeper->postgres); PGSQL *pgsql = &(postgres->sqlClient); LocalPostgresServer initPostgres = { 0 }; PostgresSetup initPgSetup = { 0 }; bool missingPgdataIsOk = false; bool pgIsNotRunningIsOk = true; char hbaFilePath[MAXPGPATH]; log_trace("create_database_and_extension"); /* we didn't start PostgreSQL yet, also we just ran initdb */ sformat(hbaFilePath, MAXPGPATH, "%s/pg_hba.conf", pgSetup->pgdata); /* * The Postgres URI given to the user by our facility is going to use * --dbname and --hostname, as per the following command: * * $ pg_autoctl show uri --formation default * * We need to make it so that the user can actually use that connection * string with at least the --username used to create the database. 
*/ if (!pghba_ensure_host_rule_exists(hbaFilePath, pgSetup->ssl.active, HBA_DATABASE_DBNAME, pgSetup->dbname, pg_setup_get_username(pgSetup), config->hostname, pg_setup_get_auth_method(pgSetup), pgSetup->hbaLevel)) { log_error("Failed to edit \"%s\" to grant connections to \"%s\", " "see above for details", hbaFilePath, config->hostname); return false; } /* * When --pg-hba-lan is used, we also open the local network CIDR * connections for the given --username and --dbname. */ if (pgSetup->hbaLevel == HBA_EDIT_LAN) { if (!pghba_enable_lan_cidr(&keeper->postgres.sqlClient, keeper->config.pgSetup.ssl.active, HBA_DATABASE_DBNAME, keeper->config.pgSetup.dbname, keeper->config.hostname, pg_setup_get_username(pgSetup), pg_setup_get_auth_method(pgSetup), pgSetup->hbaLevel, pgSetup->pgdata)) { log_error("Failed to grant local network connections in HBA"); return false; } } /* * In test environments using PG_REGRESS_SOCK_DIR="" to disable unix socket * directory, we have to connect to the address from pghost. */ if (env_found_empty("PG_REGRESS_SOCK_DIR")) { log_info("Granting connection from \"%s\" in \"%s\"", pgSetup->pghost, hbaFilePath); /* Intended use is restricted to unit testing, hard-code "trust" here */ if (!pghba_ensure_host_rule_exists(hbaFilePath, pgSetup->ssl.active, HBA_DATABASE_ALL, NULL, /* all: no database name */ NULL, /* no username, "all" */ pgSetup->pghost, "trust", HBA_EDIT_MINIMAL)) { log_error("Failed to edit \"%s\" to grant connections to \"%s\", " "see above for details", hbaFilePath, pgSetup->pghost); return false; } } /* * Use the "template1" database in the next operations when connecting to * do the initial PostgreSQL configuration, and to create our database. We * certainly can't connect to our database until we've created it. 
*/ if (!pg_setup_init(&initPgSetup, pgSetup, missingPgdataIsOk, pgIsNotRunningIsOk)) { log_fatal("Failed to initialize newly created PostgreSQL instance," "see above for details"); return false; } strlcpy(initPgSetup.username, "", NAMEDATALEN); strlcpy(initPgSetup.dbname, "template1", NAMEDATALEN); local_postgres_init(&initPostgres, &initPgSetup); /* * When --ssl-self-signed has been used, now is the time to build a * self-signed certificate for the server. We place the certificate and * private key in $PGDATA/server.key and $PGDATA/server.crt */ if (!keeper_create_self_signed_cert(keeper)) { /* errors have already been logged */ return false; } /* publish our new pgSetup to the caller postgres state too */ postgres->postgresSetup.ssl = initPostgres.postgresSetup.ssl; /* * Ensure pg_stat_statements is available in the server extension dir used * to create the Postgres instance. We only search for the control file to * offer better diagnostics in the logs in case the following CREATE * EXTENSION fails. */ if (!find_extension_control_file(config->pgSetup.pg_ctl, "pg_stat_statements")) { log_warn("Failed to find extension control file for " "\"pg_stat_statements\""); } /* * Ensure citus extension is available in the server extension dir used to * create the Postgres instance. We only search for the control file to * offer better diagnostics in the logs in case the following CREATE * EXTENSION fails. */ if (IS_CITUS_INSTANCE_KIND(postgres->pgKind)) { if (!find_extension_control_file(config->pgSetup.pg_ctl, "citus")) { log_warn("Failed to find extension control file for \"citus\""); } } /* * Add pg_autoctl PostgreSQL settings, including Citus extension in * shared_preload_libraries when dealing with a Citus worker or coordinator * node. 
*/ if (!postgres_add_default_settings(&initPostgres, config->hostname)) { log_error("Failed to add default settings to newly initialized " "PostgreSQL instance, see above for details"); return false; } /* * Now start the database, we need to create our dbname and maybe the Citus * Extension too. */ if (!ensure_postgres_service_is_running(&initPostgres)) { log_error("Failed to start PostgreSQL, see above for details"); return false; } /* * If username was set in the setup and doesn't exist we need to create it. */ if (!IS_EMPTY_STRING_BUFFER(pgSetup->username)) { /* * Remove PGUSER from the environment when we want to create that very * user at bootstrap. */ char pguser[NAMEDATALEN] = { 0 }; if (!get_env_copy_with_fallback("PGUSER", pguser, NAMEDATALEN, "")) { /* errors have already been logged */ return false; } if (strcmp(pguser, pgSetup->username) == 0) { unsetenv("PGUSER"); } if (!pgsql_create_user(&initPostgres.sqlClient, pgSetup->username, NULL, /* password */ true, /* WITH login */ true, /* WITH superuser */ false, /* WITH replication */ -1)) /* connlimit */ { log_fatal("Failed to create role \"%s\"" ", see above for details", pgSetup->username); return false; } /* reinstall the PGUSER value now that the user has been created. */ if (strcmp(pguser, pgSetup->username) == 0) { setenv("PGUSER", pguser, 1); } } /* * Now, maybe create the database (if "postgres", it already exists). * * We need to connect to an existing database here, such as "template1", * and create our target database from there. 
*/ if (!IS_EMPTY_STRING_BUFFER(pgSetup->dbname)) { /* maybe create the database, skipping if it already exists */ log_info("CREATE DATABASE %s;", pgSetup->dbname); if (!pgsql_create_database(&initPostgres.sqlClient, pgSetup->dbname, pg_setup_get_username(pgSetup))) { log_error("Failed to create database %s with owner %s", pgSetup->dbname, pgSetup->username); return false; } } /* close the "template1" connection now */ pgsql_finish(&initPostgres.sqlClient); /* * Connect to Postgres as the system user to create extension: same user as * initdb with superuser privileges. * * Calling keeper_update_state will re-init our sqlClient to now connect * per the configuration settings, cleaning-up the local changes we made * before. */ if (!keeper_update_pg_state(keeper, LOG_ERROR)) { log_error("Failed to update the keeper's state from the local " "PostgreSQL instance, see above for details."); return false; } /* * Install the pg_stat_statements extension in that database, skipping if * the extension has already been installed. */ log_info("CREATE EXTENSION pg_stat_statements;"); if (!pgsql_create_extension(&(postgres->sqlClient), "pg_stat_statements")) { log_error("Failed to create extension pg_stat_statements"); return false; } /* * When initialiasing a PostgreSQL instance that's going to be used as a * Citus node, either a coordinator or a worker, we have to also create an * extension in a database that can be used by citus. */ if (IS_CITUS_INSTANCE_KIND(postgres->pgKind)) { /* * Now allow nodes on the same network to connect to the coordinator, * and the coordinator to connect to its workers. 
*/ if (!pghba_enable_lan_cidr(&initPostgres.sqlClient, pgSetup->ssl.active, HBA_DATABASE_DBNAME, pgSetup->dbname, config->hostname, pg_setup_get_username(pgSetup), pg_setup_get_auth_method(pgSetup), pgSetup->hbaLevel, NULL)) { log_error("Failed to grant local network connections in HBA"); return false; } /* * Install the citus extension in that database, skipping if the * extension has already been installed. */ log_info("CREATE EXTENSION %s;", CITUS_EXTENSION_NAME); if (!pgsql_create_extension(&(postgres->sqlClient), CITUS_EXTENSION_NAME)) { log_error("Failed to create extension %s", CITUS_EXTENSION_NAME); return false; } } /* and we're done with this connection. */ pgsql_finish(pgsql); return true; } /* * keeper_pg_init_node_active calls node_active() on the monitor, to publish * the state reached by the end of the initialization procedure of the node. */ static bool keeper_pg_init_node_active(Keeper *keeper) { MonitorAssignedState assignedState = { 0 }; /* * Save our local state before reporting it to the monitor. If we fail to * contact the monitor, we can always retry later. */ if (!keeper_store_state(keeper)) { /* * Errors have already been logged. * * Make sure we don't have a corrupted state file around, that could * prevent trying to init again and cause strange errors. */ unlink_file(keeper->config.pathnames.state); return false; } (void) keeper_update_pg_state(keeper, LOG_WARN); if (!monitor_node_active(&(keeper->monitor), keeper->config.formation, keeper->state.current_node_id, keeper->state.current_group, keeper->state.current_role, ReportPgIsRunning(keeper), keeper->postgres.postgresSetup.control.timeline_id, keeper->postgres.currentLSN, keeper->postgres.pgsrSyncState, &assignedState)) { log_error("Failed to contact the monitor to publish our " "current state \"%s\".", NodeStateToString(keeper->state.current_role)); return false; } /* * Now save the monitor's assigned state before being done with the init * step. 
If a transition is needed to reach that state, that's the job of * `pg_autoctl run` to make it happen now. That said, we should make * sure to record the monitor's answer in our local state before we give * control back to the user. */ if (!keeper_update_state(keeper, assignedState.nodeId, assignedState.groupId, assignedState.state, true)) { log_error("Failed to update keepers's state"); /* * Make sure we don't have a corrupted state file around, that could * prevent trying to init again and cause strange errors. */ unlink_file(keeper->config.pathnames.state); return false; } return true; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/keeper_pg_init.h000066400000000000000000000011061414244367200237100ustar00rootroot00000000000000/* * src/bin/pg_autoctl/keeper_init.h * Keeper configuration data structure and function definitions * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef KEEPER_INIT_H #define KEEPER_INIT_H #include #include "keeper.h" #include "keeper_config.h" extern bool keeperInitWarnings; bool keeper_pg_init(Keeper *keeper); bool keeper_pg_init_continue(Keeper *keeper); bool keeper_pg_init_and_register(Keeper *keeper); bool create_database_and_extension(Keeper *keeper); #endif /* KEEPER_INIT_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/lock_utils.c000066400000000000000000000213621414244367200230750ustar00rootroot00000000000000/* * src/bin/pg_autoctl/lock_utils.c * Implementations of utility functions for inter-process locking * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include #include #include "defaults.h" #include "file_utils.h" #include "env_utils.h" #include "lock_utils.h" #include "log.h" #include "pidfile.h" #include "string_utils.h" /* * See man semctl(2) */ #if defined(__linux__) union semun { int val; struct semid_ds *buf; unsigned short *array; }; #endif /* * semaphore_init creates or opens a named semaphore for the current process. * * We use the environment variable PG_AUTOCTL_SERVICE to signal when a process * is a child process of the main pg_autoctl supervisor so that we are able to * initialize our locking strategy before parsing the command line. After all, * we might have to log some output during the parsing itself. */ bool semaphore_init(Semaphore *semaphore) { if (env_exists(PG_AUTOCTL_LOG_SEMAPHORE)) { return semaphore_open(semaphore); } else { bool success = semaphore_create(semaphore); /* * Only the main process should unlink the semaphore at exit time. * * When we create a semaphore, ensure we put our semId in the expected * environment variable (PG_AUTOCTL_LOG_SEMAPHORE), and we assign the * current process' pid as the semaphore owner. * * When we open a pre-existing semaphore using PG_AUTOCTL_LOG_SEMAPHORE * as the semId, the semaphore owner is left to zero. * * The atexit(3) function that removes the semaphores only acts when * the owner is our current pid. That way, in case of an early failure * in execv(), the semaphore is not dropped from under the main * program. * * A typical way execv() would fail is when calling run_program() on a * pathname that does not exists. * * Per atexit(3) manual page: * * When a child process is created via fork(2), it inherits copies of * its parent's registrations. Upon a successful call to one of the * exec(3) functions, all registrations are removed. * * And that's why it's important that we don't remove the semaphore in * the atexit() cleanup function when a call to run_command() fails * early. 
*/ if (success) { IntString semIdString = intToString(semaphore->semId); setenv(PG_AUTOCTL_LOG_SEMAPHORE, semIdString.strValue, 1); } return success; } } /* * semaphore_finish closes or unlinks given semaphore. */ bool semaphore_finish(Semaphore *semaphore) { /* * At initialization time we either create a new semaphore and register * getpid() as the owner, or we open a previously existing semaphore from * its semId as found in our environment variable PG_AUTOCTL_LOG_SEMAPHORE. * * At finish time (called from the atexit(3) registry), we remove the * semaphore only when we are the owner of it. */ if (semaphore->owner == getpid()) { return semaphore_unlink(semaphore); } return true; } /* * semaphore_create creates a new semaphore with the value 1. */ bool semaphore_create(Semaphore *semaphore) { union semun semun; semaphore->owner = getpid(); semaphore->semId = semget(IPC_PRIVATE, 1, 0600); if (semaphore->semId < 0) { /* the semaphore_log_lock_function has not been set yet */ log_fatal("Failed to create semaphore: %m\n"); return false; } /* to see this log line, change the default log level in set_logger() */ log_trace("Created semaphore %d", semaphore->semId); semun.val = 1; if (semctl(semaphore->semId, 0, SETVAL, semun) < 0) { /* the semaphore_log_lock_function has not been set yet */ log_fatal("Failed to set semaphore %d/%d to value %d : %m\n", semaphore->semId, 0, semun.val); return false; } return true; } /* * semaphore_open opens our IPC_PRIVATE semaphore. * * We don't have a key for it, because we asked the kernel to create a new * semaphore set with the guarantee that it would not exist already. So we * re-use the semaphore identifier directly. * * We don't even have to call semget(2) here at all, because we share our * semaphore identifier in the environment directly. 
*/ bool semaphore_open(Semaphore *semaphore) { char semIdString[BUFSIZE] = { 0 }; /* ensure the owner is set to zero when we re-open an existing semaphore */ semaphore->owner = 0; if (!get_env_copy(PG_AUTOCTL_LOG_SEMAPHORE, semIdString, BUFSIZE)) { /* errors have already been logged */ return false; } if (!stringToInt(semIdString, &semaphore->semId)) { /* errors have already been logged */ return false; } /* to see this log line, change the default log level in set_logger() */ log_trace("Using semaphore %d", semaphore->semId); /* we have the semaphore identifier, no need to call semget(2), done */ return true; } /* * semaphore_unlink removes an existing named semaphore. */ bool semaphore_unlink(Semaphore *semaphore) { union semun semun; semun.val = 0; /* unused, but keep compiler quiet */ log_trace("ipcrm -s %d\n", semaphore->semId); if (semctl(semaphore->semId, 0, IPC_RMID, semun) < 0) { fformat(stderr, "Failed to remove semaphore %d: %m", semaphore->semId); return false; } return true; } /* * semaphore_cleanup is used when we find a stale PID file, to remove a * possibly left behind semaphore. The user could also use ipcs and ipcrm to * figure that out, if the stale pidfile does not exist anymore. 
*/ bool semaphore_cleanup(const char *pidfile) { Semaphore semaphore; long fileSize = 0L; char *fileContents = NULL; char *fileLines[BUFSIZE] = { 0 }; if (!file_exists(pidfile)) { return false; } if (!read_file(pidfile, &fileContents, &fileSize)) { return false; } int lineCount = splitLines(fileContents, fileLines, BUFSIZE); if (lineCount < PIDFILE_LINE_SEM_ID) { log_debug("Failed to cleanup the semaphore from stale pid file \"%s\": " "it contains %d lines, semaphore id is expected in line %d", pidfile, lineCount, PIDFILE_LINE_SEM_ID); free(fileContents); return false; } if (!stringToInt(fileLines[PIDFILE_LINE_SEM_ID], &(semaphore.semId))) { /* errors have already been logged */ free(fileContents); return false; } free(fileContents); log_trace("Read semaphore id %d from stale pidfile", semaphore.semId); return semaphore_unlink(&semaphore); } /* * semaphore_lock locks a semaphore (decrement count), blocking if count would * be < 0 */ bool semaphore_lock(Semaphore *semaphore) { int errStatus; struct sembuf sops; sops.sem_op = -1; /* decrement */ sops.sem_flg = SEM_UNDO; sops.sem_num = 0; /* * Note: if errStatus is -1 and errno == EINTR then it means we returned * from the operation prematurely because we were sent a signal. So we * try and lock the semaphore again. * * We used to check interrupts here, but that required servicing * interrupts directly from signal handlers. Which is hard to do safely * and portably. 
*/ do { errStatus = semop(semaphore->semId, &sops, 1); } while (errStatus < 0 && errno == EINTR); if (errStatus < 0) { fformat(stderr, "%d Failed to acquire a lock with semaphore %d: %m\n", getpid(), semaphore->semId); return false; } return true; } /* * semaphore_unlock unlocks a semaphore (increment count) */ bool semaphore_unlock(Semaphore *semaphore) { int errStatus; struct sembuf sops; sops.sem_op = 1; /* increment */ sops.sem_flg = SEM_UNDO; sops.sem_num = 0; /* * Note: if errStatus is -1 and errno == EINTR then it means we returned * from the operation prematurely because we were sent a signal. So we * try and unlock the semaphore again. Not clear this can really happen, * but might as well cope. */ do { errStatus = semop(semaphore->semId, &sops, 1); } while (errStatus < 0 && errno == EINTR); if (errStatus < 0) { fformat(stderr, "Failed to release a lock with semaphore %d: %m\n", semaphore->semId); return false; } return true; } /* * semaphore_log_lock_function integrates our semaphore facility with the * logging tool in use in this project. */ void semaphore_log_lock_function(void *udata, int mode) { Semaphore *semaphore = (Semaphore *) udata; /* * If locking/unlocking fails for some weird reason, we still want to log. * It's not so bad that we want to completely quit the program. * That's why we ignore the return values of semaphore_unlock and * semaphore_lock. */ switch (mode) { /* unlock */ case 0: { (void) semaphore_unlock(semaphore); break; } /* lock */ case 1: { (void) semaphore_lock(semaphore); break; } default: { fformat(stderr, "BUG: semaphore_log_lock_function called with mode %d", mode); exit(EXIT_CODE_INTERNAL_ERROR); } } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/lock_utils.h000066400000000000000000000015121414244367200230750ustar00rootroot00000000000000/* * src/bin/pg_autoctl/lock_utils.h * Utility functions for inter-process locking * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #ifndef LOCK_UTILS_H #define LOCK_UTILS_H #include #include #include #include typedef struct Semaphore { int semId; pid_t owner; } Semaphore; bool semaphore_init(Semaphore *semaphore); bool semaphore_finish(Semaphore *semaphore); bool semaphore_create(Semaphore *semaphore); bool semaphore_open(Semaphore *semaphore); bool semaphore_unlink(Semaphore *semaphore); bool semaphore_cleanup(const char *pidfile); bool semaphore_lock(Semaphore *semaphore); bool semaphore_unlock(Semaphore *semaphore); void semaphore_log_lock_function(void *udata, int mode); #endif /* LOCK_UTILS_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/main.c000066400000000000000000000127601414244367200216530ustar00rootroot00000000000000/* * src/bin/pg_autoctl/main.c * Main entry point for the pg_autoctl command-line tool * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include "postgres_fe.h" #include "cli_root.h" #include "env_utils.h" #include "keeper.h" #include "keeper_config.h" #include "lock_utils.h" #include "string_utils.h" #if (PG_VERSION_NUM >= 120000) #include "common/logging.h" #endif char pg_autoctl_argv0[MAXPGPATH]; char pg_autoctl_program[MAXPGPATH]; int pgconnect_timeout = 2; /* see also POSTGRES_CONNECT_TIMEOUT */ char *ps_buffer; /* will point to argv area */ size_t ps_buffer_size; /* space determined at run time */ size_t last_status_len; /* use to minimize length of clobber */ Semaphore log_semaphore = { 0 }; /* allows inter-process locking */ static void set_logger(void); static void log_semaphore_unlink_atexit(void); /* * Main entry point for the binary. */ int main(int argc, char **argv) { CommandLine command = root; /* allows changing process title in ps/top/ptree etc */ (void) init_ps_buffer(argc, argv); /* set our logging infrastructure */ (void) set_logger(); /* * Since PG 12, we need to call pg_logging_init before any calls to pg_log_* * otherwise, we get a segfault. 
Although we don't use pg_log_* directly, * functions from the common library such as rmtree do use them. * Logging change introduced in PG 12: https://git.postgresql.org/cgit/postgresql.git/commit/?id=cc8d41511721d25d557fc02a46c053c0a602fed0 */ #if (PG_VERSION_NUM >= 120000) pg_logging_init(argv[0]); #endif /* register our logging clean-up atexit */ atexit(log_semaphore_unlink_atexit); /* * When PG_AUTOCTL_DEBUG is set in the environment, provide the user * commands available to debug a pg_autoctl instance. */ if (env_exists(PG_AUTOCTL_DEBUG)) { command = root_with_debug; } /* * When PGCONNECT_TIMEOUT is set in the environment, keep a copy of it in * our own global variable pgconnect_timeout. We implement our own * connection retry policy and will change change the environment variable * setting when calling pg_basebackup and other tools anyway. */ if (env_exists("PGCONNECT_TIMEOUT")) { char env_pgtimeout[BUFSIZE] = { 0 }; if (get_env_copy("PGCONNECT_TIMEOUT", env_pgtimeout, BUFSIZE) > 0) { if (!stringToInt(env_pgtimeout, &pgconnect_timeout)) { log_warn("Failed to parse environment variable " "PGCONNECT_TIMEOUT value \"%s\" as a " "number of seconds (integer), " "using our default %d seconds instead", env_pgtimeout, pgconnect_timeout); } } } /* * We need to follow POSIX specifications for argument parsing, in * particular we want getopt() to stop as soon as it reaches a non option * in the command line. * * GNU and modern getopt() implementation will reorder the command * arguments, making a mess of our nice subcommands facility. * * Note that we call unsetenv("POSIXLY_CORRECT"); before parsing options * for commands that are the final sub-command of their chain and when we * might mix options and arguments. */ setenv("POSIXLY_CORRECT", "1", 1); /* * Stash away the argv[0] used to run this program and compute the realpath * of the program invoked, which we need at several places including when * preparing the systemd unit files. 
* * Note that we're using log_debug() in get_program_absolute_path and we * have not set the log level from the command line option parsing yet. We * hard-coded LOG_INFO as our log level. For now we won't see the log_debug * output, but as a developer you could always change the LOG_INFO to * LOG_DEBUG above and then see the message. * * When running pg_autoctl using valgrind we also want the subprocesses to * be run with valgrind. However, valgrind modifies the argv variables to * be the pg_autoctl binary, instead of the valgrind binary. So to make * sure subprocesses are spawned using valgrind, we allow overriding To * this program path detection using the PG_AUTOCTL_DEBUG_BIN_PATH * environment variable. */ strlcpy(pg_autoctl_argv0, argv[0], MAXPGPATH); if (env_exists("PG_AUTOCTL_DEBUG_BIN_PATH")) { if (!get_env_copy("PG_AUTOCTL_DEBUG_BIN_PATH", pg_autoctl_program, MAXPGPATH)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } } else if (!set_program_absolute_path(pg_autoctl_program, MAXPGPATH)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } if (!commandline_run(&command, argc, argv)) { exit(EXIT_CODE_BAD_ARGS); } return 0; } /* * set_logger creates our log semaphore, sets the logging utility aspects such * as using colors in an interactive terminal and the default log level. */ static void set_logger() { /* we're verbose by default */ log_set_level(LOG_INFO); /* * Log messages go to stderr. We use colours when stderr is being shown * directly to the user to make it easier to spot warnings and errors. */ log_use_colors(isatty(fileno(stderr))); /* initialize the semaphore used for locking log output */ if (!semaphore_init(&log_semaphore)) { exit(EXIT_CODE_INTERNAL_ERROR); } /* set our logging facility to use our semaphore as a lock mechanism */ (void) log_set_udata(&log_semaphore); (void) log_set_lock(&semaphore_log_lock_function); } /* * log_semaphore_unlink_atexit calls semaphore_unlink() atexit. 
*/ static void log_semaphore_unlink_atexit(void) { (void) semaphore_finish(&log_semaphore); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/monitor.c000066400000000000000000003760301414244367200224210ustar00rootroot00000000000000/* * src/bin/pg_autoctl/monitor.c * API for interacting with the monitor * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include #include "defaults.h" #include "env_utils.h" #include "log.h" #include "monitor.h" #include "monitor_config.h" #include "nodestate_utils.h" #include "parsing.h" #include "pgsql.h" #include "primary_standby.h" #include "signals.h" #include "string_utils.h" #define STR_ERRCODE_OBJECT_IN_USE "55006" #define STR_ERRCODE_EXCLUSION_VIOLATION "23P01" #define STR_ERRCODE_SERIALIZATION_FAILURE "40001" #define STR_ERRCODE_STATEMENT_COMPLETION_UNKNOWN "40003" #define STR_ERRCODE_DEADLOCK_DETECTED "40P01" #define STR_ERRCODE_UNDEFINED_OBJECT "42704" #define STR_ERRCODE_CLASS_INSUFFICIENT_RESOURCES "53" #define STR_ERRCODE_CLASS_PROGRAM_LIMIT_EXCEEDED "54" typedef struct NodeAddressParseContext { char sqlstate[SQLSTATE_LENGTH]; NodeAddress *node; bool parsedOK; } NodeAddressParseContext; typedef struct NodeAddressArrayParseContext { char sqlstate[SQLSTATE_LENGTH]; NodeAddressArray *nodesArray; bool parsedOK; } NodeAddressArrayParseContext; typedef struct MonitorEventsArrayParseContext { char sqlstate[SQLSTATE_LENGTH]; MonitorEventsArray *eventsArray; bool parsedOK; } MonitorEventsArrayParseContext; typedef struct MonitorAssignedStateParseContext { char sqlstate[SQLSTATE_LENGTH]; MonitorAssignedState *assignedState; bool parsedOK; } MonitorAssignedStateParseContext; typedef struct NodeReplicationSettingsParseContext { char sqlstate[SQLSTATE_LENGTH]; int candidatePriority; bool replicationQuorum; bool parsedOK; } NodeReplicationSettingsParseContext; typedef struct CurrentNodeStateContext { char sqlstate[SQLSTATE_LENGTH]; 
CurrentNodeStateArray *nodesArray; bool parsedOK; } CurrentNodeStateContext; typedef struct RemoveNodeContext { char sqlstate[SQLSTATE_LENGTH]; int64_t nodeId; int groupId; bool removed; bool parsedOK; } RemoveNodeContext; /* either "monitor" or "formation" */ #define CONNTYPE_LENGTH 10 typedef struct FormationURIParseContext { char sqlstate[SQLSTATE_LENGTH]; char connType[CONNTYPE_LENGTH]; char connName[BUFSIZE]; char connURI[BUFSIZE]; bool parsedOK; } FormationURIParseContext; typedef struct MonitorExtensionVersionParseContext { char sqlstate[SQLSTATE_LENGTH]; MonitorExtensionVersion *version; bool parsedOK; } MonitorExtensionVersionParseContext; static bool parseNode(PGresult *result, int rowNumber, NodeAddress *node); static void parseNodeResult(void *ctx, PGresult *result); static void parseNodeArray(void *ctx, PGresult *result); static void parseNodeState(void *ctx, PGresult *result); static void parseNodeReplicationSettings(void *ctx, PGresult *result); static bool parseCurrentNodeState(PGresult *result, int rowNumber, CurrentNodeState *nodeState); static bool parseCurrentNodeStateArray(CurrentNodeStateArray *nodesArray, PGresult *result); static void parseRemoveNodeContext(void *ctx, PGresult *result); static void getCurrentState(void *ctx, PGresult *result); static void printLastEvents(void *ctx, PGresult *result); static void getLastEvents(void *ctx, PGresult *result); static void printFormationSettings(void *ctx, PGresult *result); static void printFormationURI(void *ctx, PGresult *result); static void parseCoordinatorNode(void *ctx, PGresult *result); static void parseExtensionVersion(void *ctx, PGresult *result); static bool prepare_connection_to_current_system_user(Monitor *source, Monitor *target); /* * We have several function that consume monitor notification in different * ways. 
They all have many things in common: * * - they need to call pselect() and take care of signal processing and race * conditions * * - they need to filter out some of the notifications * * - they need to process the notifications that have not been filtered out. * * Both the filtering and the processing are specific to each top-level * function that needs to consumer monitor's notifications. */ typedef void (*NotificationProcessingFunction)(void *context, CurrentNodeState *nodeState); typedef struct LogNotificationContext { int logLevel; } LogNotificationContext; typedef struct ApplySettingsNotificationContext { char *formation; bool applySettingsTransitionInProgress; bool applySettingsTransitionDone; } ApplySettingsNotificationContext; typedef struct WaitUntilStateNotificationContext { char *formation; int groupId; NodeAddressHeaders *headers; NodeState targetState; bool failoverIsDone; bool firstLoop; } WaitUntilStateNotificationContext; typedef struct WaitUntilNodeStateNotificationContext { char *formation; int groupId; int64_t nodeId; NodeAddressHeaders *headers; NodeState *targetStates; int targetStatesLength; bool done; bool firstLoop; } WaitUntilNodeStateNotificationContext; typedef struct WaitForStateChangeNotificationContext { char *formation; int groupId; int64_t nodeId; bool stateHasChanged; } WaitForStateChangeNotificationContext; static bool monitor_process_notifications(Monitor *monitor, int timeoutMs, char *channels[], void *NotificationContext, NotificationProcessingFunction processor); /* * monitor_init initializes a Monitor struct to connect to the given * database URL. 
*/ bool monitor_init(Monitor *monitor, char *url) { log_trace("monitor_init: %s", url); if (!pgsql_init(&monitor->pgsql, url, PGSQL_CONN_MONITOR)) { /* URL must be invalid, pgsql_init logged an error */ return false; } if (!pgsql_init(&monitor->notificationClient, url, PGSQL_CONN_MONITOR)) { /* URL must be invalid, pgsql_init logged an error */ return false; } return true; } /* * monitor_setup_notifications sets the monitor Postgres client structure to * enable notification processing for a given groupId. */ void monitor_setup_notifications(Monitor *monitor, int groupId, int64_t nodeId) { monitor->notificationClient.notificationGroupId = groupId; monitor->notificationClient.notificationNodeId = nodeId; monitor->notificationClient.notificationReceived = false; /* install our notification handler */ monitor->notificationClient.notificationProcessFunction = &monitor_process_state_notification; } /* * monitor_has_received_notifications returns true when some notifications have * been received between the last call to either monitor_setup_notifications or * monitor_has_received_notifications. */ bool monitor_has_received_notifications(Monitor *monitor) { bool ret = monitor->notificationClient.notificationReceived; monitor->notificationClient.notificationReceived = false; return ret; } /* * monitor_process_state_notification processes a notification received on the * "state" channel from the monitor. 
*/ bool monitor_process_state_notification(int notificationGroupId, int64_t notificationNodeId, char *channel, char *payload) { CurrentNodeState nodeState = { 0 }; if (strcmp(channel, "state") != 0) { return false; } /* errors are logged by parse_state_notification_message */ if (parse_state_notification_message(&nodeState, payload)) { if (nodeState.groupId == notificationGroupId) { (void) nodestate_log(&nodeState, LOG_INFO, notificationNodeId); return true; } } return false; } /* * monitor_local_init initializes a Monitor struct to connect to the local * monitor postgres instance, for use from the pg_autoctl instance that manages * the monitor. */ bool monitor_local_init(Monitor *monitor) { MonitorConfig *mconfig = &(monitor->config); PostgresSetup *pgSetup = &(mconfig->pgSetup); char connInfo[MAXCONNINFO] = { 0 }; pg_setup_get_local_connection_string(pgSetup, connInfo); if (!pgsql_init(&monitor->pgsql, connInfo, PGSQL_CONN_LOCAL)) { /* URL must be invalid, pgsql_init logged an error */ return false; } if (!pgsql_init(&monitor->notificationClient, connInfo, PGSQL_CONN_LOCAL)) { /* URL must be invalid, pgsql_init logged an error */ return false; } return true; } /* * monitor_retryable_error returns true when we may retry our query. That's * mostly useful to CLI entry points such as pg_autoctl enable|disable * maintenance where it's better if we can retry in those rare cases. */ bool monitor_retryable_error(const char *sqlstate) { if (strcmp(sqlstate, STR_ERRCODE_SERIALIZATION_FAILURE) == 0) { return true; } if (strcmp(sqlstate, STR_ERRCODE_STATEMENT_COMPLETION_UNKNOWN) == 0) { return true; } if (strcmp(sqlstate, STR_ERRCODE_DEADLOCK_DETECTED) == 0) { return true; } if (strncmp(sqlstate, STR_ERRCODE_CLASS_INSUFFICIENT_RESOURCES, 2) == 0) { return true; } if (strncmp(sqlstate, STR_ERRCODE_CLASS_PROGRAM_LIMIT_EXCEEDED, 2) == 0) { return true; } return false; } /* * monitor_get_nodes gets the hostname and port of all the nodes in the given * group. 
*/ bool monitor_get_nodes(Monitor *monitor, char *formation, int groupId, NodeAddressArray *nodeArray) { PGSQL *pgsql = &monitor->pgsql; const char *sql = groupId == -1 ? "SELECT * FROM pgautofailover.get_nodes($1) ORDER BY node_id" : "SELECT * FROM pgautofailover.get_nodes($1, $2) ORDER BY node_id"; int paramCount = 1; Oid paramTypes[2] = { TEXTOID, INT4OID }; const char *paramValues[2] = { 0 }; NodeAddressArrayParseContext parseContext = { { 0 }, nodeArray, false }; paramValues[0] = formation; if (groupId > -1) { IntString myGroupIdString = intToString(groupId); ++paramCount; paramValues[1] = myGroupIdString.strValue; } if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &parseContext, parseNodeArray)) { log_error("Failed to get other nodes from the monitor while running " "\"%s\" with formation %s and group %d", sql, formation, groupId); return false; } if (!parseContext.parsedOK) { log_error("Failed to get the other nodes from the monitor while " "running \"%s\" with formation %s and group %d because " "it returned an unexpected result. " "See previous line for details.", sql, formation, groupId); return false; } return true; } /* * monitor_get_other_nodes_as_json gets the hostname and port of the other node * in the group and prints them out in JSON format. */ bool monitor_print_nodes_as_json(Monitor *monitor, char *formation, int groupId) { PGSQL *pgsql = &monitor->pgsql; SingleValueResultContext context = { { 0 }, PGSQL_RESULT_STRING, false }; const char *sql = groupId == -1 ? 
"SELECT jsonb_pretty(coalesce(jsonb_agg(row_to_json(nodes)), '[]'))" " FROM pgautofailover.get_nodes($1) as nodes" : "SELECT jsonb_pretty(coalesce(jsonb_agg(row_to_json(nodes)), '[]'))" " FROM pgautofailover.get_nodes($1, $2) as nodes"; int paramCount = 1; Oid paramTypes[2] = { TEXTOID, INT4OID }; const char *paramValues[2] = { 0 }; paramValues[0] = formation; if (groupId > -1) { IntString myGroupIdString = intToString(groupId); ++paramCount; paramValues[1] = myGroupIdString.strValue; } if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &context, &parseSingleValueResult)) { log_error("Failed to get the nodes from the monitor while running " "\"%s\" with formation %s and group %d", sql, formation, groupId); if (context.strVal) { free(context.strVal); } return false; } if (!context.parsedOk) { log_error("Failed to get the other nodes from the monitor while " "running \"%s\" with formation %s and group %d because " "it returned an unexpected result. " "See previous line for details.", sql, formation, groupId); if (context.strVal) { free(context.strVal); } return false; } fformat(stdout, "%s\n", context.strVal); free(context.strVal); return true; } /* * monitor_get_other_nodes gets the hostname and port of the other node in the * group. */ bool monitor_get_other_nodes(Monitor *monitor, int64_t myNodeId, NodeState currentState, NodeAddressArray *nodeArray) { PGSQL *pgsql = &monitor->pgsql; const char *sql = currentState == ANY_STATE ? "SELECT * FROM pgautofailover.get_other_nodes($1) " "ORDER BY node_id" : "SELECT * FROM pgautofailover.get_other_nodes($1, " "$2::pgautofailover.replication_state) " "ORDER BY node_id"; int paramCount = currentState == ANY_STATE ? 
1 : 2; Oid paramTypes[2] = { INT8OID, TEXTOID }; const char *paramValues[3] = { 0 }; NodeAddressArrayParseContext parseContext = { { 0 }, nodeArray, false }; IntString myNodeIdString = intToString(myNodeId); paramValues[0] = myNodeIdString.strValue; if (currentState != ANY_STATE) { paramValues[1] = NodeStateToString(currentState); } if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &parseContext, parseNodeArray)) { log_error("Failed to get other nodes from the monitor while running " "\"%s\" with node id %" PRId64, sql, myNodeId); return false; } if (!parseContext.parsedOK) { log_error("Failed to get the other nodes from the monitor while running " "\"%s\" with node id %" PRId64 " because it returned an unexpected result. " "See previous line for details.", sql, myNodeId); return false; } return true; } /* * monitor_print_other_nodes gets the other nodes from the monitor and then * prints them to stdout in a human-friendly tabular format. */ bool monitor_print_other_nodes(Monitor *monitor, int64_t myNodeId, NodeState currentState) { NodeAddressArray otherNodesArray; if (!monitor_get_other_nodes(monitor, myNodeId, currentState, &otherNodesArray)) { /* errors have already been logged */ return false; } (void) printNodeArray(&otherNodesArray); return true; } /* * monitor_print_other_node_as_json gets the hostname and port of the other * node in the group as a JSON string and prints it to given stream. */ bool monitor_print_other_nodes_as_json(Monitor *monitor, int64_t myNodeId, NodeState currentState) { PGSQL *pgsql = &monitor->pgsql; SingleValueResultContext context = { { 0 }, PGSQL_RESULT_STRING, false }; const char *sql = currentState == ANY_STATE ? 
"SELECT jsonb_pretty(coalesce(jsonb_agg(row_to_json(nodes)), '[]'))" " FROM pgautofailover.get_other_nodes($1) as nodes" : "SELECT jsonb_pretty(coalesce(jsonb_agg(row_to_json(nodes)), '[]'))" " FROM pgautofailover.get_other_nodes($1, " "$3::pgautofailover.replication_state) as nodes"; int paramCount = currentState == ANY_STATE ? 2 : 1; Oid paramTypes[2] = { INT8OID, TEXTOID }; const char *paramValues[2] = { 0 }; IntString myNodeIdString = intToString(myNodeId); paramValues[0] = myNodeIdString.strValue; if (currentState != ANY_STATE) { paramValues[1] = NodeStateToString(currentState); } if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &context, &parseSingleValueResult)) { log_error("Failed to get the other nodes from the monitor while running " "\"%s\" with node id %" PRId64, sql, myNodeId); if (context.strVal) { free(context.strVal); } return false; } if (!context.parsedOk) { log_error("Failed to get the other nodes from the monitor while running " "\"%s\" with node id %" PRId64 " because it returned an unexpected result. " "See previous line for details.", sql, myNodeId); if (context.strVal) { free(context.strVal); } return false; } fformat(stdout, "%s\n", context.strVal); free(context.strVal); return true; } /* * monitor_get_primary gets the primary node in a give formation and group. 
*/ bool monitor_get_primary(Monitor *monitor, char *formation, int groupId, NodeAddress *node) { PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT * FROM pgautofailover.get_primary($1, $2)"; int paramCount = 2; Oid paramTypes[2] = { TEXTOID, INT4OID }; const char *paramValues[2]; NodeAddressParseContext parseContext = { { 0 }, node, false }; IntString groupIdString = intToString(groupId); paramValues[0] = formation; paramValues[1] = groupIdString.strValue; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &parseContext, parseNodeResult)) { log_error( "Failed to get the primary node in the HA group from the monitor " "while running \"%s\" with formation \"%s\" and group ID %d", sql, formation, groupId); return false; } if (!parseContext.parsedOK) { log_error( "Failed to get the primary node from the monitor while running " "\"%s\" with formation \"%s\" and group ID %d because it returned an " "unexpected result. See previous line for details.", sql, formation, groupId); return false; } /* The monitor function pgautofailover.get_primary only returns 3 fields */ node->isPrimary = true; log_debug("The primary node returned by the monitor is node " NODE_FORMAT, node->nodeId, node->name, node->host, node->port); return true; } /* * monitor_get_coordinator gets the coordinator node in a given formation. 
*/ bool monitor_get_coordinator(Monitor *monitor, char *formation, CoordinatorNodeAddress *coordinatorNodeAddress) { PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT * FROM pgautofailover.get_coordinator($1)"; int paramCount = 1; Oid paramTypes[1] = { TEXTOID }; const char *paramValues[1]; NodeAddressParseContext parseContext = { { 0 }, &(coordinatorNodeAddress->node), false }; paramValues[0] = formation; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &parseContext, parseCoordinatorNode)) { log_error("Failed to get the coordinator node from the monitor, " "while running \"%s\" with formation \"%s\".", sql, formation); return false; } if (!parseContext.parsedOK) { log_error("Failed to get the coordinator node from the monitor " "while running \"%s\" with formation \"%s\" " "because it returned an unexpected result. " "See previous line for details.", sql, formation); return false; } if (parseContext.node == NULL) { log_error("Failed to get the coordinator node from the monitor: " "the monitor returned an empty result set, there's no " "known available coordinator node at this time in " "formation \"%s\"", formation); return false; } coordinatorNodeAddress->found = true; log_debug("The coordinator node returned by the monitor is %s:%d", coordinatorNodeAddress->node.host, coordinatorNodeAddress->node.port); return true; } /* * monitor_get_most_advanced_standby finds the standby node in state REPORT_LSN * with the most advanced LSN position. 
*/ bool monitor_get_most_advanced_standby(Monitor *monitor, char *formation, int groupId, NodeAddress *node) { PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT * FROM pgautofailover.get_most_advanced_standby($1, $2)"; int paramCount = 2; Oid paramTypes[2] = { TEXTOID, INT4OID }; const char *paramValues[2]; /* we expect a single entry */ NodeAddressArray nodeArray = { 0 }; NodeAddressArrayParseContext parseContext = { { 0 }, &nodeArray, false }; IntString groupIdString = intToString(groupId); paramValues[0] = formation; paramValues[1] = groupIdString.strValue; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &parseContext, parseNodeArray)) { log_error( "Failed to get most advanced standby node in the HA group " "from the monitor while running \"%s\" with " "formation \"%s\" and group ID %d", sql, formation, groupId); return false; } if (!parseContext.parsedOK || nodeArray.count != 1) { log_error( "Failed to get the most advanced standby node from the monitor " "while running \"%s\" with formation \"%s\" and group ID %d " "because it returned an unexpected result. " "See previous line for details.", sql, formation, groupId); return false; } /* copy the node we retrieved in the expected place */ node->nodeId = nodeArray.nodes[0].nodeId; strlcpy(node->name, nodeArray.nodes[0].name, _POSIX_HOST_NAME_MAX); strlcpy(node->host, nodeArray.nodes[0].host, _POSIX_HOST_NAME_MAX); node->port = nodeArray.nodes[0].port; strlcpy(node->lsn, nodeArray.nodes[0].lsn, PG_LSN_MAXLENGTH); node->isPrimary = nodeArray.nodes[0].isPrimary; log_debug("The most advanced standby node is node " NODE_FORMAT, node->nodeId, node->name, node->host, node->port); return true; } /* * monitor_register_node performs the initial registration of a node with the * monitor in the given formation. * * The caller can specify a desired group ID, which will result in the node * being added to the group unless it is already full. 
If the groupId is -1,
 * the monitor will pick a group.
 *
 * The initialState can be used to indicate that the operator wants to
 * initialize the node in a specific state directly. This can be useful to add
 * a standby to an already running primary node, doing the pg_basebackup
 * directly.
 *
 * The initialState can also be used to indicate that the node is already
 * correctly initialized in a particular state. This can be useful when
 * bringing back a keeper after replacing the monitor.
 *
 * The node ID and group ID selected by the monitor, as well as the goal
 * state, are set in assignedState, which must not be NULL.
 */
bool
monitor_register_node(Monitor *monitor, char *formation, char *name,
					  char *host, int port,
					  uint64_t system_identifier,
					  char *dbname,
					  int64_t desiredNodeId,
					  int desiredGroupId,
					  NodeState initialState,
					  PgInstanceKind kind,
					  int candidatePriority,
					  bool quorum,
					  char *citusClusterName,
					  bool *mayRetry,
					  MonitorAssignedState *assignedState)
{
	PGSQL *pgsql = &monitor->pgsql;
	const char *sql =
		"SELECT * FROM pgautofailover.register_node($1, $2, $3, $4, $5, $6, $7, "
		"$8, $9::pgautofailover.replication_state, $10, $11, $12, $13)";
	int paramCount = 13;
	Oid paramTypes[13] = {
		TEXTOID, TEXTOID, INT4OID, NAMEOID, TEXTOID, INT8OID, INT8OID,
		INT4OID, TEXTOID, TEXTOID, INT4OID, BOOLOID, TEXTOID
	};
	const char *paramValues[13];
	MonitorAssignedStateParseContext parseContext =
	{ { 0 }, assignedState, false };
	const char *nodeStateString = NodeStateToString(initialState);

	paramValues[0] = formation;
	paramValues[1] = host;

	/*
	 * NOTE(review): intToString() returns an IntString by value; keeping a
	 * pointer to its strValue past the full expression looks like it points
	 * into a dead temporary — confirm IntString's storage before relying on
	 * these paramValues remaining valid at execute time.
	 */
	paramValues[2] = intToString(port).strValue;
	paramValues[3] = dbname;
	paramValues[4] = name == NULL ? "" : name;
	paramValues[5] = intToString(system_identifier).strValue;
	paramValues[6] = intToString(desiredNodeId).strValue;
	paramValues[7] = intToString(desiredGroupId).strValue;
	paramValues[8] = nodeStateString;
	paramValues[9] = nodeKindToString(kind);
	paramValues[10] = intToString(candidatePriority).strValue;
	paramValues[11] = quorum ? "true" : "false";

	/* an empty cluster name means the default Citus cluster */
	paramValues[12] = IS_EMPTY_STRING_BUFFER(citusClusterName)
		? DEFAULT_CITUS_CLUSTER_NAME
		: citusClusterName;

	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   &parseContext, parseNodeState))
	{
		/* transient errors (and "object in use") are worth retrying */
		if (monitor_retryable_error(parseContext.sqlstate) ||
			strcmp(parseContext.sqlstate, STR_ERRCODE_OBJECT_IN_USE) == 0)
		{
			*mayRetry = true;
			return false;
		}
		else if (strcmp(parseContext.sqlstate,
						STR_ERRCODE_EXCLUSION_VIOLATION) == 0)
		{
			/* group already contains a node with another system_identifier */

			/* *INDENT-OFF* */
			log_error("Failed to register node %s:%d in "
					  "group %d of formation \"%s\" "
					  "with system_identifier %" PRIu64 ", "
					  "because another node already exists in this group with "
					  "another system_identifier",
					  host, port, desiredGroupId, formation,
					  system_identifier);
			/* *INDENT-ON* */

			log_info(
				"HINT: you may register a standby node from a non-existing "
				"PGDATA directory that pg_autoctl then creates for you, or "
				"PGDATA should be a copy of the current primary node such as "
				"obtained from a backup and recovery tool.");

			return false;
		}

		log_error("Failed to register node %s:%d in group %d of formation \"%s\" "
				  "with initial state \"%s\", see previous lines for details",
				  host, port, desiredGroupId, formation, nodeStateString);
		return false;
	}

	if (!parseContext.parsedOK)
	{
		log_error("Failed to register node %s:%d in group %d of formation \"%s\" "
				  "with initial state \"%s\" because the monitor returned an "
				  "unexpected result, see previous lines for details",
				  host, port, desiredGroupId, formation, nodeStateString);
		return false;
	}

	log_info("Registered node " NODE_FORMAT
			 "in formation \"%s\", group %d, state \"%s\"",
			 assignedState->nodeId,
			 assignedState->name,
			 host,
			 port,
			 formation,
			 assignedState->groupId,
			 NodeStateToString(assignedState->state));

	return true;
}


/*
 * monitor_node_active communicates the current state of the node to the
 * monitor and puts the new goal state to assignedState, which must not
 * be NULL.
*/
bool
monitor_node_active(Monitor *monitor,
					char *formation, int64_t nodeId, int groupId,
					NodeState currentState,
					bool pgIsRunning,
					int currentTLI,
					char *currentLSN,
					char *pgsrSyncState,
					MonitorAssignedState *assignedState)
{
	PGSQL *pgsql = &monitor->pgsql;
	const char *sql =
		"SELECT * FROM pgautofailover.node_active($1, $2, $3, "
		"$4::pgautofailover.replication_state, $5, $6, $7, $8)";
	int paramCount = 8;
	Oid paramTypes[8] = {
		TEXTOID, INT8OID, INT4OID, TEXTOID,
		BOOLOID, INT4OID, LSNOID, TEXTOID
	};
	const char *paramValues[8];
	MonitorAssignedStateParseContext parseContext =
	{ { 0 }, assignedState, false };
	const char *nodeStateString = NodeStateToString(currentState);

	paramValues[0] = formation;
	paramValues[1] = intToString(nodeId).strValue;
	paramValues[2] = intToString(groupId).strValue;
	paramValues[3] = nodeStateString;
	paramValues[4] = pgIsRunning ? "true" : "false";
	paramValues[5] = intToString(currentTLI).strValue;
	paramValues[6] = currentLSN;
	paramValues[7] = pgsrSyncState;

	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   &parseContext, parseNodeState))
	{
		log_error("Failed to get node state for node %" PRId64
				  " in group %d of formation \"%s\" with initial state "
				  "\"%s\", replication state \"%s\", "
				  "and current lsn \"%s\", "
				  "see previous lines for details",
				  nodeId,
				  groupId,
				  formation,
				  nodeStateString,
				  pgsrSyncState,
				  currentLSN);
		return false;
	}

	if (!parseContext.parsedOK)
	{
		log_error("Failed to get node state for node %" PRId64
				  " in group %d of formation "
				  "\"%s\" with initial state \"%s\", replication state \"%s\","
				  " and current lsn \"%s\""
				  " because the monitor returned an unexpected result, "
				  "see previous lines for details",
				  nodeId,
				  groupId,
				  formation,
				  nodeStateString,
				  pgsrSyncState,
				  currentLSN);
		return false;
	}

	return true;
}


/*
 * monitor_set_node_candidate_priority updates the monitor on the changes
 * in the node candidate priority.
*/ bool monitor_set_node_candidate_priority(Monitor *monitor, char *formation, char *name, int candidate_priority) { PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT pgautofailover.set_node_candidate_priority($1, $2, $3)"; int paramCount = 3; Oid paramTypes[3] = { TEXTOID, TEXTOID, INT4OID }; const char *paramValues[3]; char *candidatePriorityText = intToString(candidate_priority).strValue; bool success = true; paramValues[0] = formation; paramValues[1] = name, paramValues[2] = candidatePriorityText; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, NULL, NULL)) { log_error("Failed to update node candidate priority on node \"%s\"" "in formation \"%s\" for candidate_priority: \"%s\"", name, formation, candidatePriorityText); success = false; } return success; } /* * monitor_set_node_replication_quorum updates the monitor on the changes * in the node replication quorum. */ bool monitor_set_node_replication_quorum(Monitor *monitor, char *formation, char *name, bool replicationQuorum) { PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT pgautofailover.set_node_replication_quorum($1, $2, $3)"; int paramCount = 3; Oid paramTypes[3] = { TEXTOID, TEXTOID, BOOLOID }; const char *paramValues[3]; char *replicationQuorumText = replicationQuorum ? "true" : "false"; bool success = true; paramValues[0] = formation; paramValues[1] = name, paramValues[2] = replicationQuorumText; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, NULL, NULL)) { log_error("Failed to update node replication quorum on node \"%s\"" "in formation \"%s\" for replication_quorum: \"%s\"", name, formation, replicationQuorumText); success = false; } return success; } /* * monitor_get_node_replication_settings retrieves replication settings * from the monitor. 
*/
bool
monitor_get_node_replication_settings(Monitor *monitor,
									  NodeReplicationSettings *settings)
{
	PGSQL *pgsql = &monitor->pgsql;
	const char *sql =
		"SELECT candidatepriority, replicationquorum FROM pgautofailover.node "
		"WHERE nodename = $1";
	int paramCount = 1;
	Oid paramTypes[1] = { TEXTOID };
	const char *paramValues[1];

	/* candidatePriority starts at -1, replicationQuorum at false */
	NodeReplicationSettingsParseContext parseContext =
	{ { 0 }, -1, false, false };

	/* settings->name is both the lookup key and part of the result */
	paramValues[0] = settings->name;

	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   &parseContext,
								   parseNodeReplicationSettings))
	{
		log_error("Failed to retrieve node settings for node \"%s\".",
				  settings->name);

		return false;
	}

	if (!parseContext.parsedOK)
	{
		return false;
	}

	settings->candidatePriority = parseContext.candidatePriority;
	settings->replicationQuorum = parseContext.replicationQuorum;

	return true;
}


/*
 * parseNodeReplicationSettings parses nore replication settings
 * from query output.
 */
static void
parseNodeReplicationSettings(void *ctx, PGresult *result)
{
	NodeReplicationSettingsParseContext *context =
		(NodeReplicationSettingsParseContext *) ctx;
	int errors = 0;

	if (PQntuples(result) != 1)
	{
		log_error("Query returned %d rows, expected 1", PQntuples(result));
		context->parsedOK = false;
		return;
	}

	if (PQnfields(result) != 2)
	{
		log_error("Query returned %d columns, expected 2", PQnfields(result));
		context->parsedOK = false;
		return;
	}

	/* column 0: candidatepriority (int) */
	char *value = PQgetvalue(result, 0, 0);
	if (!stringToInt(value, &context->candidatePriority))
	{
		log_error("Invalid failover candidate priority \"%s\" "
				  "returned by monitor", value);
		++errors;
	}

	/* column 1: replicationquorum, a libpq boolean rendered as 't'/'f' */
	value = PQgetvalue(result, 0, 1);
	if (value == NULL || ((*value != 't') && (*value != 'f')))
	{
		log_error("Invalid replication quorum \"%s\" "
				  "returned by monitor", value);
		++errors;
	}
	else
	{
		context->replicationQuorum = (*value) == 't';
	}

	if (errors > 0)
	{
		context->parsedOK = false;
		return;
	}

	/* if we reach this line, then we're good. */
	context->parsedOK = true;
}


/*
 * monitor_get_formation_number_sync_standbys retrieves number-sync-standbys
 * property for formation from the monitor. The function returns true upon
 * success.
 */
bool
monitor_get_formation_number_sync_standbys(Monitor *monitor, char *formation,
										   int *numberSyncStandbys)
{
	PGSQL *pgsql = &monitor->pgsql;
	const char *sql =
		"SELECT number_sync_standbys FROM pgautofailover.formation "
		"WHERE formationid = $1";
	int paramCount = 1;
	Oid paramTypes[1] = { TEXTOID };
	const char *paramValues[1];
	SingleValueResultContext parseContext = { { 0 }, PGSQL_RESULT_INT, false };

	paramValues[0] = formation;

	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   &parseContext, parseSingleValueResult))
	{
		log_error("Failed to retrieve settings for formation \"%s\".",
				  formation);

		return false;
	}

	/* disconnect from monitor */
	pgsql_finish(&monitor->pgsql);

	if (!parseContext.parsedOk)
	{
		return false;
	}

	*numberSyncStandbys = parseContext.intVal;

	return true;
}


/*
 * monitor_set_formation_number_sync_standbys sets number-sync-standbys
 * property for formation at the monitor. The function returns true upon
 * success.
*/ bool monitor_set_formation_number_sync_standbys(Monitor *monitor, char *formation, int numberSyncStandbys) { PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT pgautofailover.set_formation_number_sync_standbys($1, $2)"; int paramCount = 2; Oid paramTypes[2] = { TEXTOID, INT4OID }; const char *paramValues[2]; SingleValueResultContext parseContext = { { 0 }, PGSQL_RESULT_BOOL, false }; paramValues[0] = formation; paramValues[1] = intToString(numberSyncStandbys).strValue; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &parseContext, parseSingleValueResult)) { log_error("Failed to update number-sync-standbys for formation \"%s\".", formation); return false; } if (!parseContext.parsedOk) { return false; } return parseContext.boolVal; } /* * monitor_remove_by_hostname calls the pgautofailover.monitor_remove function * on the monitor. */ bool monitor_remove_by_hostname(Monitor *monitor, char *host, int port, bool force, int64_t *nodeId, int *groupId) { RemoveNodeContext context = { 0 }; PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT nodeid, groupid, pgautofailover.remove_node($1, $2, $3) " " FROM pgautofailover.node" " WHERE nodehost = $1 and nodeport = $2"; int paramCount = 3; Oid paramTypes[3] = { TEXTOID, INT4OID, BOOLOID }; const char *paramValues[3]; paramValues[0] = host; paramValues[1] = intToString(port).strValue; paramValues[2] = force ? 
"true" : "false"; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &context, &parseRemoveNodeContext)) { /* if we fail to find the node we want to remove, we're good */ if (strcmp(context.sqlstate, STR_ERRCODE_UNDEFINED_OBJECT) == 0) { return true; } log_error("Failed to remove node %s:%d from the monitor", host, port); return false; } if (!context.parsedOK) { log_error("Failed to remove node %s:%d from the monitor: " "could not parse monitor's result.", host, port); return false; } /* * We ignore the return value of pgautofailover.remove_node: * - if it's true, then the node has been removed * - if it's false, then the node didn't exist in the first place * * The only case where we return false here is when we failed to run the * pgautofailover.remove_node function on the monitor, see above. */ *nodeId = context.nodeId; *groupId = context.groupId; return true; } /* * monitor_remove_by_nodename calls the pgautofailover.monitor_remove function * on the monitor. */ bool monitor_remove_by_nodename(Monitor *monitor, char *formation, char *name, bool force, int64_t *nodeId, int *groupId) { RemoveNodeContext context = { 0 }; PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT nodeid, groupid, pgautofailover.remove_node(nodeid::int, $3) " " FROM pgautofailover.node" " WHERE formationid = $1 and nodename = $2"; int paramCount = 3; Oid paramTypes[3] = { TEXTOID, TEXTOID }; const char *paramValues[3] = { formation, name, force ? 
"true" : "false" }; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &context, &parseRemoveNodeContext)) { log_error("Failed to remove node \"%s\" in formation \"%s\" " "from the monitor", name, formation); return false; } if (!context.parsedOK) { log_error("Failed to remove node \"%s\" in formation \"%s\" " "from the monitor", name, formation); return false; } else if (!context.parsedOK) { log_error("Failed to remove node \"%s\" in formation \"%s\" " "from the monitor: could not parse monitor's result.", name, formation); return false; } /* * We ignore the return value of pgautofailover.remove_node: * - if it's true, then the node has been removed * - if it's false, then the node didn't exist in the first place * * The only case where we return false here is when we failed to run the * pgautofailover.remove_node function on the monitor, see above. */ *nodeId = context.nodeId; *groupId = context.groupId; return true; } /* * parseRemoveNodeContext parses a nodeid and groupid, and the result of the * monitor's function call pgautofailover.remove_node which is a boolean. 
*/
static void
parseRemoveNodeContext(void *ctx, PGresult *result)
{
	int errors = 0;
	RemoveNodeContext *context = (RemoveNodeContext *) ctx;

	context->parsedOK = false;

	/* zero rows means the target node was not found on the monitor */
	if (PQntuples(result) == 0)
	{
		log_error("Failed to find the node to remove on the monitor");
		context->parsedOK = false;
		return;
	}
	else if (PQntuples(result) != 1)
	{
		log_error("Query returned %d rows, expected 1", PQntuples(result));
		context->parsedOK = false;
		return;
	}

	/* expected columns: nodeid, groupid, remove_node() boolean */
	if (PQnfields(result) != 3)
	{
		log_error("Query returned %d columns, expected 3", PQnfields(result));
		context->parsedOK = false;
		return;
	}

	char *value = PQgetvalue(result, 0, 0);

	if (!stringToInt64(value, &context->nodeId))
	{
		log_error("Invalid node ID \"%s\" returned by monitor", value);
		++errors;
	}

	value = PQgetvalue(result, 0, 1);

	if (!stringToInt(value, &context->groupId))
	{
		log_error("Invalid group ID \"%s\" returned by monitor", value);
		++errors;
	}

	/* libpq renders a boolean as 't' or 'f' */
	value = PQgetvalue(result, 0, 2);

	if (value == NULL || ((*value != 't') && (*value != 'f')))
	{
		log_error("Invalid boolean value \"%s\" returned by monitor", value);
		++errors;
	}
	else
	{
		context->removed = (*value) == 't';
	}

	if (errors > 0)
	{
		context->parsedOK = false;
		return;
	}

	/* if we reach this line, then we're good. */
	context->parsedOK = true;
}


/*
 * monitor_count_groups counts how many groups we have in this formation, and
 * sets the obtained value in the groupsCount parameter.
*/ bool monitor_count_groups(Monitor *monitor, char *formation, int *groupsCount) { SingleValueResultContext context = { { 0 }, PGSQL_RESULT_INT, false }; PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT count(distinct(groupid)) " "FROM pgautofailover.node " "WHERE formationid = $1"; int paramCount = 1; Oid paramTypes[1] = { TEXTOID }; const char *paramValues[1] = { formation }; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &context, &parseSingleValueResult)) { log_error("Failed to get how many groups are in formation %s", formation); return false; } *groupsCount = context.intVal; return true; } /* * monitor_get_groupId_from_name returns the groupId that belongs to a node * identified by name. */ bool monitor_get_groupId_from_name(Monitor *monitor, char *formation, char *name, int *groupId) { SingleValueResultContext context = { { 0 }, PGSQL_RESULT_INT, false }; PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT groupid FROM pgautofailover.node " "WHERE formationid = $1 and nodename = $2"; int paramCount = 2; Oid paramTypes[2] = { TEXTOID, TEXTOID }; const char *paramValues[2] = { formation, name }; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &context, &parseSingleValueResult)) { log_error("Failed to retrieve groupId for node \"%s\" in formation \"%s\"", name, formation); return false; } *groupId = context.intVal; return true; } /* * monitor_perform_failover calls the pgautofailover.monitor_perform_failover * function on the monitor. */ bool monitor_perform_failover(Monitor *monitor, char *formation, int group) { PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT pgautofailover.perform_failover($1, $2)"; int paramCount = 2; Oid paramTypes[2] = { TEXTOID, INT4OID }; const char *paramValues[2]; paramValues[0] = formation; paramValues[1] = intToString(group).strValue; /* * pgautofailover.perform_failover() returns VOID. 
*/ if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, NULL, NULL)) { log_error("Failed to perform failover for formation %s and group %d", formation, group); return false; } return true; } /* * monitor_perform_promotion calls the pgautofailover.perform_promotion * function on the monitor. */ bool monitor_perform_promotion(Monitor *monitor, char *formation, char *name) { PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT pgautofailover.perform_promotion($1, $2)"; int paramCount = 2; Oid paramTypes[2] = { TEXTOID, TEXTOID }; const char *paramValues[2] = { formation, name }; SingleValueResultContext context = { { 0 }, PGSQL_RESULT_BOOL, false }; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &context, &parseSingleValueResult)) { log_error("Failed to perform failover for node %s in formation %s", name, formation); return false; } if (!context.parsedOk) { log_error( "Failed to call pgautofailover.perform_promotion(\"%s\", \"%s\") " "on the monitor: it returned an unexpected result. " "See previous line for details.", formation, name); return false; } return context.boolVal; } /* * parseNode parses a hostname and a port from the libpq result and writes * it to the NodeAddressParseContext pointed to by ctx. 
*/
static bool
parseNode(PGresult *result, int rowNumber, NodeAddress *node)
{
	/* the first four columns (nodeid, name, host, port) must not be NULL */
	if (PQgetisnull(result, rowNumber, 0) ||
		PQgetisnull(result, rowNumber, 1) ||
		PQgetisnull(result, rowNumber, 2) ||
		PQgetisnull(result, rowNumber, 3))
	{
		log_error("NodeId, nodename, hostname or port returned by monitor is NULL");
		return false;
	}

	char *value = PQgetvalue(result, rowNumber, 0);

	/* base 0 lets strtol accept decimal, octal, and hex input */
	node->nodeId = strtol(value, NULL, 0);
	if (node->nodeId == 0)
	{
		log_error("Invalid nodeId \"%s\" returned by monitor", value);
		return false;
	}

	value = PQgetvalue(result, rowNumber, 1);
	int length = strlcpy(node->name, value, _POSIX_HOST_NAME_MAX);
	if (length >= _POSIX_HOST_NAME_MAX)
	{
		log_error("Node name \"%s\" returned by monitor is %d characters, "
				  "the maximum supported by pg_autoctl is %d",
				  value, length, _POSIX_HOST_NAME_MAX - 1);
		return false;
	}

	value = PQgetvalue(result, rowNumber, 2);
	length = strlcpy(node->host, value, _POSIX_HOST_NAME_MAX);
	if (length >= _POSIX_HOST_NAME_MAX)
	{
		log_error("Hostname \"%s\" returned by monitor is %d characters, "
				  "the maximum supported by pg_autoctl is %d",
				  value, length, _POSIX_HOST_NAME_MAX - 1);
		return false;
	}

	value = PQgetvalue(result, rowNumber, 3);
	if (!stringToInt(value, &node->port) || node->port == 0)
	{
		log_error("Invalid port number \"%s\" returned by monitor", value);
		return false;
	}

	/*
	 * pgautofailover.get_other_nodes also returns the LSN and is_primary bits
	 * of information.
	 */
	if (PQnfields(result) == 6)
	{
		/* we trust Postgres pg_lsn data type to fit in our PG_LSN_MAXLENGTH */
		value = PQgetvalue(result, rowNumber, 4);
		strlcpy(node->lsn, value, PG_LSN_MAXLENGTH);

		value = PQgetvalue(result, rowNumber, 5);
		node->isPrimary = strcmp(value, "t") == 0;
	}

	return true;
}


/*
 * parseNodeResult parses a single node row from the libpq result and writes
 * it to the NodeAddressParseContext pointed to by ctx.
*/
static void
parseNodeResult(void *ctx, PGresult *result)
{
	NodeAddressParseContext *context = (NodeAddressParseContext *) ctx;

	if (PQntuples(result) != 1)
	{
		log_error("Query returned %d rows, expected 1", PQntuples(result));
		context->parsedOK = false;
		return;
	}

	if (PQnfields(result) != 4)
	{
		/* fix: the message said "expected 3" but the condition tests for 4 */
		log_error("Query returned %d columns, expected 4", PQnfields(result));
		context->parsedOK = false;
		return;
	}

	context->parsedOK = parseNode(result, 0, context->node);
}


/*
 * parseNodeArray parses an array of nodes from the libpq result and writes
 * it to the NodeAddressArrayParseContext pointed to by ctx.
 */
static void
parseNodeArray(void *ctx, PGresult *result)
{
	bool parsedOk = true;
	int rowNumber = 0;
	NodeAddressArrayParseContext *context = (NodeAddressArrayParseContext *) ctx;

	log_debug("parseNodeArray: %d", PQntuples(result));

	/* keep a NULL entry to mark the end of the array */
	if (PQntuples(result) > NODE_ARRAY_MAX_COUNT)
	{
		log_error("Query returned %d rows, pg_auto_failover supports only up "
				  "to %d standby nodes at the moment",
				  PQntuples(result), NODE_ARRAY_MAX_COUNT);
		context->parsedOK = false;
		return;
	}

	/* pgautofailover.get_other_nodes returns 6 columns */
	if (PQnfields(result) != 6)
	{
		log_error("Query returned %d columns, expected 6", PQnfields(result));
		context->parsedOK = false;
		return;
	}

	context->nodesArray->count = PQntuples(result);

	for (rowNumber = 0; rowNumber < PQntuples(result); rowNumber++)
	{
		NodeAddress *node = &(context->nodesArray->nodes[rowNumber]);

		parsedOk = parsedOk && parseNode(result, rowNumber, node);
	}

	context->parsedOK = parsedOk;
}


/*
 * parseNodeState parses a node state coming back from a call to
 * register_node or node_active.
*/
static void
parseNodeState(void *ctx, PGresult *result)
{
	MonitorAssignedStateParseContext *context =
		(MonitorAssignedStateParseContext *) ctx;
	int errors = 0;

	if (PQntuples(result) != 1)
	{
		log_error("Query returned %d rows, expected 1", PQntuples(result));
		context->parsedOK = false;
		return;
	}

	/*
	 * We re-use the same data structure for register_node and node_active,
	 * where the former adds the nodename to its result.
	 */
	if (PQnfields(result) != 5 && PQnfields(result) != 6)
	{
		log_error("Query returned %d columns, expected 5 or 6",
				  PQnfields(result));
		context->parsedOK = false;
		return;
	}

	/* column 0: assigned node id */
	char *value = PQgetvalue(result, 0, 0);

	if (!stringToInt64(value, &context->assignedState->nodeId))
	{
		log_error("Invalid node ID \"%s\" returned by monitor", value);
		++errors;
	}

	/* column 1: assigned group id */
	value = PQgetvalue(result, 0, 1);

	if (!stringToInt(value, &context->assignedState->groupId))
	{
		log_error("Invalid group ID \"%s\" returned by monitor", value);
		++errors;
	}

	/* column 2: assigned (goal) replication state */
	value = PQgetvalue(result, 0, 2);
	context->assignedState->state = NodeStateFromString(value);

	if (context->assignedState->state == NO_STATE)
	{
		log_error("Invalid node state \"%s\" returned by monitor", value);
		++errors;
	}

	/* column 3: candidate priority */
	value = PQgetvalue(result, 0, 3);

	if (!stringToInt(value, &context->assignedState->candidatePriority))
	{
		log_error("Invalid failover candidate priority \"%s\" "
				  "returned by monitor", value);
		++errors;
	}

	/* column 4: replication quorum, a libpq boolean ('t' or 'f') */
	value = PQgetvalue(result, 0, 4);

	if (value == NULL || ((*value != 't') && (*value != 'f')))
	{
		log_error("Invalid replication quorum \"%s\" "
				  "returned by monitor", value);
		++errors;
	}
	else
	{
		context->assignedState->replicationQuorum = (*value) == 't';
	}

	if (errors > 0)
	{
		context->parsedOK = false;
		return;
	}

	/* column 5 (register_node only): the node name picked by the monitor */
	if (PQnfields(result) == 6)
	{
		value = PQgetvalue(result, 0, 5);
		strlcpy(context->assignedState->name, value,
				sizeof(context->assignedState->name));
	}

	/* if we reach this line, then we're good. */
	context->parsedOK = true;
}


/*
 * monitor_print_state calls the function pgautofailover.current_state on the
 * monitor, and prints a line of output per state record obtained.
 */
bool
monitor_print_state(Monitor *monitor, char *formation, int group)
{
	CurrentNodeStateArray nodesArray = { 0 };
	NodeAddressHeaders *headers = &(nodesArray.headers);
	PgInstanceKind firstNodeKind = NODE_KIND_UNKNOWN;

	if (!monitor_get_current_state(monitor, formation, group, &nodesArray))
	{
		/* errors have already been logged */
		return false;
	}

	/* the first node's kind drives the column headers used for all rows */
	if (nodesArray.count > 0)
	{
		firstNodeKind = nodesArray.nodes[0].pgKind;
	}

	(void) nodestatePrepareHeaders(&nodesArray, firstNodeKind);
	(void) nodestatePrintHeader(headers);

	for (int position = 0; position < nodesArray.count; position++)
	{
		CurrentNodeState *nodeState = &(nodesArray.nodes[position]);

		(void) nodestatePrintNodeState(headers, nodeState);
	}

	fformat(stdout, "\n");

	return true;
}


/*
 * monitor_get_current_state gets the current state of a formation in the given
 * pre-allocated nodesArray. When group is -1, the state of all the nodes that
 * belong to the formation is retrieved. When group is 0 or more, the state for
 * only the nodes that belong to the given group in the given formation is
 * retrieved.
*/
bool
monitor_get_current_state(Monitor *monitor, char *formation, int group,
						  CurrentNodeStateArray *nodesArray)
{
	CurrentNodeStateContext context = { { 0 }, nodesArray, false };
	PGSQL *pgsql = &monitor->pgsql;
	char *sql = NULL;
	int paramCount = 0;
	Oid paramTypes[2];
	const char *paramValues[2];
	IntString groupStr;

	/* fix: the trace label named monitor_print_state, a different function */
	log_trace("monitor_get_current_state(%s, %d)", formation, group);

	switch (group)
	{
		case -1:
		{
			/* all groups of the formation; join to compute health/report lag */
			sql =
				" SELECT formation_kind, nodename, nodehost, nodeport, "
				"        group_id, node_id, "
				"        current_group_state, assigned_group_state, "
				"        candidate_priority, replication_quorum, "
				"        reported_tli, reported_lsn, health, nodecluster, "
				"        healthlag, reportlag"
				"   FROM pgautofailover.current_state($1) cs "
				"   JOIN ("
				"          select nodeid, "
				"                 extract(epoch from now() - healthchecktime), "
				"                 extract(epoch from now() - reporttime) "
				"            from pgautofailover.node "
				"        ) as n(nodeid, healthlag, reportlag)"
				"     on n.nodeid = cs.node_id "
				"ORDER BY group_id, node_id";

			paramCount = 1;
			paramTypes[0] = TEXTOID;
			paramValues[0] = formation;

			break;
		}

		default:
		{
			/* a single group of the formation */
			sql =
				" SELECT formation_kind, nodename, nodehost, nodeport, "
				"        group_id, node_id, "
				"        current_group_state, assigned_group_state, "
				"        candidate_priority, replication_quorum, "
				"        reported_tli, reported_lsn, health, nodecluster, "
				"        healthlag, reportlag"
				"   FROM pgautofailover.current_state($1, $2) cs "
				"   JOIN ("
				"          select nodeid, "
				"                 extract(epoch from now() - healthchecktime), "
				"                 extract(epoch from now() - reporttime) "
				"            from pgautofailover.node "
				"        ) as n(nodeid, healthlag, reportlag)"
				"     on n.nodeid = cs.node_id "
				"ORDER BY group_id, node_id";

			groupStr = intToString(group);

			paramCount = 2;
			paramTypes[0] = TEXTOID;
			paramValues[0] = formation;
			paramTypes[1] = INT4OID;
			paramValues[1] = groupStr.strValue;

			break;
		}
	}

	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   &context, &getCurrentState))
	{
		log_error("Failed to retrieve current state from the monitor");
		return false;
	}

	if (!context.parsedOK)
	{
		log_error("Failed to parse current state from the monitor");
		return false;
	}

	return true;
}


/*
 * parseCurrentNodeState parses the 16 columns returned by the API endpoint
 * pgautofailover.current_state (as joined by monitor_get_current_state).
 */
static bool
parseCurrentNodeState(PGresult *result, int rowNumber,
					  CurrentNodeState *nodeState)
{
	int colNumber = 0;
	int errors = 0;

	/* we don't expect any of the columns to be NULL */
	for (colNumber = 0; colNumber < 16; colNumber++)
	{
		/* fix: the original always tested column 0 instead of colNumber */
		if (PQgetisnull(result, rowNumber, colNumber))
		{
			log_error("column %d in row %d returned by the monitor is NULL",
					  colNumber, rowNumber);
			return false;
		}
	}

	/*
	 *  0 - OUT formation_kind text,
	 *  1 - OUT nodename text,
	 *  2 - OUT nodehost text,
	 *  3 - OUT nodeport int,
	 *  4 - OUT group_id int,
	 *  5 - OUT node_id bigint,
	 *  6 - OUT current_group_state pgautofailover.replication_state,
	 *  7 - OUT assigned_group_state pgautofailover.replication_state,
	 *  8 - OUT candidate_priority int,
	 *  9 - OUT replication_quorum bool,
	 * 10 - OUT reported_tli int,
	 * 11 - OUT reported_lsn pg_lsn,
	 * 12 - OUT health integer
	 * 13 - OUT nodecluster text
	 * 14 - healthlag int (extract epoch from interval)
	 * 15 - reportlag int (extract epoch from interval)
	 *
	 * We need the groupId to parse the formation kind into a nodeKind, so we
	 * begin at column 1 and get back to column 0 later, after column 4.
	 */
	char *value = PQgetvalue(result, rowNumber, 1);
	int length = strlcpy(nodeState->node.name, value, _POSIX_HOST_NAME_MAX);

	if (length >= _POSIX_HOST_NAME_MAX)
	{
		log_error("Node name \"%s\" returned by monitor is %d characters, "
				  "the maximum supported by pg_autoctl is %d",
				  value, length, _POSIX_HOST_NAME_MAX - 1);
		++errors;
	}

	value = PQgetvalue(result, rowNumber, 2);
	length = strlcpy(nodeState->node.host, value, _POSIX_HOST_NAME_MAX);

	if (length >= _POSIX_HOST_NAME_MAX)
	{
		log_error("Hostname \"%s\" returned by monitor is %d characters, "
				  "the maximum supported by pg_autoctl is %d",
				  value, length, _POSIX_HOST_NAME_MAX - 1);
		++errors;
	}

	value = PQgetvalue(result, rowNumber, 3);

	if (!stringToInt(value, &(nodeState->node.port)) ||
		nodeState->node.port == 0)
	{
		log_error("Invalid port number \"%s\" returned by monitor", value);
		++errors;
	}

	value = PQgetvalue(result, rowNumber, 4);

	if (!stringToInt(value, &(nodeState->groupId)))
	{
		log_error("Invalid groupId \"%s\" returned by monitor", value);
		++errors;
	}

	/* we need the groupId to parse the formation kind into a nodeKind */
	value = PQgetvalue(result, rowNumber, 0);

	if (strcmp(value, "pgsql") == 0 && nodeState->groupId == 0)
	{
		nodeState->pgKind = NODE_KIND_STANDALONE;
	}
	else if (strcmp(value, "citus") == 0 && nodeState->groupId == 0)
	{
		nodeState->pgKind = NODE_KIND_CITUS_COORDINATOR;
	}
	else if (strcmp(value, "citus") == 0 && nodeState->groupId > 0)
	{
		nodeState->pgKind = NODE_KIND_CITUS_WORKER;
	}
	else
	{
		log_error("Invalid groupId %d with formation kind \"%s\"",
				  nodeState->groupId, value);
		++errors;
	}

	value = PQgetvalue(result, rowNumber, 5);

	if (!stringToInt64(value, &(nodeState->node.nodeId)))
	{
		log_error("Invalid nodeId \"%s\" returned by monitor", value);
		++errors;
	}

	value = PQgetvalue(result, rowNumber, 6);
	nodeState->reportedState = NodeStateFromString(value);

	if (nodeState->reportedState == NO_STATE)
	{
		log_error("Invalid node state \"%s\" returned by monitor", value);
		++errors;
	}

	value = PQgetvalue(result, rowNumber, 7);
	nodeState->goalState = NodeStateFromString(value);

	if (nodeState->goalState == NO_STATE)
	{
		log_error("Invalid node state \"%s\" returned by monitor", value);
		++errors;
	}

	value = PQgetvalue(result, rowNumber, 8);

	if (!stringToInt(value, &(nodeState->candidatePriority)))
	{
		log_error("Invalid failover candidate priority \"%s\" "
				  "returned by monitor", value);
		++errors;
	}

	value = PQgetvalue(result, rowNumber, 9);

	if (value == NULL || ((*value != 't') && (*value != 'f')))
	{
		log_error("Invalid replication quorum \"%s\" "
				  "returned by monitor", value);
		++errors;
	}
	else
	{
		nodeState->replicationQuorum = (*value) == 't';
	}

	value = PQgetvalue(result, rowNumber, 10);

	if (!stringToInt(value, &(nodeState->node.tli)))
	{
		log_error("Invalid timeline \"%s\" returned by monitor", value);
		++errors;
	}

	/* we trust Postgres pg_lsn data type to fit in our PG_LSN_MAXLENGTH */
	value = PQgetvalue(result, rowNumber, 11);
	strlcpy(nodeState->node.lsn, value, PG_LSN_MAXLENGTH);

	value = PQgetvalue(result, rowNumber, 12);

	if (!stringToInt(value, &(nodeState->health)))
	{
		log_error("Invalid node health \"%s\" returned by monitor", value);
		++errors;
	}

	value = PQgetvalue(result, rowNumber, 13);
	length = strlcpy(nodeState->citusClusterName, value, NAMEDATALEN);

	if (length >= NAMEDATALEN)
	{
		log_error("Cluster name \"%s\" returned by monitor is %d characters, "
				  "the maximum supported by pg_autoctl is %d",
				  value, length, NAMEDATALEN - 1);
		++errors;
	}

	value = PQgetvalue(result, rowNumber, 14);

	if (!stringToDouble(value, &(nodeState->healthLag)))
	{
		log_error("Invalid health lag \"%s\" returned by monitor", value);
		++errors;
	}

	value = PQgetvalue(result, rowNumber, 15);

	if (!stringToDouble(value, &(nodeState->reportLag)))
	{
		log_error("Invalid report lag \"%s\" returned by monitor", value);
		++errors;
	}

	return errors == 0;
}


/*
 * parseCurrentNodeStateArray parses an array of up to NODE_ARRAY_MAX_COUNT
 * nodeStates, one entry per node in a given formation.
*/
static bool
parseCurrentNodeStateArray(CurrentNodeStateArray *nodesArray, PGresult *result)
{
	bool parsedOk = true;
	int rowNumber = 0;

	log_trace("parseCurrentNodeStateArray: %d", PQntuples(result));

	/* keep a NULL entry to mark the end of the array */
	if (PQntuples(result) > NODE_ARRAY_MAX_COUNT)
	{
		log_error("Query returned %d rows, pg_auto_failover supports only up "
				  "to %d standby nodes at the moment",
				  PQntuples(result), NODE_ARRAY_MAX_COUNT);
		return false;
	}

	/* the current_state query (with its lag join) returns 16 columns */
	if (PQnfields(result) != 16)
	{
		log_error("Query returned %d columns, expected 16", PQnfields(result));
		return false;
	}

	nodesArray->count = PQntuples(result);

	for (rowNumber = 0; rowNumber < PQntuples(result); rowNumber++)
	{
		CurrentNodeState *nodeState = &(nodesArray->nodes[rowNumber]);

		parsedOk = parsedOk &&
				   parseCurrentNodeState(result, rowNumber, nodeState);
	}

	return parsedOk;
}


/*
 * getCurrentState loops over pgautofailover.current_state() results and adds
 * them to the context's nodes array.
 */
static void
getCurrentState(void *ctx, PGresult *result)
{
	CurrentNodeStateContext *context = (CurrentNodeStateContext *) ctx;
	CurrentNodeStateArray *nodesArray = context->nodesArray;

	if (!parseCurrentNodeStateArray(nodesArray, result))
	{
		/* errors have already been logged */
		context->parsedOK = false;
		return;
	}

	context->parsedOK = true;
}


/*
 * monitor_print_state_as_json prints to given stream a single string that
 * contains the JSON representation of the current state on the monitor.
*/ bool monitor_print_state_as_json(Monitor *monitor, char *formation, int group) { SingleValueResultContext context = { 0 }; PGSQL *pgsql = &monitor->pgsql; char *sql = NULL; int paramCount = 0; Oid paramTypes[2]; const char *paramValues[2]; IntString groupStr; log_trace("monitor_get_state_as_json(%s, %d)", formation, group); context.resultType = PGSQL_RESULT_STRING; context.parsedOk = false; switch (group) { case -1: { sql = "SELECT jsonb_pretty(" "coalesce(jsonb_agg(row_to_json(state)), '[]'))" " FROM pgautofailover.current_state($1) as state"; paramCount = 1; paramTypes[0] = TEXTOID; paramValues[0] = formation; break; } default: { sql = "SELECT jsonb_pretty(" "coalesce(jsonb_agg(row_to_json(state)), '[]'))" "FROM pgautofailover.current_state($1,$2) as state"; groupStr = intToString(group); paramCount = 2; paramTypes[0] = TEXTOID; paramValues[0] = formation; paramTypes[1] = INT4OID; paramValues[1] = groupStr.strValue; break; } } if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &context, &parseSingleValueResult)) { log_error("Failed to retrieve current state from the monitor"); return false; } if (!context.parsedOk) { log_error("Failed to parse current state from the monitor"); log_error("%s", context.strVal); if (context.strVal) { free(context.strVal); } return false; } fformat(stdout, "%s\n", context.strVal); free(context.strVal); return true; } /* * monitor_print_last_events calls the function pgautofailover.last_events on * the monitor, and prints a line of output per event obtained. 
 */
bool
monitor_print_last_events(Monitor *monitor, char *formation, int group, int count)
{
	MonitorAssignedStateParseContext context = { 0 };
	PGSQL *pgsql = &monitor->pgsql;
	char *sql = NULL;
	int paramCount = 0;
	Oid paramTypes[3];
	const char *paramValues[3];
	IntString countStr;
	IntString groupStr;

	log_trace("monitor_print_last_events(%s, %d, %d)", formation, group, count);

	/* group == -1 means "all groups in the formation" */
	switch (group)
	{
		case -1:
		{
			sql =
				"SELECT eventTime, nodeid, groupid, "
				" reportedstate, goalState, description "
				" FROM pgautofailover.last_events($1, count => $2)";

			countStr = intToString(count);

			paramCount = 2;
			paramTypes[0] = TEXTOID;
			paramValues[0] = formation;
			paramTypes[1] = INT4OID;
			paramValues[1] = countStr.strValue;

			break;
		}

		default:
		{
			sql =
				"SELECT eventTime, nodeid, groupid, "
				" reportedstate, goalState, description "
				" FROM pgautofailover.last_events($1,$2,$3)";

			countStr = intToString(count);
			groupStr = intToString(group);

			paramCount = 3;
			paramTypes[0] = TEXTOID;
			paramValues[0] = formation;
			paramTypes[1] = INT4OID;
			paramValues[1] = groupStr.strValue;
			paramTypes[2] = INT4OID;
			paramValues[2] = countStr.strValue;

			break;
		}
	}

	/* printLastEvents writes the result as a text table to stdout */
	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   &context, &printLastEvents))
	{
		log_error("Failed to retrieve last events from the monitor");
		return false;
	}

	if (!context.parsedOK)
	{
		return false;
	}

	return true;
}


/*
 * monitor_print_last_events_as_json calls the function
 * pgautofailover.last_events on the monitor, and prints the result as a JSON
 * array to the given stream (stdout, typically).
*/ bool monitor_print_last_events_as_json(Monitor *monitor, char *formation, int group, int count, FILE *stream) { SingleValueResultContext context = { { 0 }, PGSQL_RESULT_STRING, false }; PGSQL *pgsql = &monitor->pgsql; char *sql = NULL; int paramCount = 0; Oid paramTypes[3]; const char *paramValues[3]; IntString countStr; IntString groupStr; switch (group) { case -1: { sql = "SELECT jsonb_pretty(" "coalesce(jsonb_agg(row_to_json(event)), '[]'))" " FROM pgautofailover.last_events($1, count => $2) as event"; countStr = intToString(count); paramCount = 2; paramTypes[0] = TEXTOID; paramValues[0] = formation; paramTypes[1] = INT4OID; paramValues[1] = countStr.strValue; break; } default: { sql = "SELECT jsonb_pretty(" "coalesce(jsonb_agg(row_to_json(event)), '[]'))" " FROM * FROM pgautofailover.last_events($1,$2,$3) as event"; countStr = intToString(count); groupStr = intToString(group); paramCount = 3; paramTypes[0] = TEXTOID; paramValues[0] = formation; paramTypes[1] = INT4OID; paramValues[1] = groupStr.strValue; paramTypes[2] = INT4OID; paramValues[2] = countStr.strValue; break; } } if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &context, &parseSingleValueResult)) { log_error("Failed to retrieve the last %d events from the monitor", count); return false; } if (!context.parsedOk) { log_error("Failed to parse %d last events from the monitor", count); log_error("%s", context.strVal); if (context.strVal) { free(context.strVal); } return false; } fformat(stream, "%s\n", context.strVal); free(context.strVal); return true; } /* * printLastEcvents loops over pgautofailover.last_events() results and prints * them, one per line. 
 */
static void
printLastEvents(void *ctx, PGresult *result)
{
	MonitorAssignedStateParseContext *context =
		(MonitorAssignedStateParseContext *) ctx;
	int currentTupleIndex = 0;
	int nTuples = PQntuples(result);

	log_trace("printLastEvents: %d tuples", nTuples);

	/* the query selects exactly 6 columns, see monitor_print_last_events */
	if (PQnfields(result) != 6)
	{
		log_error("Query returned %d columns, expected 6", PQnfields(result));
		context->parsedOK = false;
		return;
	}

	/* print the table header, then a separator line */
	fformat(stdout, "%30s | %6s | %19s | %19s | %s\n",
			"Event Time", "Node", "Current State", "Assigned State",
			"Comment");
	fformat(stdout, "%30s-+-%6s-+-%19s-+-%19s-+-%10s\n",
			"------------------------------", "------",
			"-------------------", "-------------------", "----------");

	for (currentTupleIndex = 0; currentTupleIndex < nTuples;
		 currentTupleIndex++)
	{
		char *eventTime = PQgetvalue(result, currentTupleIndex, 0);
		char *nodeId = PQgetvalue(result, currentTupleIndex, 1);
		char *groupId = PQgetvalue(result, currentTupleIndex, 2);
		char *currentState = PQgetvalue(result, currentTupleIndex, 3);
		char *goalState = PQgetvalue(result, currentTupleIndex, 4);
		char *description = PQgetvalue(result, currentTupleIndex, 5);
		char node[BUFSIZE];

		/* for our grid alignment output it's best to have a single col here */
		sformat(node, BUFSIZE, "%s/%s", groupId, nodeId);

		fformat(stdout, "%30s | %6s | %19s | %19s | %s\n",
				eventTime, node, currentState, goalState, description);
	}

	fformat(stdout, "\n");

	context->parsedOK = true;
}


/*
 * monitor_get_last_events calls the function pgautofailover.last_events on
 * the monitor, and fills-in the given array of MonitorEvents.
*/ bool monitor_get_last_events(Monitor *monitor, char *formation, int group, int count, MonitorEventsArray *monitorEventsArray) { MonitorEventsArrayParseContext context = { { 0 }, monitorEventsArray, false }; PGSQL *pgsql = &monitor->pgsql; char *sql = NULL; int paramCount = 0; Oid paramTypes[3]; const char *paramValues[3]; IntString countStr; IntString groupStr; log_trace("monitor_print_last_events(%s, %d, %d)", formation, group, count); switch (group) { case -1: { sql = "SELECT eventId, to_char(eventTime, 'YYYY-MM-DD HH24:MI:SS'), " " formationId, nodeid, groupid, " " nodename, nodehost, nodeport, " " reportedstate, goalState, " " reportedrepstate, reportedtli, reportedlsn, " " candidatepriority, replicationquorum, " " description " " FROM pgautofailover.last_events($1, count => $2)"; countStr = intToString(count); paramCount = 2; paramTypes[0] = TEXTOID; paramValues[0] = formation; paramTypes[1] = INT4OID; paramValues[1] = countStr.strValue; break; } default: { sql = "SELECT eventId, to_char(eventTime, 'YYYY-MM-DD HH24:MI:SS'), " " formationId, nodeid, groupid, " " reportedstate, goalState, " " reportedrepstate, reportedtli, reportedlsn, " " candidatepriority, replicationquorum, " " description " " FROM pgautofailover.last_events($1,$2,$3)"; countStr = intToString(count); groupStr = intToString(group); paramCount = 3; paramTypes[0] = TEXTOID; paramValues[0] = formation; paramTypes[1] = INT4OID; paramValues[1] = groupStr.strValue; paramTypes[2] = INT4OID; paramValues[2] = countStr.strValue; break; } } if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &context, &getLastEvents)) { log_error("Failed to retrieve last events from the monitor"); return false; } if (!context.parsedOK) { log_error("Failed to parse last events from the monitor, " "see above for details"); return false; } return true; } /* * getLastEvents loops over pgautofailover.last_events() results and fills in * the given MonitorEventsArray. 
*/ static void getLastEvents(void *ctx, PGresult *result) { MonitorEventsArrayParseContext *context = (MonitorEventsArrayParseContext *) ctx; MonitorEventsArray *eventsArray = context->eventsArray; int currentTupleIndex = 0; int nTuples = PQntuples(result); int errors = 0; log_trace("getLastEvents: %d tuples", nTuples); if (nTuples > EVENTS_ARRAY_MAX_COUNT) { log_error("Query returned %d rows, pg_auto_failover supports only up " "to %d events at the moment", PQntuples(result), EVENTS_ARRAY_MAX_COUNT); context->parsedOK = false; return; } if (PQnfields(result) != 16) { log_error("Query returned %d columns, expected 16", PQnfields(result)); context->parsedOK = false; return; } eventsArray->count = nTuples; for (currentTupleIndex = 0; currentTupleIndex < nTuples; currentTupleIndex++) { MonitorEvent *event = &(eventsArray->events[currentTupleIndex]); char *value = PQgetvalue(result, currentTupleIndex, 0); /* eventId */ if (!stringToInt64(value, &(event->eventId))) { log_error("Invalid event ID \"%s\" returned by monitor", value); ++errors; } /* eventTime */ value = PQgetvalue(result, currentTupleIndex, 1); strlcpy(event->eventTime, value, sizeof(event->eventTime)); /* formationId */ value = PQgetvalue(result, currentTupleIndex, 2); strlcpy(event->formationId, value, sizeof(event->formationId)); /* nodeId */ value = PQgetvalue(result, currentTupleIndex, 3); if (!stringToInt64(value, &(event->nodeId))) { log_error("Invalid node ID \"%s\" returned by monitor", value); ++errors; } /* groupId */ value = PQgetvalue(result, currentTupleIndex, 4); if (!stringToInt(value, &(event->groupId))) { log_error("Invalid group ID \"%s\" returned by monitor", value); ++errors; } /* nodeName */ value = PQgetvalue(result, currentTupleIndex, 5); strlcpy(event->nodeName, value, sizeof(event->nodeName)); /* nodeHost */ value = PQgetvalue(result, currentTupleIndex, 6); strlcpy(event->nodeHost, value, sizeof(event->nodeHost)); /* nodePort */ value = PQgetvalue(result, currentTupleIndex, 7); if 
(!stringToInt(value, &(event->nodePort))) { log_error("Invalid group ID \"%s\" returned by monitor", value); ++errors; } /* reportedState */ value = PQgetvalue(result, currentTupleIndex, 8); event->reportedState = NodeStateFromString(value); if (event->reportedState == NO_STATE) { log_error("Invalid node state \"%s\" returned by monitor", value); ++errors; } /* assignedState */ value = PQgetvalue(result, currentTupleIndex, 9); event->assignedState = NodeStateFromString(value); if (event->assignedState == NO_STATE) { log_error("Invalid node state \"%s\" returned by monitor", value); ++errors; } /* repolicationState */ value = PQgetvalue(result, currentTupleIndex, 10); strlcpy(event->replicationState, value, sizeof(event->replicationState)); /* timeline */ value = PQgetvalue(result, currentTupleIndex, 11); if (!stringToInt(value, &(event->timeline))) { log_error("Invalid timeline \"%s\" returned by monitor", value); ++errors; } /* LSN */ value = PQgetvalue(result, currentTupleIndex, 12); strlcpy(event->lsn, value, PG_LSN_MAXLENGTH); /* candidatePriority */ value = PQgetvalue(result, currentTupleIndex, 13); if (!stringToInt(value, &(event->candidatePriority))) { log_error("Invalid candidate priority \"%s\" returned by monitor", value); ++errors; } /* replicationQuorum */ value = PQgetvalue(result, currentTupleIndex, 14); event->replicationQuorum = strcmp(value, "t") == 0; /* description */ value = PQgetvalue(result, currentTupleIndex, 15); strlcpy(event->description, value, sizeof(event->description)); if (errors > 0) { context->parsedOK = false; return; } } context->parsedOK = true; } /* * monitor_create_formation calls the SQL API on the monitor to create a new * formation of the given kind. 
 */
bool
monitor_create_formation(Monitor *monitor,
						 char *formation, char *kind, char *dbname,
						 bool hasSecondary, int numberSyncStandbys)
{
	PGSQL *pgsql = &monitor->pgsql;
	const char *sql =
		"SELECT * FROM pgautofailover.create_formation($1, $2, $3, $4, $5)";
	int paramCount = 5;
	Oid paramTypes[5] = { TEXTOID, TEXTOID, TEXTOID, BOOLOID, INT4OID };
	const char *paramValues[5];

	paramValues[0] = formation;
	paramValues[1] = kind;
	paramValues[2] = dbname;
	paramValues[3] = hasSecondary ? "true" : "false";

	/*
	 * NOTE(review): intToString returns a struct by value; taking .strValue
	 * of the temporary yields a pointer that is only valid until the end of
	 * this full expression in strict C terms — presumably harmless here as
	 * this pattern is used throughout the file, but worth confirming.
	 */
	paramValues[4] = intToString(numberSyncStandbys).strValue;

	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   NULL, NULL))
	{
		log_error("Failed to create formation \"%s\" of kind \"%s\", "
				  "see previous lines for details.",
				  formation, kind);
		return false;
	}

	return true;
}


/*
 * monitor_enable_secondary_for_formation enables secondaries for the given
 * formation
 */
bool
monitor_enable_secondary_for_formation(Monitor *monitor, const char *formation)
{
	PGSQL *pgsql = &monitor->pgsql;
	const char *sql = "SELECT * FROM pgautofailover.enable_secondary($1)";
	int paramCount = 1;
	Oid paramTypes[1] = { TEXTOID };
	const char *paramValues[1];

	paramValues[0] = formation;

	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   NULL, NULL))
	{
		log_error("Failed to enable secondaries on formation \"%s\", "
				  "see previous lines for details.",
				  formation);
		return false;
	}

	/* disconnect from PostgreSQL now */
	pgsql_finish(&monitor->pgsql);

	return true;
}


/*
 * monitor_disable_secondary_for_formation disables secondaries for the given
 * formation. This requires no secondaries to be currently in the formation,
 * function will report an error on the monitor due to an execution error of
 * pgautofailover.disable_secondary when there are still secondaries in the
 * cluster, or more precise nodes that are not in 'single' state.
*/ bool monitor_disable_secondary_for_formation(Monitor *monitor, const char *formation) { PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT * FROM pgautofailover.disable_secondary($1)"; int paramCount = 1; Oid paramTypes[1] = { TEXTOID }; const char *paramValues[1]; paramValues[0] = formation; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, NULL, NULL)) { log_error("Failed to disable secondaries on formation \"%s\", " "see previous lines for details.", formation); return false; } return true; } /* * monitor_drop_formation calls the SQL API on the monitor to drop formation. */ bool monitor_drop_formation(Monitor *monitor, char *formation) { PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT * FROM pgautofailover.drop_formation($1)"; int paramCount = 1; Oid paramTypes[1] = { TEXTOID }; const char *paramValues[1]; paramValues[0] = formation; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, NULL, NULL)) { log_error("Failed to drop formation \"%s\", " "see previous lines for details.", formation); return false; } return true; } /* * monitor_formation_uri calls the SQL API on the monitor that returns the * connection string that can be used by applications to connect to the * formation. 
 */
bool
monitor_formation_uri(Monitor *monitor,
					  const char *formation,
					  const char *citusClusterName,
					  const SSLOptions *ssl,
					  char *connectionString, size_t size)
{
	SingleValueResultContext context = { { 0 }, PGSQL_RESULT_STRING, false };
	PGSQL *pgsql = &monitor->pgsql;
	const char *sql =
		"SELECT formation_uri "
		"FROM pgautofailover.formation_uri($1, $2, $3, $4, $5)";
	int paramCount = 5;
	Oid paramTypes[5] = { TEXTOID, TEXTOID, TEXTOID, TEXTOID, TEXTOID };
	const char *paramValues[5] = { 0 };

	paramValues[0] = formation;
	paramValues[1] = citusClusterName;
	paramValues[2] = ssl->sslModeStr;
	paramValues[3] = ssl->caFile;
	paramValues[4] = ssl->crlFile;

	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   &context, &parseSingleValueResult))
	{
		log_error("Failed to list the formation uri for \"%s\", "
				  "see previous lines for details.",
				  formation);
		return false;
	}

	if (!context.parsedOk)
	{
		/* errors have already been logged */
		if (context.strVal)
		{
			free(context.strVal);
		}
		return false;
	}

	/* an empty URI means no nodes are registered in group 0 yet */
	if (context.strVal == NULL || strcmp(context.strVal, "") == 0)
	{
		log_error("Formation \"%s\" currently has no nodes in group 0",
				  formation);

		if (context.strVal)
		{
			free(context.strVal);
		}
		return false;
	}

	/* copy the URI to the caller's buffer, then release our copy */
	strlcpy(connectionString, context.strVal, size);
	free(context.strVal);

	return true;
}


/*
 * monitor_print_every_formation_uri prints a table of all our connection
 * strings: first the monitor URI itself, and then one line per formation.
 */
bool
monitor_print_every_formation_uri(Monitor *monitor, const SSLOptions *ssl)
{
	FormationURIParseContext context = { 0 };
	PGSQL *pgsql = &monitor->pgsql;

	/*
	 * Three result sets in one query: the monitor URI itself, one URI per
	 * formation (the 'default' cluster), and one URI per extra read-replica
	 * cluster found in group 0.
	 */
	const char *sql =
		"SELECT 'monitor', 'monitor', $1 "
		" UNION ALL "
		"SELECT 'formation', formationid, formation_uri "
		"  FROM pgautofailover.formation, "
		"       pgautofailover.formation_uri"
		"(formation.formationid, 'default', $2, $3, $4) "
		" UNION ALL "
		"SELECT 'read-replica', nodecluster, formation_uri "
		"  FROM pgautofailover.formation "
		"  JOIN pgautofailover.node using(formationid), "
		"       pgautofailover.formation_uri"
		"(formation.formationid, nodecluster, $2, $3, $4) "
		" WHERE node.groupid = 0 and node.nodecluster <> 'default' ";

	int paramCount = 4;
	Oid paramTypes[4] = { TEXTOID, TEXTOID, TEXTOID, TEXTOID };
	const char *paramValues[4];

	paramValues[0] = monitor->pgsql.connectionString;
	paramValues[1] = ssl->sslModeStr;
	paramValues[2] = ssl->caFile;
	paramValues[3] = ssl->crlFile;

	context.parsedOK = false;

	/* printFormationURI writes the result as a text table to stdout */
	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   &context, &printFormationURI))
	{
		log_error("Failed to list the formation uri, "
				  "see previous lines for details.");
		return false;
	}

	if (!context.parsedOK)
	{
		/* errors have already been logged */
		return false;
	}

	return true;
}


/*
 * monitor_print_every_formation_uri_as_json prints all our connection strings
 * in the JSON format: first the monitor URI itself, and then one line per
 * formation.
 */
bool
monitor_print_every_formation_uri_as_json(Monitor *monitor,
										  const SSLOptions *ssl,
										  FILE *stream)
{
	SingleValueResultContext context = { { 0 }, PGSQL_RESULT_STRING, false };
	PGSQL *pgsql = &monitor->pgsql;

	/* same query as monitor_print_every_formation_uri, aggregated as JSON */
	const char *sql =
		"WITH formation(type, name, uri) AS ( "
		"SELECT 'monitor', 'monitor', $1 "
		" UNION ALL "
		"SELECT 'formation', formationid, formation_uri "
		"  FROM pgautofailover.formation, "
		"       pgautofailover.formation_uri"
		"(formation.formationid, 'default', $2, $3, $4)"
		" UNION ALL "
		"SELECT 'read-replica', nodecluster, formation_uri "
		"  FROM pgautofailover.formation "
		"  JOIN pgautofailover.node using(formationid), "
		"       pgautofailover.formation_uri"
		"(formation.formationid, nodecluster, $2, $3, $4) "
		" WHERE node.groupid = 0 and node.nodecluster <> 'default' "
		") "
		"SELECT jsonb_pretty(jsonb_agg(row_to_json(formation))) FROM formation";

	int paramCount = 4;
	Oid paramTypes[4] = { TEXTOID, TEXTOID, TEXTOID, TEXTOID };
	const char *paramValues[4];

	paramValues[0] = monitor->pgsql.connectionString;
	paramValues[1] = ssl->sslModeStr;
	paramValues[2] = ssl->caFile;
	paramValues[3] = ssl->crlFile;

	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   &context, &parseSingleValueResult))
	{
		log_error("Failed to list the formation uri, "
				  "see previous lines for details.");
		return false;
	}

	if (!context.parsedOk)
	{
		/* errors have already been logged */
		if (context.strVal)
		{
			free(context.strVal);
		}
		return false;
	}

	fformat(stream, "%s\n", context.strVal);

	/* context.strVal is allocated by the result parser; we own it now */
	free(context.strVal);

	return true;
}


/*
 * printFormationURI loops over the results of the SQL query in
 * monitor_print_every_formation_uri and outputs the result in table like
 * format.
 */
static void
printFormationURI(void *ctx, PGresult *result)
{
	FormationURIParseContext *context = (FormationURIParseContext *) ctx;
	int currentTupleIndex = 0;
	int nTuples = PQntuples(result);

	int maxFormationNameSize = 7;   /* strlen("monitor") */
	char formationNameSeparator[BUFSIZE] = { 0 };

	log_trace("printFormationURI: %d tuples", nTuples);

	if (PQnfields(result) != 3)
	{
		log_error("Query returned %d columns, expected 3", PQnfields(result));
		context->parsedOK = false;
		return;
	}

	/*
	 * Dynamically adjust our display output to the length of the longest
	 * formation name (column 1) in the result set
	 */
	for (currentTupleIndex = 0; currentTupleIndex < nTuples;
		 currentTupleIndex++)
	{
		int size = strlen(PQgetvalue(result, currentTupleIndex, 1));

		if (size > maxFormationNameSize)
		{
			maxFormationNameSize = size;
		}
	}

	/* create the visual separator for the formation name too */
	(void) prepareHostNameSeparator(formationNameSeparator,
									maxFormationNameSize);

	fformat(stdout, "%12s | %*s | %s\n",
			"Type", maxFormationNameSize, "Name", "Connection String");
	fformat(stdout, "%12s-+-%*s-+-%s\n",
			"------------", maxFormationNameSize, formationNameSeparator,
			"------------------------------");

	for (currentTupleIndex = 0; currentTupleIndex < nTuples;
		 currentTupleIndex++)
	{
		char *type = PQgetvalue(result, currentTupleIndex, 0);
		char *name = PQgetvalue(result, currentTupleIndex, 1);
		char *URI = PQgetvalue(result, currentTupleIndex, 2);

		fformat(stdout, "%12s | %*s | %s\n",
				type, maxFormationNameSize, name, URI);
	}
	fformat(stdout, "\n");

	context->parsedOK = true;
}


/*
 * monitor_count_failover_candidates counts how many nodes in a given group are
 * not currently in a primary state, and have candidatePriority > 0.
*/ bool monitor_count_failover_candidates(Monitor *monitor, char *formation, int groupId, int *failoverCandidateCount) { SingleValueResultContext context = { { 0 }, PGSQL_RESULT_INT, false }; PGSQL *pgsql = &monitor->pgsql; char *sql = "select count(node.candidatepriority) " " filter(where node.candidatepriority > 0) " " as failover_candidate_count " " from pgautofailover.get_nodes($1, $2) as gn " " join pgautofailover.node " " on node.nodeid = gn.node_id " " where not node_is_primary"; int paramCount = 2; Oid paramTypes[2] = { TEXTOID, INT4OID }; const char *paramValues[2] = { 0 }; IntString myGroupIdString = intToString(groupId); paramValues[0] = formation; paramValues[1] = myGroupIdString.strValue; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &context, &parseSingleValueResult)) { log_error("Failed to retrieve formation settings from the monitor"); return false; } if (!context.parsedOk) { log_error("Failed to parse query result from the monitor."); return false; } *failoverCandidateCount = context.intVal; return true; } /* * monitor_print_formation_settings calls the function * pgautofailover.formation_settings on the monitor, and prints a line of * output per state record obtained. 
 */
bool
monitor_print_formation_settings(Monitor *monitor, char *formation)
{
	MonitorAssignedStateParseContext context = { 0 };
	PGSQL *pgsql = &monitor->pgsql;

	/* order settings: formation first, then primary, then per-node ones */
	char *sql =
		"select context, group_id, node_id, nodename, setting, value "
		" from pgautofailover.formation_settings($1)"
		" order by case context when 'formation' then 0 "
		" when 'primary' then 1 "
		" when 'node' then 2 else 3 end, "
		" setting, group_id, node_id";
	int paramCount = 1;
	Oid paramTypes[1] = { TEXTOID };
	const char *paramValues[1] = { formation };

	/* printFormationSettings writes the result as a text table to stdout */
	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   &context, &printFormationSettings))
	{
		log_error("Failed to retrieve formation settings from the monitor");
		return false;
	}

	if (!context.parsedOK)
	{
		log_error("Failed to parse formation settings from the monitor "
				  "for formation \"%s\"", formation);
		return false;
	}

	return true;
}


/*
 * printFormationSettings loops over pgautofailover.formation_settings()
 * results and prints them, one per line.
 */
static void
printFormationSettings(void *ctx, PGresult *result)
{
	MonitorAssignedStateParseContext *context =
		(MonitorAssignedStateParseContext *) ctx;
	int index = 0;
	int nTuples = PQntuples(result);

	/* column widths default to their header widths */
	int maxNameSize = 4;        /* strlen("Name") */
	int maxSettingSize = 7;     /* strlen("Setting") */
	int maxValueSize = 5;       /* strlen("Value") */

	char nameSeparatorHeader[BUFSIZE] = { 0 };
	char settingSeparatorHeader[BUFSIZE] = { 0 };
	char valueSeparatorHeader[BUFSIZE] = { 0 };

	if (nTuples == 0)
	{
		log_debug("Query returned 0 rows");
		context->parsedOK = false;
		return;
	}

	if (PQnfields(result) != 6)
	{
		log_error("Query returned %d columns, expected 6", PQnfields(result));
		context->parsedOK = false;
		return;
	}

	/* first pass: compute per-column display widths */
	for (index = 0; index < nTuples; index++)
	{
		char *nodename = PQgetvalue(result, index, 3);
		char *setting = PQgetvalue(result, index, 4);
		char *value = PQgetvalue(result, index, 5);

		if (strlen(nodename) > maxNameSize)
		{
			maxNameSize = strlen(nodename);
		}

		if (strlen(setting) > maxSettingSize)
		{
			maxSettingSize = strlen(setting);
		}

		if (strlen(value) > maxValueSize)
		{
			maxValueSize = strlen(value);
		}
	}

	(void) prepareHostNameSeparator(nameSeparatorHeader, maxNameSize);
	(void) prepareHostNameSeparator(settingSeparatorHeader, maxSettingSize);
	(void) prepareHostNameSeparator(valueSeparatorHeader, maxValueSize);

	fformat(stdout, "%9s | %*s | %*s | %-*s\n",
			"Context", maxNameSize, "Name",
			maxSettingSize, "Setting", maxValueSize, "Value");

	fformat(stdout, "%9s-+-%*s-+-%*s-+-%*s\n",
			"---------",
			maxNameSize, nameSeparatorHeader,
			maxSettingSize, settingSeparatorHeader,
			maxValueSize, valueSeparatorHeader);

	/* second pass: print the rows */
	for (index = 0; index < nTuples; index++)
	{
		/* this local shadows the outer ctx pointer within the loop body */
		char *context = PQgetvalue(result, index, 0);

		/* not used at the moment
		 * char *group_id = PQgetvalue(result, index, 1);
		 * char *node_id = PQgetvalue(result, index, 2);
		 */
		char *nodename = PQgetvalue(result, index, 3);
		char *setting = PQgetvalue(result, index, 4);
		char *value = PQgetvalue(result, index, 5);

		fformat(stdout, "%9s | %*s | %*s | %-*s\n",
				context, maxNameSize, nodename,
				maxSettingSize, setting, maxValueSize, value);
	}

	fformat(stdout, "\n");

	context->parsedOK = true;
}


/*
 * monitor_print_formation_settings_as_json calls the function
 * pgautofailover.formation_settings on the monitor, and prints the settings
 * as a single pretty-printed JSON document.
 */
bool
monitor_print_formation_settings_as_json(Monitor *monitor, char *formation)
{
	SingleValueResultContext context = { { 0 }, PGSQL_RESULT_STRING, false };
	PGSQL *pgsql = &monitor->pgsql;

	/* build one JSON object with formation, primary, and nodes sections */
	char *sql =
		"with settings as "
		" ( "
		" select * "
		" from pgautofailover.formation_settings($1) "
		" ), "
		" f(json) as "
		" ( "
		" select jsonb_agg(row_to_json(settings)) "
		" from settings "
		" where context = 'formation' "
		" ), "
		" p(json) as "
		" ( "
		" select jsonb_agg(row_to_json(settings)) "
		" from settings "
		" where context = 'primary' "
		" ), "
		" n(json) as "
		" ( "
		" select jsonb_agg(row_to_json(settings)) "
		" from settings "
		" where context = 'node' "
		" ) "
		"select jsonb_pretty(jsonb_build_object("
		"'formation', f.json, 'primary', p.json, 'nodes', n.json)) "
		" from f, p, n";

	int paramCount = 1;
	Oid paramTypes[1] = { TEXTOID };
	const char *paramValues[1] = { formation };

	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   &context, &parseSingleValueResult))
	{
		log_error("Failed to retrieve current state from the monitor");
		return false;
	}

	if (!context.parsedOk)
	{
		log_error("Failed to parse formation settings from the monitor "
				  "for formation \"%s\"", formation);

		if (context.strVal)
		{
			free(context.strVal);
		}
		return false;
	}

	fformat(stdout, "%s\n", context.strVal);

	/* context.strVal is allocated by the result parser; we own it now */
	free(context.strVal);

	return true;
}


/*
 * monitor_synchronous_standby_names returns the value for the Postgres
 * parameter "synchronous_standby_names" to use for a given group. The setting
 * is computed on the monitor depending on the current values of the formation
 * number_sync_standbys and each node's candidate priority and replication
 * quorum properties.
 */
bool
monitor_synchronous_standby_names(Monitor *monitor,
								  char *formation, int groupId,
								  char *synchronous_standby_names, int size)
{
	PGSQL *pgsql = &monitor->pgsql;
	SingleValueResultContext context = { { 0 }, PGSQL_RESULT_STRING, false };
	const char *sql =
		"select pgautofailover.synchronous_standby_names($1, $2)";
	int paramCount = 2;
	Oid paramTypes[2] = { TEXTOID, INT4OID };
	const char *paramValues[2] = { 0 };

	IntString myGroupIdString = intToString(groupId);

	paramValues[0] = formation;
	paramValues[1] = myGroupIdString.strValue;

	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   &context, &parseSingleValueResult))
	{
		log_error("Failed to get the synchronous_standby_names setting value "
				  " from the monitor for formation %s and group %d",
				  formation, groupId);

		if (context.strVal)
		{
			free(context.strVal);
		}
		return false;
	}

	if (!context.parsedOk)
	{
		log_error("Failed to get the synchronous_standby_names setting value "
				  " from the monitor for formation %s and group %d,"
				  "see above for details",
				  formation, groupId);

		if (context.strVal)
		{
			free(context.strVal);
		}
		return false;
	}

	/* copy the value to the caller's buffer, then release our copy */
	strlcpy(synchronous_standby_names, context.strVal, size);

	free(context.strVal);

	return true;
}


/*
 * monitor_update_node_metadata updates a node's name, hostname, and port on
 * the monitor, calling the pgautofailover.update_node_metadata SQL function.
 */
bool
monitor_update_node_metadata(Monitor *monitor,
							 int64_t nodeId,
							 const char *name,
							 const char *hostname,
							 int port)
{
	PGSQL *pgsql = &monitor->pgsql;
	const char *sql =
		"SELECT * FROM pgautofailover.update_node_metadata($1, $2, $3, $4)";
	int paramCount = 4;
	Oid paramTypes[4] = { INT8OID, TEXTOID, TEXTOID, INT4OID };
	const char *paramValues[4];

	SingleValueResultContext context = { { 0 }, PGSQL_RESULT_BOOL, false };

	paramValues[0] = intToString(nodeId).strValue;
	paramValues[1] = name;
	paramValues[2] = hostname;
	paramValues[3] = intToString(port).strValue;

	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   &context, &parseSingleValueResult))
	{
		log_error("Failed to update_node_metadata of node %" PRId64
				  " from the monitor", nodeId);
		return false;
	}

	if (!context.parsedOk)
	{
		log_error(
			"Failed to set node %" PRId64
			" metadata on the monitor because it returned an unexpected result. "
			"See previous line for details.",
			nodeId);
		return false;
	}

	return true;
}


/*
 * monitor_set_node_system_identifier sets the node's sysidentifier column on
 * the monitor.
*/ bool monitor_set_node_system_identifier(Monitor *monitor, int64_t nodeId, uint64_t system_identifier) { PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT * FROM pgautofailover.set_node_system_identifier($1, $2)"; int paramCount = 2; Oid paramTypes[2] = { INT8OID, INT8OID }; const char *paramValues[2]; NodeAddress node = { 0 }; NodeAddressParseContext parseContext = { { 0 }, &node, false }; paramValues[0] = intToString(nodeId).strValue; paramValues[1] = intToString(system_identifier).strValue; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &parseContext, parseNodeResult)) { log_error("Failed to set_node_system_identifier of node %" PRId64 " from the monitor", nodeId); return false; } if (!parseContext.parsedOK) { /* *INDENT-OFF* */ log_error( "Failed to set node %" PRId64" sysidentifier to \"%" PRId64 "\"" " on the monitor because it returned an unexpected result. " "See previous line for details.", nodeId, system_identifier); /* *INDENT-ON* */ return false; } return true; } /* * monitor_set_group_system_identifier sets the node's sysidentifier column on * the monitor for all nodes in the same group, when the current sysidentifier * they have is zero. That's needed after an upgrade from 1.3 to 1.4. 
 */
bool
monitor_set_group_system_identifier(Monitor *monitor,
									int groupId,
									uint64_t system_identifier)
{
	PGSQL *pgsql = &monitor->pgsql;
	const char *sql =
		"SELECT * FROM pgautofailover.set_group_system_identifier($1, $2)";
	int paramCount = 2;
	Oid paramTypes[2] = { INT8OID, INT8OID };
	const char *paramValues[2];

	SingleValueResultContext context = { 0 };

	paramValues[0] = intToString(groupId).strValue;
	paramValues[1] = intToString(system_identifier).strValue;

	/* fetchedRows fills-in context.intVal with the number of updated rows */
	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   &context, &fetchedRows))
	{
		log_error("Failed to set_group_system_identifier for group %d "
				  "from the monitor", groupId);
		return false;
	}

	if (!context.parsedOk)
	{
		/* *INDENT-OFF* */
		log_error(
			"Failed to set sysidentifier to \"%" PRIu64 "\" "
			"for nodes in group %d "
			"on the monitor because it returned an unexpected result. "
			"See previous line for details.",
			system_identifier, groupId);
		/* *INDENT-ON* */
		return false;
	}

	if (context.intVal > 0)
	{
		log_info("Updated system identifier of %d nodes in group %d "
				 "to the local node value \"%" PRIu64 "\"",
				 context.intVal, groupId, system_identifier);
	}

	return true;
}


/*
 * parseCoordinatorNode parses a hostname and a port from the libpq result and
 * writes it to the NodeAddressParseContext pointed to by ctx. This is about
 * the same as parseNode: the only difference is that an empty result set is
 * not an error condition in parseCoordinatorNode.
*/ static void parseCoordinatorNode(void *ctx, PGresult *result) { NodeAddressParseContext *context = (NodeAddressParseContext *) ctx; /* no rows, set the node to NULL, return */ if (PQntuples(result) == 0) { context->node = NULL; context->parsedOK = true; return; } /* we have rows: we accept only one */ if (PQntuples(result) != 1) { log_error("Query returned %d rows, expected 1", PQntuples(result)); context->parsedOK = false; return; } if (PQnfields(result) != 2) { log_error("Query returned %d columns, expected 2", PQnfields(result)); context->parsedOK = false; return; } if (PQgetisnull(result, 0, 0) || PQgetisnull(result, 0, 1)) { log_error("Hostname or port returned by monitor is NULL"); context->parsedOK = false; return; } char *value = PQgetvalue(result, 0, 0); int hostLength = strlcpy(context->node->host, value, _POSIX_HOST_NAME_MAX); if (hostLength >= _POSIX_HOST_NAME_MAX) { log_error("Hostname \"%s\" returned by monitor is %d characters, " "the maximum supported by pg_autoctl is %d", value, hostLength, _POSIX_HOST_NAME_MAX - 1); context->parsedOK = false; return; } value = PQgetvalue(result, 0, 1); if (!stringToInt(value, &context->node->port) || context->node->port == 0) { log_error("Invalid port number \"%s\" returned by monitor", value); context->parsedOK = false; } context->parsedOK = true; } /* * monitor_start_maintenance calls the pgautofailover.start_maintenance(node, * port) on the monitor, so that the monitor assigns the MAINTENANCE_STATE at * the next call to node_active(). 
*/ bool monitor_start_maintenance(Monitor *monitor, int64_t nodeId, bool *mayRetry) { SingleValueResultContext context = { { 0 }, PGSQL_RESULT_BOOL, false }; PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT pgautofailover.start_maintenance($1)"; int paramCount = 1; Oid paramTypes[1] = { INT8OID }; const char *paramValues[1]; paramValues[0] = intToString(nodeId).strValue; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &context, &parseSingleValueResult)) { if (monitor_retryable_error(context.sqlstate)) { *mayRetry = true; } else { /* when we may retry then it's up to the caller to handle errors */ log_error("Failed to start_maintenance of node %" PRId64 " from the monitor", nodeId); } return false; } if (!context.parsedOk) { log_error("Failed to start_maintenance of node %" PRId64 " from the monitor: could not parse monitor's result.", nodeId); return false; } return context.boolVal; } /* * monitor_stop_maintenance calls the pgautofailover.start_maintenance(node, * port) on the monitor, so that the monitor assigns the CATCHINGUP_STATE at * the next call to node_active(). 
*/ bool monitor_stop_maintenance(Monitor *monitor, int64_t nodeId, bool *mayRetry) { SingleValueResultContext context = { { 0 }, PGSQL_RESULT_BOOL, false }; PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT pgautofailover.stop_maintenance($1)"; int paramCount = 1; Oid paramTypes[1] = { INT8OID }; const char *paramValues[1]; paramValues[0] = intToString(nodeId).strValue; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &context, &parseSingleValueResult)) { if (monitor_retryable_error(context.sqlstate)) { *mayRetry = true; } else { /* when we may retry then it's up to the caller to handle errors */ log_error("Failed to stop_maintenance of node %" PRId64 " from the monitor", nodeId); } return false; } if (!context.parsedOk) { log_error("Failed to stop_maintenance of node %" PRId64 " from the monitor: could not parse monitor's result.", nodeId); return false; } return context.boolVal; } /* * monitor_process_notifications listens to notifications from the monitor and * calls a specific processing function for each notification received. * * We use the pselect(2) facility to check if something is ready to be read on * the PQconn socket for us. When it's the case, return the next notification * message from the "state" channel. Other channel messages are sent to the log * directly. * * When the function returns true, it's safe for the caller to sleep, otherwise * it's expected that the caller keeps polling the results to drain the queue * of notifications received from the previous calls loop. 
 */
static bool
monitor_process_notifications(Monitor *monitor,
							  int timeoutMs,
							  char *channels[],
							  void *notificationContext,
							  NotificationProcessingFunction processor)
{
	PGconn *connection = monitor->notificationClient.connection;
	PGnotify *notify;

	sigset_t sig_mask;
	sigset_t sig_mask_orig;

	/* we have milliseconds, we want seconds and nanoseconds separately */
	int seconds = timeoutMs / 1000;
	int nanosecs = 1000 * 1000 * (timeoutMs % 1000);

	struct timespec timeout = { .tv_sec = seconds, .tv_nsec = nanosecs };

	fd_set input_mask;

	/* block signals now: process them as if received during the pselect call */
	if (!block_signals(&sig_mask, &sig_mask_orig))
	{
		return false;
	}

	/*
	 * Check if we received signals just before blocking them. If that's the
	 * case we can stop now.
	 */
	if (asked_to_stop || asked_to_stop_fast || asked_to_reload || asked_to_quit)
	{
		/* restore signal masks (un block them) now */
		(void) unblock_signals(&sig_mask_orig);
		return false;
	}

	if (!pgsql_listen(&(monitor->notificationClient), channels))
	{
		/* restore signal masks (un block them) now */
		(void) unblock_signals(&sig_mask_orig);
		return false;
	}

	if (monitor->notificationClient.connection == NULL)
	{
		log_warn("Lost connection.");

		/* restore signal masks (un block them) now */
		(void) unblock_signals(&sig_mask_orig);
		return false;
	}

	/*
	 * It looks like we are violating modularity of the code, when we are
	 * following Postgres documentation and examples:
	 *
	 * https://www.postgresql.org/docs/current/libpq-example.html#LIBPQ-EXAMPLE-2
	 */
	int sock = PQsocket(monitor->notificationClient.connection);

	if (sock < 0)
	{
		/* restore signal masks (un block them) now */
		(void) unblock_signals(&sig_mask_orig);
		return false;           /* shouldn't happen */
	}

	FD_ZERO(&input_mask);
	FD_SET(sock, &input_mask);

	/*
	 * pselect() atomically swaps in the original (unblocked) mask while
	 * waiting, so pending signals interrupt the wait instead of being missed.
	 */
	int ret = pselect(sock + 1, &input_mask, NULL, NULL, &timeout,
					  &sig_mask_orig);

	/* restore signal masks (un block them) now that pselect() is done */
	(void) unblock_signals(&sig_mask_orig);

	if (ret < 0)
	{
		/* it might be interrupted by a signal we know how to handle */
		if (errno == EINTR)
		{
			return true;
		}
		else
		{
			log_warn("Failed to get monitor notifications: select(): %m");
			return false;
		}
	}

	if (ret == 0)
	{
		/* we reached the timeout */
		return true;
	}

	/* Now check for input */
	PQconsumeInput(connection);

	/* drain every notification libpq has buffered before returning */
	while ((notify = PQnotifies(connection)) != NULL)
	{
		if (strcmp(notify->relname, "log") == 0)
		{
			log_info("%s", notify->extra);
		}
		else if (strcmp(notify->relname, "state") == 0)
		{
			CurrentNodeState nodeState = { 0 };

			log_trace("received \"%s\"", notify->extra);

			/* errors are logged by parse_state_notification_message */
			if (parse_state_notification_message(&nodeState, notify->extra))
			{
				(void) (*processor)(notificationContext, &nodeState);
			}
		}
		else
		{
			log_warn("BUG: received unknown notification on channel \"%s\": %s",
					 notify->relname, notify->extra);
		}

		PQfreemem(notify);
		PQconsumeInput(connection);
	}

	return true;
}


/*
 * monitor_log_notifications is a Notification Processing Function that gets
 * all the notifications from the monitor and append them to our logs.
 */
static void
monitor_log_notifications(void *context, CurrentNodeState *nodeState)
{
	LogNotificationContext *ctx = (LogNotificationContext *) context;

	nodestate_log(nodeState, ctx->logLevel, 0);
}


/*
 * monitor_get_notifications listens to notifications from the monitor and logs
 * them all.
 */
bool
monitor_get_notifications(Monitor *monitor, int timeoutMs)
{
	char *channels[] = { "state", "log", NULL };
	LogNotificationContext context = { LOG_INFO };

	return monitor_process_notifications(monitor,
										 timeoutMs,
										 channels,
										 (void *) &context,
										 &monitor_log_notifications);
}


/*
 * monitor_notification_process_apply_settings is a Notification Processing
 * Function that maintains the context (which is a
 * ApplySettingsNotificationContext actually) from notifications that are
 * received from the monitor_process_notifications function.
 */
static void
monitor_notification_process_apply_settings(void *context,
											CurrentNodeState *nodeState)
{
	ApplySettingsNotificationContext *ctx =
		(ApplySettingsNotificationContext *) context;

	/* filter notifications for our own formation */
	if (strcmp(nodeState->formation, ctx->formation) != 0)
	{
		return;
	}

	/*
	 * Track the primary through the 4-step apply_settings transition:
	 * primary/apply_settings, apply_settings/apply_settings,
	 * apply_settings/primary, then primary/primary (or wait_primary).
	 */
	if (nodeState->reportedState == PRIMARY_STATE &&
		nodeState->goalState == APPLY_SETTINGS_STATE)
	{
		ctx->applySettingsTransitionInProgress = true;

		log_debug("step 1/4: primary node " NODE_FORMAT " is assigned \"%s\"",
				  nodeState->node.nodeId,
				  nodeState->node.name,
				  nodeState->node.host,
				  nodeState->node.port,
				  NodeStateToString(nodeState->goalState));
	}
	else if (nodeState->reportedState == APPLY_SETTINGS_STATE &&
			 nodeState->goalState == APPLY_SETTINGS_STATE)
	{
		ctx->applySettingsTransitionInProgress = true;

		log_debug("step 2/4: primary node " NODE_FORMAT " reported \"%s\"",
				  nodeState->node.nodeId,
				  nodeState->node.name,
				  nodeState->node.host,
				  nodeState->node.port,
				  NodeStateToString(nodeState->reportedState));
	}
	else if (nodeState->reportedState == APPLY_SETTINGS_STATE &&
			 nodeState->goalState == PRIMARY_STATE)
	{
		ctx->applySettingsTransitionInProgress = true;

		log_debug("step 3/4: primary node " NODE_FORMAT " is assigned \"%s\"",
				  nodeState->node.nodeId,
				  nodeState->node.name,
				  nodeState->node.host,
				  nodeState->node.port,
				  NodeStateToString(nodeState->goalState));
	}
	else if (ctx->applySettingsTransitionInProgress &&
			 ((nodeState->reportedState == PRIMARY_STATE &&
			   nodeState->goalState == PRIMARY_STATE) ||
			  (nodeState->reportedState == WAIT_PRIMARY_STATE &&
			   nodeState->goalState == WAIT_PRIMARY_STATE)))
	{
		ctx->applySettingsTransitionDone = true;

		log_debug("step 4/4: primary node " NODE_FORMAT " reported \"%s\"",
				  nodeState->node.nodeId,
				  nodeState->node.name,
				  nodeState->node.host,
				  nodeState->node.port,
				  NodeStateToString(nodeState->reportedState));
	}

	/*
	 * In some cases applying a new value for a replication setting will not go
	 * through APPLY_SETTINGS. One such case is when changing candidate
	 * priority to trigger a failover when all the available nodes have
	 * candidate priority set to zero.
	 */
	if ((nodeState->reportedState == PRIMARY_STATE &&
		 nodeState->reportedState == nodeState->goalState) ||
		(nodeState->reportedState == WAIT_PRIMARY_STATE &&
		 nodeState->reportedState == nodeState->goalState))
	{
		ctx->applySettingsTransitionDone = true;
	}
}


/*
 * monitor_wait_until_primary_applied_settings receives notifications and
 * watches for the following "apply_settings" set of transitions:
 *
 * - primary/apply_settings
 * - apply_settings/apply_settings
 * - apply_settings/primary
 * - primary/primary
 *
 * If we lose the monitor connection while watching for the transition steps
 * then we stop watching. It's a best effort attempt at having the CLI be
 * useful for its user, the main one being the test suite.
 */
bool
monitor_wait_until_primary_applied_settings(Monitor *monitor,
											const char *formation)
{
	PGconn *connection = monitor->notificationClient.connection;
	ApplySettingsNotificationContext context = {
		(char *) formation,
		false,
		false
	};
	char *channels[] = { "state", "log", NULL };

	uint64_t start = time(NULL);

	if (connection == NULL)
	{
		log_warn("Lost connection.");
		return false;
	}

	log_info("Waiting for the settings to have been applied to "
			 "the monitor and primary node");

	while (!context.applySettingsTransitionDone)
	{
		/* give up after PG_AUTOCTL_LISTEN_NOTIFICATIONS_TIMEOUT seconds */
		uint64_t now = time(NULL);

		if ((now - start) > PG_AUTOCTL_LISTEN_NOTIFICATIONS_TIMEOUT)
		{
			log_error("Failed to receive monitor's notifications that the "
					  "settings have been applied");
			break;
		}

		if (!monitor_process_notifications(
				monitor,
				PG_AUTOCTL_LISTEN_NOTIFICATIONS_TIMEOUT * 1000,
				channels,
				(void *) &context,
				&monitor_notification_process_apply_settings))
		{
			/* errors have already been logged */
			break;
		}
	}

	/* disconnect from monitor */
	pgsql_finish(&monitor->notificationClient);

	return context.applySettingsTransitionDone;
}


/*
 * monitor_notification_process_wait_for_state_change is a Notification
 *
 Processing Function that gets all the notifications from our group from the
 * monitor and logs them.
 */
static void
monitor_notification_process_wait_for_state_change(void *context,
												   CurrentNodeState *nodeState)
{
	WaitForStateChangeNotificationContext *ctx =
		(WaitForStateChangeNotificationContext *) context;

	/* filter notifications for our own formation */
	if (strcmp(nodeState->formation, ctx->formation) != 0 ||
		nodeState->groupId != ctx->groupId)
	{
		return;
	}

	/* here, we received a state change that belongs to our formation/group */
	ctx->stateHasChanged = true;

	nodestate_log(nodeState, LOG_INFO, ctx->nodeId);
}


/*
 * monitor_wait_for_state_change waits for timeout milliseconds or until we
 * receive a notification for a state change concerning the given nodeId,
 * whichever comes first.
 *
 * When we have received at least one notification for the given groupId then
 * the stateHasChanged boolean is set to true, otherwise it's set to false.
 */
bool
monitor_wait_for_state_change(Monitor *monitor,
							  const char *formation,
							  int groupId,
							  int64_t nodeId,
							  int timeoutMs,
							  bool *stateHasChanged)
{
	PGconn *connection = monitor->notificationClient.connection;

	WaitForStateChangeNotificationContext context = {
		(char *) formation,
		groupId,
		nodeId,
		false                   /* stateHasChanged */
	};

	char *channels[] = { "state", NULL };

	if (connection == NULL)
	{
		log_warn("Lost connection.");
		return false;
	}

	/* a single poll: the processor flips context.stateHasChanged */
	if (!monitor_process_notifications(
			monitor,
			timeoutMs,
			channels,
			(void *) &context,
			&monitor_notification_process_wait_for_state_change))
	{
		return false;
	}

	*stateHasChanged = context.stateHasChanged;

	return true;
}


/*
 * monitor_report_state_print_headers fetches other nodes array on the monitor
 * and prints a table array on stdout to prepare for notifications output.
 */
static void
monitor_report_state_print_headers(Monitor *monitor,
								   const char *formation,
								   int groupId,
								   PgInstanceKind nodeKind,
								   NodeAddressArray *nodesArray,
								   NodeAddressHeaders *headers)
{
	log_info("Listening monitor notifications about state changes "
			 "in formation \"%s\" and group %d",
			 formation, groupId);
	log_info("Following table displays times when notifications are received");

	if (!monitor_get_nodes(monitor,
						   (char *) formation,
						   groupId,
						   nodesArray))
	{
		/* ignore the error, use an educated guess for the max size */
		log_warn("Failed to get_nodes() on the monitor");

		/*
		 * NOTE(review): these fallback sizes are set before the
		 * nodeAddressArrayPrepareHeaders() call below, which presumably
		 * recomputes them from the (empty) array — confirm it keeps or
		 * maxes with these values rather than overwriting them.
		 */
		headers->maxNameSize = 25;
		headers->maxHostSize = 25;
		headers->maxNodeSize = 5;
	}

	(void) nodeAddressArrayPrepareHeaders(headers,
										  nodesArray,
										  groupId,
										  nodeKind);

	/* column widths come from the longest name/node/host seen above */
	fformat(stdout, "%8s | %*s | %*s | %*s | %19s | %19s\n",
			"Time",
			headers->maxNameSize, "Name",
			headers->maxNodeSize, "Node",
			headers->maxHostSize, "Host:Port",
			"Current State",
			"Assigned State");

	fformat(stdout, "%8s-+-%*s-+-%*s-+-%*s-+-%19s-+-%19s\n",
			"--------",
			headers->maxNameSize, headers->nameSeparatorHeader,
			headers->maxNodeSize, headers->nodeSeparatorHeader,
			headers->maxHostSize, headers->hostSeparatorHeader,
			"-------------------",
			"-------------------");
}


/*
 * monitor_check_report_state is Notification Processing Function that gets all
 * the notifications from our group from the monitor and reports them in a
 * table-like output to stdout.
 *
 * The function also maintains the context->failoverIsDone to signal to its
 * caller that the wait is over. We reach failoverIsDone when one of the nodes
 * in the context's group reaches the given targetState.
 */
static void
monitor_check_report_state(void *context, CurrentNodeState *nodeState)
{
	WaitUntilStateNotificationContext *ctx =
		(WaitUntilStateNotificationContext *) context;

	uint64_t now = time(NULL);
	char timestring[MAXCTIMESIZE] = { 0 };

	char hostport[BUFSIZE] = { 0 };
	char composedId[BUFSIZE] = { 0 };
	char tliLSN[BUFSIZE] = { 0 };

	/* filter notifications for our own formation */
	if (strcmp(nodeState->formation, ctx->formation) != 0 ||
		nodeState->groupId != ctx->groupId)
	{
		return;
	}

	/* format the current time to be user-friendly */
	epoch_to_string(now, timestring);

	/* "Wed Jun 30 21:49:08 1993" -> "21:49:08" */
	timestring[11 + 8] = '\0';

	(void) nodestatePrepareNode(ctx->headers,
								&(nodeState->node),
								ctx->groupId,
								hostport,
								composedId,
								tliLSN);

	/* timestring + 11 skips the "Wed Jun 30 " prefix */
	fformat(stdout, "%8s | %*s | %*s | %*s | %19s | %19s\n",
			timestring + 11,
			ctx->headers->maxNameSize, nodeState->node.name,
			ctx->headers->maxNodeSize, composedId,
			ctx->headers->maxHostSize, hostport,
			NodeStateToString(nodeState->reportedState),
			NodeStateToString(nodeState->goalState));

	/*
	 * Ignore notifications from the first loop: they may reflect a state
	 * reached before we started waiting.
	 */
	if (nodeState->goalState == ctx->targetState &&
		nodeState->reportedState == ctx->targetState &&
		!ctx->firstLoop)
	{
		ctx->failoverIsDone = true;
	}

	if (ctx->firstLoop)
	{
		ctx->firstLoop = false;
	}
}


/*
 * monitor_wait_until_some_node_reported_state receives notifications and
 * watches for a new node to be reported with the given targetState.
 *
 * If we lose the monitor connection while watching for the transition steps
 * then we stop watching. It's a best effort attempt at having the CLI be
 * useful for its user, the main one being the test suite.
 */
bool
monitor_wait_until_some_node_reported_state(Monitor *monitor,
											const char *formation,
											int groupId,
											PgInstanceKind nodeKind,
											NodeState targetState,
											int timeout)
{
	PGconn *connection = monitor->notificationClient.connection;

	NodeAddressArray nodesArray = { 0 };
	NodeAddressHeaders headers = { 0 };

	WaitUntilStateNotificationContext context = {
		(char *) formation,
		groupId,
		&headers,
		targetState,
		false,                  /* failoverIsDone */
		true                    /* firstLoop */
	};

	char *channels[] = { "state", NULL };

	uint64_t start = time(NULL);

	if (connection == NULL)
	{
		log_warn("Lost connection.");
		return false;
	}

	/* when timeout <= 0 we just never stop waiting */
	if (timeout > 0)
	{
		log_info("Waiting %d secs for a notification with "
				 "state \"%s\" in formation \"%s\" and group %d",
				 timeout,
				 NodeStateToString(targetState),
				 formation, groupId);
	}

	(void) monitor_report_state_print_headers(monitor,
											  formation,
											  groupId,
											  nodeKind,
											  &nodesArray,
											  &headers);

	while (!context.failoverIsDone)
	{
		/* when timeout <= 0 we just never stop waiting */
		if (timeout > 0)
		{
			uint64_t now = time(NULL);

			if ((now - start) > timeout)
			{
				log_error("Failed to receive monitor's notifications");
				break;
			}
		}

		/* with no user timeout, poll using the default listen timeout */
		int thisLoopTimeout =
			timeout > 0 ? timeout : PG_AUTOCTL_LISTEN_NOTIFICATIONS_TIMEOUT;

		if (!monitor_process_notifications(
				monitor,
				thisLoopTimeout * 1000,
				channels,
				(void *) &context,
				&monitor_check_report_state))
		{
			/* errors have already been logged */
			break;
		}
	}

	/* disconnect from monitor */
	pgsql_finish(&monitor->notificationClient);

	return context.failoverIsDone;
}


/*
 * monitor_check_node_report_state is a Notification Processing Function that
 * gets all the notifications from our group from the monitor and reports them
 * in a table-like output to stdout.
 *
 * The function also maintains the context->done to signal to its caller that
 * the wait is over. We reach done when the given node in the context's group
 * reaches one of the given targetStates.
 */
static void
monitor_check_node_report_state(void *context, CurrentNodeState *nodeState)
{
	WaitUntilNodeStateNotificationContext *ctx =
		(WaitUntilNodeStateNotificationContext *) context;

	uint64_t now = time(NULL);
	char timestring[MAXCTIMESIZE] = { 0 };

	char hostport[BUFSIZE] = { 0 };
	char composedId[BUFSIZE] = { 0 };
	char tliLSN[BUFSIZE] = { 0 };

	/* filter notifications for our own formation */
	if (strcmp(nodeState->formation, ctx->formation) != 0 ||
		nodeState->groupId != ctx->groupId)
	{
		return;
	}

	/* format the current time to be user-friendly */
	epoch_to_string(now, timestring);

	/* "Wed Jun 30 21:49:08 1993" -> "21:49:08" */
	timestring[11 + 8] = '\0';

	(void) nodestatePrepareNode(ctx->headers,
								&(nodeState->node),
								ctx->groupId,
								hostport,
								composedId,
								tliLSN);

	/* timestring + 11 skips the "Wed Jun 30 " prefix */
	fformat(stdout, "%8s | %*s | %*s | %*s | %19s | %19s\n",
			timestring + 11,
			ctx->headers->maxNameSize, nodeState->node.name,
			ctx->headers->maxNodeSize, composedId,
			ctx->headers->maxHostSize, hostport,
			NodeStateToString(nodeState->reportedState),
			NodeStateToString(nodeState->goalState));

	/*
	 * We are done as soon as the watched node has both reported and been
	 * assigned one of the target states, ignoring the first loop's
	 * potentially stale notifications.
	 */
	for (int i = 0; i < ctx->targetStatesLength; i++)
	{
		if (nodeState->goalState == ctx->targetStates[i] &&
			nodeState->reportedState == ctx->targetStates[i] &&
			nodeState->node.nodeId == ctx->nodeId &&
			!ctx->firstLoop)
		{
			ctx->done = true;
		}
	}

	if (ctx->firstLoop)
	{
		ctx->firstLoop = false;
	}
}


/*
 * monitor_wait_until_node_reported_state receives notifications and watches
 * for the given node to be reported with one of the given targetStates.
 *
 * If we lose the monitor connection while watching for the transition steps
 * then we stop watching. It's a best effort attempt at having the CLI be
 * useful for its user, the main one being the test suite.
 */
bool
monitor_wait_until_node_reported_state(Monitor *monitor,
									   const char *formation,
									   int groupId,
									   int64_t nodeId,
									   PgInstanceKind nodeKind,
									   NodeState *targetStates,
									   int targetStatesLength)
{
	PGconn *connection = monitor->notificationClient.connection;

	NodeAddressArray nodesArray = { 0 };
	NodeAddressHeaders headers = { 0 };

	WaitUntilNodeStateNotificationContext context = {
		(char *) formation,
		groupId,
		nodeId,
		&headers,
		targetStates,
		targetStatesLength,
		false,                  /* done */
		true                    /* firstLoop */
	};

	char *channels[] = { "state", NULL };

	uint64_t start = time(NULL);

	if (connection == NULL)
	{
		log_warn("Lost connection.");
		return false;
	}

	(void) monitor_report_state_print_headers(monitor,
											  formation,
											  groupId,
											  nodeKind,
											  &nodesArray,
											  &headers);

	while (!context.done)
	{
		/* give up after PG_AUTOCTL_LISTEN_NOTIFICATIONS_TIMEOUT seconds */
		uint64_t now = time(NULL);

		if ((now - start) > PG_AUTOCTL_LISTEN_NOTIFICATIONS_TIMEOUT)
		{
			log_error("Failed to receive monitor's notifications");
			break;
		}

		if (!monitor_process_notifications(
				monitor,
				PG_AUTOCTL_LISTEN_NOTIFICATIONS_TIMEOUT * 1000,
				channels,
				(void *) &context,
				&monitor_check_node_report_state))
		{
			/* errors have already been logged */
			break;
		}
	}

	/* disconnect from monitor */
	pgsql_finish(&monitor->notificationClient);

	return context.done;
}


/*
 * monitor_get_extension_version gets the current extension version from the
 * Monitor's Postgres catalog pg_available_extensions.
 */
bool
monitor_get_extension_version(Monitor *monitor, MonitorExtensionVersion *version)
{
	MonitorExtensionVersionParseContext context = { { 0 }, version, false };

	PGSQL *pgsql = &monitor->pgsql;
	const char *sql = "SELECT default_version, installed_version"
					  " FROM pg_available_extensions WHERE name = $1";
	int paramCount = 1;
	Oid paramTypes[1] = { TEXTOID };
	const char *paramValues[1];

	paramValues[0] = PG_AUTOCTL_MONITOR_EXTENSION_NAME;

	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   &context, &parseExtensionVersion))
	{
		log_error("Failed to get the current version for extension \"%s\", "
				  "see previous lines for details.",
				  PG_AUTOCTL_MONITOR_EXTENSION_NAME);
		return false;
	}

	if (!context.parsedOK)
	{
		/* errors have already been logged */
		return false;
	}

	return true;
}


/*
 * parseExtensionVersion parses the resultset of a query on the Postgres
 * pg_available_extension_versions catalogs.
 */
static void
parseExtensionVersion(void *ctx, PGresult *result)
{
	MonitorExtensionVersionParseContext *context =
		(MonitorExtensionVersionParseContext *) ctx;

	/* we have rows: we accept only one */
	if (PQntuples(result) != 1)
	{
		log_error("Query returned %d rows, expected 1", PQntuples(result));
		context->parsedOK = false;
		return;
	}

	if (PQnfields(result) != 2)
	{
		log_error("Query returned %d columns, expected 2", PQnfields(result));
		context->parsedOK = false;
		return;
	}

	if (PQgetisnull(result, 0, 0) || PQgetisnull(result, 0, 1))
	{
		log_error("default_version or installed_version for extension \"%s\" "
				  "is NULL ", PG_AUTOCTL_MONITOR_EXTENSION_NAME);
		context->parsedOK = false;
		return;
	}

	char *value = PQgetvalue(result, 0, 0);
	int length = strlcpy(context->version->defaultVersion, value, BUFSIZE);

	if (length >= BUFSIZE)
	{
		log_error("default_version \"%s\" returned by monitor is %d characters, "
				  "the maximum supported by pg_autoctl is %d",
				  value, length, BUFSIZE - 1);
		context->parsedOK = false;
		return;
	}

	value = PQgetvalue(result, 0, 1);
	length = strlcpy(context->version->installedVersion, value, BUFSIZE);

	if (length >= BUFSIZE)
	{
		log_error("installed_version \"%s\" returned by monitor is %d characters, "
				  "the maximum supported by pg_autoctl is %d",
				  value, length, BUFSIZE - 1);
		context->parsedOK = false;
		return;
	}

	context->parsedOK = true;
}


/*
 * monitor_extension_update executes ALTER EXTENSION ... UPDATE TO ...
 */
bool
monitor_extension_update(Monitor *monitor, const char *targetVersion)
{
	PGSQL *pgsql = &monitor->pgsql;

	int targetVersionNum = 0;

	/* the test suite upgrades to a "dummy" version */
	if (strcmp(targetVersion, "dummy") != 0 &&
		!parse_pgaf_extension_version_string(targetVersion,
											 &targetVersionNum))
	{
		/* errors have already been logged */
		return false;
	}

	/*
	 * When upgrading to version 1.4 we now require btree_gist. It does not
	 * seem like Postgres knows how to handle changes in extension control
	 * requires, so let's do that manually here.
	 */
	if (targetVersionNum >= 104)
	{
		/*
		 * Ensure "btree_gist" is available in the server extension dir used to
		 * create the Postgres instance. We only search for the control file to
		 * offer better diagnostics in the logs in case the following CREATE
		 * EXTENSION fails.
		 */
		char *btreeGistExtName = "btree_gist";

		/* missing control file is only a warning: CREATE EXTENSION decides */
		if (!find_extension_control_file(monitor->config.pgSetup.pg_ctl,
										 btreeGistExtName))
		{
			log_warn("Failed to find extension control file for \"%s\"",
					 btreeGistExtName);
			log_info("You might have to install a PostgreSQL contrib package");
		}

		if (!pgsql_create_extension(pgsql, btreeGistExtName))
		{
			log_error("Failed to create extension \"%s\" "
					  "required by \"%s\" extension version 1.4",
					  btreeGistExtName,
					  PG_AUTOCTL_MONITOR_EXTENSION_NAME);
			return false;
		}
	}

	return pgsql_alter_extension_update_to(pgsql,
										   PG_AUTOCTL_MONITOR_EXTENSION_NAME,
										   targetVersion);
}


/*
 * monitor_ensure_extension_version checks that we are running an extension
 * version on the monitor that we are compatible with in pg_autoctl.
If that's * not the case, we blindly try to update the extension version on the monitor * to the target version we have in our defaults.h. * * NOTE: we don't check here if the update is an upgrade or a downgrade, we * rely on the extension's update path to be free of downgrade paths (such as * pgautofailover--1.2--1.1.sql). */ bool monitor_ensure_extension_version(Monitor *monitor, LocalPostgresServer *postgres, MonitorExtensionVersion *version) { const char *extensionVersion = PG_AUTOCTL_EXTENSION_VERSION; char envExtensionVersion[MAXPGPATH]; /* in test environment, we can export any target version we want */ if (env_exists(PG_AUTOCTL_DEBUG) && env_exists(PG_AUTOCTL_EXTENSION_VERSION_VAR)) { if (!get_env_copy(PG_AUTOCTL_EXTENSION_VERSION_VAR, envExtensionVersion, MAXPGPATH)) { /* errors have already been logged */ return false; } extensionVersion = envExtensionVersion; log_debug("monitor_ensure_extension_version targets extension " "version \"%s\" - as per environment.", extensionVersion); } if (!monitor_get_extension_version(monitor, version)) { log_fatal("Failed to check version compatibility with the monitor " "extension \"%s\", see above for details", PG_AUTOCTL_MONITOR_EXTENSION_NAME); return false; } if (strcmp(version->installedVersion, extensionVersion) != 0) { Monitor dbOwnerMonitor = { 0 }; log_warn("This version of pg_autoctl requires the extension \"%s\" " "version \"%s\" to be installed on the monitor, current " "version is \"%s\".", PG_AUTOCTL_MONITOR_EXTENSION_NAME, extensionVersion, version->installedVersion); /* we need to copy over the pg_ctl path and other pieces of setup. */ dbOwnerMonitor.config.pgSetup = monitor->config.pgSetup; /* * Ok, let's try to update the extension then. * * For that we need to connect as the owner of the database, which was * the current $USER at the time of the `pg_autoctl create monitor` * command. 
*/ if (!prepare_connection_to_current_system_user(monitor, &dbOwnerMonitor)) { log_error("Failed to update extension \"%s\" to version \"%s\": " "failed prepare a connection string to the " "monitor as the database owner", PG_AUTOCTL_MONITOR_EXTENSION_NAME, extensionVersion); return false; } if (!monitor_extension_update(&dbOwnerMonitor, extensionVersion)) { log_fatal("Failed to update extension \"%s\" to version \"%s\" " "on the monitor, see above for details", PG_AUTOCTL_MONITOR_EXTENSION_NAME, extensionVersion); /* explicitly close the dbOwner connection to the monitor */ pgsql_finish(&(dbOwnerMonitor.pgsql)); return false; } /* explicitly close the dbOwner connection to the monitor */ pgsql_finish(&(dbOwnerMonitor.pgsql)); if (!monitor_get_extension_version(monitor, version)) { log_fatal("Failed to check version compatibility with the monitor " "extension \"%s\", see above for details", PG_AUTOCTL_MONITOR_EXTENSION_NAME); return false; } log_info("Updated extension \"%s\" to version \"%s\"", PG_AUTOCTL_MONITOR_EXTENSION_NAME, version->installedVersion); /* * Now that we have done the ALTER EXTENSION UPDATE, our background * workers on the monitor have been started with the new shared library * object and the old SQL definitions. Let's restart Postgres so that * the background workers have a chance of a fresh start with an SQL * schema that matches the expectations of the shared library code. 
*/ log_info("Restarting Postgres on the monitor"); /* avoid spurious error messages about losing our connection */ pgsql_finish(&(monitor->pgsql)); pgsql_finish(&(monitor->notificationClient)); if (!ensure_postgres_service_is_stopped(postgres)) { log_error("Failed to restart Postgres on the monitor after " "an extension update"); return false; } return ensure_postgres_service_is_running(postgres); } /* just mention we checked, and it's ok */ log_info("The version of extension \"%s\" is \"%s\" on the monitor", PG_AUTOCTL_MONITOR_EXTENSION_NAME, version->installedVersion); return true; } /* * prepare_connection_to_current_system_user changes a given pguri to remove * its "user" connection parameter, filling in the pre-allocated keywords and * values string arrays. * * Postgres docs at the following address show 30 connection parameters, so the * arrays should allocate 31 entries at least. The last one is going to be * NULL. * * https://www.postgresql.org/docs/current/libpq-connect.html */ static bool prepare_connection_to_current_system_user(Monitor *source, Monitor *target) { const char *keywords[41] = { 0 }; const char *values[41] = { 0 }; char *errmsg; PQconninfoOption *conninfo, *option; int argCount = 0; conninfo = PQconninfoParse(source->pgsql.connectionString, &errmsg); if (conninfo == NULL) { log_error("Failed to parse pguri \"%s\": %s", source->pgsql.connectionString, errmsg); PQfreemem(errmsg); return false; } for (option = conninfo; option->keyword != NULL; option++) { if (strcmp(option->keyword, "user") == 0) { /* skip the user, $USER is what we want to use here */ continue; } else if (option->val) { if (argCount == 40) { log_error("Failed to parse Postgres URI options: " "pg_autoctl supports up to 40 options " "and we are parsing more than that."); return false; } keywords[argCount] = option->keyword; values[argCount] = option->val; ++argCount; } } keywords[argCount] = NULL; values[argCount] = NULL; /* open the connection now, and check that everything 
is ok */ target->pgsql.connection = PQconnectdbParams(keywords, values, 0); /* Check to see that the backend connection was successfully made */ if (PQstatus(target->pgsql.connection) != CONNECTION_OK) { log_error("Connection to database failed: %s", PQerrorMessage(target->pgsql.connection)); pgsql_finish(&(target->pgsql)); PQconninfoFree(conninfo); return false; } /* Finally mark the connection as multi statement */ target->pgsql.connectionStatementType = PGSQL_CONNECTION_MULTI_STATEMENT; PQconninfoFree(conninfo); return true; } /* * monitor_find_node_by_nodeid probes the monitor's database to see if the * given nodeid matches with an existing node. When found, the array contains * one entry with the details of the node, otherwise the array is empty. */ bool monitor_find_node_by_nodeid(Monitor *monitor, const char *formation, int groupId, int64_t nodeId, NodeAddressArray *nodesArray) { PGSQL *pgsql = &monitor->pgsql; const char *sql = "SELECT * FROM pgautofailover.get_nodes($1, $2) WHERE node_id = $3"; int paramCount = 3; Oid paramTypes[3] = { TEXTOID, INT4OID, INT8OID }; const char *paramValues[3]; NodeAddressArrayParseContext parseContext = { { 0 }, nodesArray, false }; paramValues[0] = formation; paramValues[1] = intToString(groupId).strValue; paramValues[2] = intToString(nodeId).strValue; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &parseContext, parseNodeArray)) { log_error("Failed to get nodes for group %d in formation \"%s\" " "from the monitor", groupId, formation); return false; } if (!parseContext.parsedOK) { log_error("Failed to get nodes for group %d in formation \"%s\" " "from the monitor", groupId, formation); return false; } return true; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/monitor.h000066400000000000000000000221401414244367200224140ustar00rootroot00000000000000/* * src/bin/pg_autoctl/monitor.h * Functions for interacting with a pg_auto_failover monitor * * Copyright (c) Microsoft Corporation. 
All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef MONITOR_H #define MONITOR_H #include #include "pgsql.h" #include "monitor_config.h" #include "nodestate_utils.h" #include "primary_standby.h" #include "state.h" /* the monitor manages a postgres server running the pgautofailover extension */ typedef struct Monitor { PGSQL pgsql; PGSQL notificationClient; MonitorConfig config; } Monitor; typedef struct MonitorAssignedState { char name[_POSIX_HOST_NAME_MAX]; int64_t nodeId; int groupId; NodeState state; int candidatePriority; bool replicationQuorum; } MonitorAssignedState; typedef struct StateNotification { char message[BUFSIZE]; NodeState reportedState; NodeState goalState; char formationId[NAMEDATALEN]; int groupId; int64_t nodeId; char hostName[_POSIX_HOST_NAME_MAX]; int nodePort; } StateNotification; #define MONITOR_EVENT_TIME_LEN 20 /* "YYYY-MM-DD HH:MI:SS" */ typedef struct MonitorEvent { int64_t eventId; char eventTime[MONITOR_EVENT_TIME_LEN]; char formationId[NAMEDATALEN]; int64_t nodeId; int groupId; char nodeName[NAMEDATALEN]; char nodeHost[_POSIX_HOST_NAME_MAX]; int nodePort; NodeState reportedState; NodeState assignedState; char replicationState[NAMEDATALEN]; int timeline; char lsn[PG_LSN_MAXLENGTH]; int candidatePriority; bool replicationQuorum; char description[BUFSIZE]; } MonitorEvent; #define EVENTS_ARRAY_MAX_COUNT 1024 typedef struct MonitorEventsArray { int count; MonitorEvent events[EVENTS_ARRAY_MAX_COUNT]; } MonitorEventsArray; typedef struct MonitorExtensionVersion { char defaultVersion[BUFSIZE]; char installedVersion[BUFSIZE]; } MonitorExtensionVersion; typedef struct CoordinatorNodeAddress { bool found; NodeAddress node; } CoordinatorNodeAddress; #define NODE_FORMAT "%" PRId64 " \"%s\" (%s:%d)" bool monitor_init(Monitor *monitor, char *url); void monitor_setup_notifications(Monitor *monitor, int groupId, int64_t nodeId); bool monitor_has_received_notifications(Monitor *monitor); bool 
monitor_process_state_notification(int notificationGroupId, int64_t notificationNodeId, char *channel, char *payload); bool monitor_local_init(Monitor *monitor); void monitor_finish(Monitor *monitor); bool monitor_retryable_error(const char *sqlstate); bool monitor_get_nodes(Monitor *monitor, char *formation, int groupId, NodeAddressArray *nodeArray); bool monitor_print_nodes(Monitor *monitor, char *formation, int groupId); bool monitor_print_nodes_as_json(Monitor *monitor, char *formation, int groupId); bool monitor_get_other_nodes(Monitor *monitor, int64_t myNodeId, NodeState currentState, NodeAddressArray *nodeArray); bool monitor_print_other_nodes(Monitor *monitor, int64_t myNodeId, NodeState currentState); bool monitor_print_other_nodes_as_json(Monitor *monitor, int64_t myNodeId, NodeState currentState); bool monitor_get_primary(Monitor *monitor, char *formation, int groupId, NodeAddress *node); bool monitor_get_coordinator(Monitor *monitor, char *formation, CoordinatorNodeAddress *coordinatorNodeAddress); bool monitor_get_most_advanced_standby(Monitor *monitor, char *formation, int groupId, NodeAddress *node); bool monitor_register_node(Monitor *monitor, char *formation, char *name, char *host, int port, uint64_t system_identifier, char *dbname, int64_t desiredNodeId, int desiredGroupId, NodeState initialState, PgInstanceKind kind, int candidatePriority, bool quorum, char *citusClusterName, bool *mayRetry, MonitorAssignedState *assignedState); bool monitor_node_active(Monitor *monitor, char *formation, int64_t nodeId, int groupId, NodeState currentState, bool pgIsRunning, int currentTLI, char *currentLSN, char *pgsrSyncState, MonitorAssignedState *assignedState); bool monitor_get_node_replication_settings(Monitor *monitor, NodeReplicationSettings *settings); bool monitor_set_node_candidate_priority(Monitor *monitor, char *formation, char *name, int candidatePriority); bool monitor_set_node_replication_quorum(Monitor *monitor, char *formation, char *name, bool 
replicationQuorum); bool monitor_get_formation_number_sync_standbys(Monitor *monitor, char *formation, int *numberSyncStandbys); bool monitor_set_formation_number_sync_standbys(Monitor *monitor, char *formation, int numberSyncStandbys); bool monitor_remove_by_hostname(Monitor *monitor, char *host, int port, bool force, int64_t *nodeId, int *groupId); bool monitor_remove_by_nodename(Monitor *monitor, char *formation, char *name, bool force, int64_t *nodeId, int *groupId); bool monitor_count_groups(Monitor *monitor, char *formation, int *groupsCount); bool monitor_get_groupId_from_name(Monitor *monitor, char *formation, char *name, int *groupId); bool monitor_perform_failover(Monitor *monitor, char *formation, int group); bool monitor_perform_promotion(Monitor *monitor, char *formation, char *name); bool monitor_get_current_state(Monitor *monitor, char *formation, int group, CurrentNodeStateArray *nodesArray); bool monitor_get_last_events(Monitor *monitor, char *formation, int group, int count, MonitorEventsArray *monitorEventsArray); bool monitor_print_state(Monitor *monitor, char *formation, int group); bool monitor_print_last_events(Monitor *monitor, char *formation, int group, int count); bool monitor_print_state_as_json(Monitor *monitor, char *formation, int group); bool monitor_print_last_events_as_json(Monitor *monitor, char *formation, int group, int count, FILE *stream); bool monitor_print_every_formation_uri(Monitor *monitor, const SSLOptions *ssl); bool monitor_print_every_formation_uri_as_json(Monitor *monitor, const SSLOptions *ssl, FILE *stream); bool monitor_create_formation(Monitor *monitor, char *formation, char *kind, char *dbname, bool ha, int numberSyncStandbys); bool monitor_enable_secondary_for_formation(Monitor *monitor, const char *formation); bool monitor_disable_secondary_for_formation(Monitor *monitor, const char *formation); bool monitor_drop_formation(Monitor *monitor, char *formation); bool monitor_count_failover_candidates(Monitor 
*monitor, char *formation, int groupId, int *failoverCandidateCount); bool monitor_print_formation_settings(Monitor *monitor, char *formation); bool monitor_print_formation_settings_as_json(Monitor *monitor, char *formation); bool monitor_formation_uri(Monitor *monitor, const char *formation, const char *citusClusterName, const SSLOptions *ssl, char *connectionString, size_t size); bool monitor_synchronous_standby_names(Monitor *monitor, char *formation, int groupId, char *synchronous_standby_names, int size); bool monitor_update_node_metadata(Monitor *monitor, int64_t nodeId, const char *name, const char *hostname, int port); bool monitor_set_node_system_identifier(Monitor *monitor, int64_t nodeId, uint64_t system_identifier); bool monitor_set_group_system_identifier(Monitor *monitor, int groupId, uint64_t system_identifier); bool monitor_start_maintenance(Monitor *monitor, int64_t nodeId, bool *mayRetry); bool monitor_stop_maintenance(Monitor *monitor, int64_t nodeId, bool *mayRetry); bool monitor_get_notifications(Monitor *monitor, int timeoutMs); bool monitor_wait_until_primary_applied_settings(Monitor *monitor, const char *formation); bool monitor_wait_until_some_node_reported_state(Monitor *monitor, const char *formation, int groupId, PgInstanceKind nodeKind, NodeState targetState, int timeout); bool monitor_wait_until_node_reported_state(Monitor *monitor, const char *formation, int groupId, int64_t nodeId, PgInstanceKind nodeKind, NodeState *targetStates, int targetStatesLength); bool monitor_wait_for_state_change(Monitor *monitor, const char *formation, int groupId, int64_t nodeId, int timeoutMs, bool *stateHasChanged); bool monitor_get_extension_version(Monitor *monitor, MonitorExtensionVersion *version); bool monitor_extension_update(Monitor *monitor, const char *targetVersion); bool monitor_ensure_extension_version(Monitor *monitor, LocalPostgresServer *postgres, MonitorExtensionVersion *version); bool monitor_find_node_by_nodeid(Monitor *monitor, const 
char *formation, int groupId, int64_t nodeId, NodeAddressArray *nodesArray); #endif /* MONITOR_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/monitor_config.c000066400000000000000000000440151414244367200237410ustar00rootroot00000000000000/* * src/bin/pg_autoctl/monitor_config.c * Monitor configuration functions * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include "postgres_fe.h" #include "config.h" #include "defaults.h" #include "ini_file.h" #include "ipaddr.h" #include "monitor.h" #include "monitor_config.h" #include "log.h" #include "pgctl.h" #define OPTION_AUTOCTL_ROLE(config) \ make_strbuf_option_default("pg_autoctl", "role", NULL, true, NAMEDATALEN, \ config->role, MONITOR_ROLE) /* * --hostname used to be --nodename, and we need to support transition from the * old to the new name. For that, we read the pg_autoctl.nodename config * setting and change it on the fly to hostname instead. * * As a result HOSTNAME is marked not required and NODENAME is marked compat. 
*/ #define OPTION_AUTOCTL_HOSTNAME(config) \ make_strbuf_option("pg_autoctl", "hostname", "hostname", \ false, _POSIX_HOST_NAME_MAX, config->hostname) #define OPTION_AUTOCTL_NODENAME(config) \ make_strbuf_compat_option("pg_autoctl", "nodename", \ _POSIX_HOST_NAME_MAX, config->hostname) #define OPTION_POSTGRESQL_PGDATA(config) \ make_strbuf_option("postgresql", "pgdata", "pgdata", true, MAXPGPATH, \ config->pgSetup.pgdata) #define OPTION_POSTGRESQL_PG_CTL(config) \ make_strbuf_option("postgresql", "pg_ctl", "pgctl", false, MAXPGPATH, \ config->pgSetup.pg_ctl) #define OPTION_POSTGRESQL_USERNAME(config) \ make_strbuf_option("postgresql", "username", "username", \ false, NAMEDATALEN, \ config->pgSetup.username) #define OPTION_POSTGRESQL_DBNAME(config) \ make_strbuf_option("postgresql", "dbname", "dbname", false, NAMEDATALEN, \ config->pgSetup.dbname) #define OPTION_POSTGRESQL_HOST(config) \ make_strbuf_option("postgresql", "host", "pghost", \ false, _POSIX_HOST_NAME_MAX, \ config->pgSetup.pghost) #define OPTION_POSTGRESQL_PORT(config) \ make_int_option("postgresql", "port", "pgport", \ true, &(config->pgSetup.pgport)) #define OPTION_POSTGRESQL_LISTEN_ADDRESSES(config) \ make_strbuf_option("postgresql", "listen_addresses", "listen", \ false, MAXPGPATH, config->pgSetup.listen_addresses) #define OPTION_POSTGRESQL_AUTH_METHOD(config) \ make_strbuf_option("postgresql", "auth_method", "auth", \ false, MAXPGPATH, config->pgSetup.authMethod) #define OPTION_SSL_ACTIVE(config) \ make_int_option_default("ssl", "active", NULL, \ false, &(config->pgSetup.ssl.active), 0) #define OPTION_SSL_MODE(config) \ make_strbuf_option("ssl", "sslmode", "ssl-mode", \ false, SSL_MODE_STRLEN, config->pgSetup.ssl.sslModeStr) #define OPTION_SSL_CA_FILE(config) \ make_strbuf_option("ssl", "ca_file", "ssl-ca-file", \ false, MAXPGPATH, config->pgSetup.ssl.caFile) #define OPTION_SSL_CRL_FILE(config) \ make_strbuf_option("ssl", "crl_file", "ssl-crl-file", \ false, MAXPGPATH, config->pgSetup.ssl.crlFile) 
#define OPTION_SSL_SERVER_CERT(config) \ make_strbuf_option("ssl", "cert_file", "server-cert", \ false, MAXPGPATH, config->pgSetup.ssl.serverCert) #define OPTION_SSL_SERVER_KEY(config) \ make_strbuf_option("ssl", "key_file", "server-key", \ false, MAXPGPATH, config->pgSetup.ssl.serverKey) #define SET_INI_OPTIONS_ARRAY(config) \ { \ OPTION_AUTOCTL_ROLE(config), \ OPTION_AUTOCTL_HOSTNAME(config), \ OPTION_AUTOCTL_NODENAME(config), \ OPTION_POSTGRESQL_PGDATA(config), \ OPTION_POSTGRESQL_PG_CTL(config), \ OPTION_POSTGRESQL_USERNAME(config), \ OPTION_POSTGRESQL_DBNAME(config), \ OPTION_POSTGRESQL_HOST(config), \ OPTION_POSTGRESQL_PORT(config), \ OPTION_POSTGRESQL_LISTEN_ADDRESSES(config), \ OPTION_POSTGRESQL_AUTH_METHOD(config), \ OPTION_SSL_MODE(config), \ OPTION_SSL_ACTIVE(config), \ OPTION_SSL_CA_FILE(config), \ OPTION_SSL_CRL_FILE(config), \ OPTION_SSL_SERVER_CERT(config), \ OPTION_SSL_SERVER_KEY(config), \ INI_OPTION_LAST \ } /* * monitor_config_set_pathnames_from_pgdata sets the config pathnames from its * pgSetup.pgdata field, which must have already been set when calling this * function. 
*/ bool monitor_config_set_pathnames_from_pgdata(MonitorConfig *config) { if (IS_EMPTY_STRING_BUFFER(config->pgSetup.pgdata)) { /* developer error */ log_error("BUG: monitor_config_set_pathnames_from_pgdata: empty pgdata"); return false; } if (!SetConfigFilePath(&(config->pathnames), config->pgSetup.pgdata)) { log_fatal("Failed to set configuration filename from PGDATA \"%s\"," " see above for details.", config->pgSetup.pgdata); return false; } if (!SetStateFilePath(&(config->pathnames), config->pgSetup.pgdata)) { log_fatal("Failed to set state filename from PGDATA \"%s\"," " see above for details.", config->pgSetup.pgdata); return false; } if (!SetPidFilePath(&(config->pathnames), config->pgSetup.pgdata)) { log_fatal("Failed to set pid filename from PGDATA \"%s\"," " see above for details.", config->pgSetup.pgdata); return false; } return true; } /* * monitor_config_init initializes a MonitorConfig with the default values. */ void monitor_config_init(MonitorConfig *config, bool missing_pgdata_is_ok, bool pg_is_not_running_is_ok) { PostgresSetup pgSetup = { 0 }; IniOption monitorOptions[] = SET_INI_OPTIONS_ARRAY(config); if (!ini_validate_options(monitorOptions)) { log_error("Please review your setup options per above messages"); exit(EXIT_CODE_BAD_CONFIG); } if (!pg_setup_init(&pgSetup, &(config->pgSetup), missing_pgdata_is_ok, pg_is_not_running_is_ok)) { log_error("Please fix your PostgreSQL setup per above messages"); exit(EXIT_CODE_BAD_CONFIG); } /* * Keep the whole set of values discovered in pg_setup_init from the * configuration file */ config->pgSetup = pgSetup; /* A part of the monitor's pgSetup is hard-coded. 
*/ strlcpy(config->pgSetup.dbname, PG_AUTOCTL_MONITOR_DBNAME, NAMEDATALEN); strlcpy(config->pgSetup.username, PG_AUTOCTL_MONITOR_USERNAME, NAMEDATALEN); if (config->pgSetup.hbaLevel == HBA_EDIT_UNKNOWN) { strlcpy(config->pgSetup.hbaLevelStr, "app", NAMEDATALEN); config->pgSetup.hbaLevel = HBA_EDIT_LAN; } } /* * monitor_config_init initializes a MonitorConfig from a KeeperConfig * structure. That's useful for commands that may run on either a monitor or a * keeper node, such as `pg_autoctl monitor state|events|formation`, or * `pg_autoctl do destroy`. */ bool monitor_config_init_from_pgsetup(MonitorConfig *mconfig, PostgresSetup *pgSetup, bool missingPgdataIsOk, bool pgIsNotRunningIsOk) { PostgresSetup *MpgSetup = &(mconfig->pgSetup); /* copy command line options over to the MonitorConfig structure */ *MpgSetup = *pgSetup; if (!monitor_config_set_pathnames_from_pgdata(mconfig)) { /* errors have already been logged */ return false; } if (!monitor_config_read_file(mconfig, missingPgdataIsOk, pgIsNotRunningIsOk)) { log_fatal("Failed to read configuration file \"%s\"", mconfig->pathnames.config); return false; } return true; } /* * monitor_config_read_file overrides values in given MonitorConfig with * whatever values are read from given configuration filename. */ bool monitor_config_read_file(MonitorConfig *config, bool missing_pgdata_is_ok, bool pg_not_running_is_ok) { const char *filename = config->pathnames.config; PostgresSetup pgSetup = { 0 }; IniOption monitorOptions[] = SET_INI_OPTIONS_ARRAY(config); log_debug("Reading configuration from %s", filename); if (!read_ini_file(filename, monitorOptions)) { log_error("Failed to parse configuration file \"%s\"", filename); return false; } /* * We have changed the --nodename option to being named --hostname, and * same in the configuration file: pg_autoctl.nodename is now * pg_autoctl.hostname. 
* * We can read either names from the configuration file and will then write * the current option name (pg_autoctl.hostname), but we can't have either * one be required anymore. * * Implement the "require" property here by making sure one of those names * have been used to populate the monitor config structure. */ if (IS_EMPTY_STRING_BUFFER(config->hostname)) { log_error("Failed to read either pg_autoctl.hostname or its older " "name pg_autoctl.nodename from the \"%s\" configuration file", filename); return false; } if (!pg_setup_init(&pgSetup, &config->pgSetup, missing_pgdata_is_ok, pg_not_running_is_ok)) { return false; } /* * Keep the whole set of values discovered in pg_setup_init from the * configuration file */ config->pgSetup = pgSetup; /* A part of the monitor's pgSetup is hard-coded. */ strlcpy(config->pgSetup.dbname, PG_AUTOCTL_MONITOR_DBNAME, NAMEDATALEN); strlcpy(config->pgSetup.username, PG_AUTOCTL_MONITOR_USERNAME, NAMEDATALEN); /* * Required for grandfathering old clusters that don't have sslmode * explicitely set */ if (IS_EMPTY_STRING_BUFFER(config->pgSetup.ssl.sslModeStr)) { strlcpy(config->pgSetup.ssl.sslModeStr, "prefer", SSL_MODE_STRLEN); } /* set the ENUM value for sslMode */ config->pgSetup.ssl.sslMode = pgsetup_parse_sslmode(config->pgSetup.ssl.sslModeStr); return true; } /* * monitor_config_write_file writes the current values in given KeeperConfig to * filename. */ bool monitor_config_write_file(MonitorConfig *config) { const char *filePath = config->pathnames.config; log_trace("monitor_config_write_file \"%s\"", filePath); FILE *fileStream = fopen_with_umask(filePath, "w", FOPEN_FLAGS_W, 0644); if (fileStream == NULL) { /* errors have already been logged */ return false; } bool success = monitor_config_write(fileStream, config); if (fclose(fileStream) == EOF) { log_error("Failed to write file \"%s\"", filePath); return false; } return success; } /* * monitor_config_write write the current config to given STREAM. 
*/ bool monitor_config_write(FILE *stream, MonitorConfig *config) { IniOption monitorOptions[] = SET_INI_OPTIONS_ARRAY(config); return write_ini_to_stream(stream, monitorOptions); } /* * monitor_config_to_json populates given jsRoot object with the INI * configuration sections as JSON objects, and the options as keys to those * objects. */ bool monitor_config_to_json(MonitorConfig *config, JSON_Value *js) { JSON_Object *jsRoot = json_value_get_object(js); IniOption monitorOptions[] = SET_INI_OPTIONS_ARRAY(config); return ini_to_json(jsRoot, monitorOptions); } /* * monitor_config_log_settings outputs a DEBUG line per each config parameter * in the given MonitorConfig. */ void monitor_config_log_settings(MonitorConfig config) { log_debug("postgresql.pgdata: %s", config.pgSetup.pgdata); log_debug("postgresql.pg_ctl: %s", config.pgSetup.pg_ctl); log_debug("postgresql.version: %s", config.pgSetup.pg_version); log_debug("postgresql.username: %s", config.pgSetup.username); log_debug("postgresql.dbname: %s", config.pgSetup.dbname); log_debug("postgresql.host: %s", config.pgSetup.pghost); log_debug("postgresql.port: %d", config.pgSetup.pgport); log_debug("postgresql.auth: %s", config.pgSetup.authMethod); log_debug("ssl.active: %d", config.pgSetup.ssl.active); log_debug("ssl.sslMode: %s", config.pgSetup.ssl.sslModeStr); log_debug("ssl.caFile: %s", config.pgSetup.ssl.caFile); log_debug("ssl.crlFile: %s", config.pgSetup.ssl.crlFile); log_debug("ssl.serverKey: %s", config.pgSetup.ssl.serverCert); log_debug("ssl.serverCert: %s", config.pgSetup.ssl.serverKey); } /* * monitor_config_merge_options merges any option setup in options into config. * Its main use is to override configuration file settings with command line * options. 
*/ bool monitor_config_merge_options(MonitorConfig *config, MonitorConfig *options) { IniOption monitorConfigOptions[] = SET_INI_OPTIONS_ARRAY(config); IniOption monitorOptionsOptions[] = SET_INI_OPTIONS_ARRAY(options); if (ini_merge(monitorConfigOptions, monitorOptionsOptions)) { PostgresSetup pgSetup = { 0 }; bool missing_pgdata_is_ok = true; bool pg_is_not_running_is_ok = true; /* * Before merging given options, validate them as much as we can. The * ini level functions validate the syntax (strings, integers, etc), * not that the values themselves then make sense. */ if (!pg_setup_init(&pgSetup, &config->pgSetup, missing_pgdata_is_ok, pg_is_not_running_is_ok)) { return false; } /* * Keep the whole set of values discovered in pg_setup_init from the * configuration file */ config->pgSetup = pgSetup; return monitor_config_write_file(config); } return false; } /* * monitor_config_get_postgres_uri build a connecting string to connect * to the monitor server from a remote machine and writes it to connectionString, * with at most size number of chars. */ bool monitor_config_get_postgres_uri(MonitorConfig *config, char *connectionString, size_t size) { char *connStringEnd = connectionString; char host[BUFSIZE]; if (!IS_EMPTY_STRING_BUFFER(config->hostname)) { strlcpy(host, config->hostname, BUFSIZE); } else if (IS_EMPTY_STRING_BUFFER(config->pgSetup.listen_addresses) || strcmp(config->pgSetup.listen_addresses, POSTGRES_DEFAULT_LISTEN_ADDRESSES) == 0) { /* * We ouput the monitor connection string using the LAN ip of the * current machine (e.g. 192.168.1.1), which is the most probable IP * address that the other members of the pg_auto_failover cluster will * have to use to register and communicate with the monitor. * * The monitor_install() function also has added an HBA entry to this * PostgreSQL server to open it up to the local area network, e.g. * 129.168.1.0/23, so it should just work here. 
*/ bool mayRetry = false; if (!fetchLocalIPAddress(host, BUFSIZE, DEFAULT_INTERFACE_LOOKUP_SERVICE_NAME, DEFAULT_INTERFACE_LOOKUP_SERVICE_PORT, LOG_WARN, &mayRetry)) { /* error is already logged */ return false; } } else { strlcpy(host, config->pgSetup.listen_addresses, BUFSIZE); } /* * Finalize the connection string, with some variants depending on the * usage of SSL certificates. The full variant is with sslrootcert and * sslcrl connection parameters when using sslmode=verify-ca or * sslmode=verify-full. */ connStringEnd += sformat(connStringEnd, size - (connStringEnd - connectionString), "postgres://%s@%s:%d/%s", config->pgSetup.username, host, config->pgSetup.pgport, config->pgSetup.dbname); if (config->pgSetup.ssl.sslMode >= SSL_MODE_PREFER) { char *sslmode = pgsetup_sslmode_to_string(config->pgSetup.ssl.sslMode); connStringEnd += sformat(connStringEnd, size - (connStringEnd - connectionString), "?sslmode=%s", sslmode); if (config->pgSetup.ssl.sslMode >= SSL_MODE_VERIFY_CA) { if (IS_EMPTY_STRING_BUFFER(config->pgSetup.ssl.crlFile)) { connStringEnd += sformat(connStringEnd, size - (connStringEnd - connectionString), "&sslrootcert=%s", config->pgSetup.ssl.caFile); } else { connStringEnd += sformat(connStringEnd, size - (connStringEnd - connectionString), "&sslrootcert=%s&sslcrl=%s", config->pgSetup.ssl.caFile, config->pgSetup.ssl.crlFile); } } } return true; } /* * monitor_config_get_setting returns the current value of the given option * "path" (thats a section.option string). The value is returned in the * pre-allocated value buffer of size size. */ bool monitor_config_get_setting(MonitorConfig *config, const char *path, char *value, size_t size) { const char *filename = config->pathnames.config; IniOption monitorOptions[] = SET_INI_OPTIONS_ARRAY(config); return ini_get_setting(filename, monitorOptions, path, value, size); } /* * monitor_config_set_setting sets the setting identified by "path" * (section.option) to the given value. 
The value is passed in as a string, * which is going to be parsed if necessary. */ bool monitor_config_set_setting(MonitorConfig *config, const char *path, char *value) { const char *filename = config->pathnames.config; IniOption monitorOptions[] = SET_INI_OPTIONS_ARRAY(config); if (ini_set_setting(filename, monitorOptions, path, value)) { PostgresSetup pgSetup = { 0 }; bool missing_pgdata_is_ok = true; bool pg_is_not_running_is_ok = true; /* * Before merging given options, validate them as much as we can. The * ini level functions validate the syntax (strings, integers, etc), * not that the values themselves then make sense. */ return pg_setup_init(&pgSetup, &(config->pgSetup), missing_pgdata_is_ok, pg_is_not_running_is_ok); } return false; } /* * monitor_config_update_with_absolute_pgdata verifies that the pgdata path * is an absolute one * If not, the config->pgSetup is updated and we rewrite the monitor config file */ bool monitor_config_update_with_absolute_pgdata(MonitorConfig *config) { PostgresSetup *pgSetup = &(config->pgSetup); if (pg_setup_set_absolute_pgdata(pgSetup)) { if (!monitor_config_write_file(config)) { /* errors have already been logged */ return false; } } return true; } /* * monitor_config_accept_new returns true when we can accept to RELOAD our * current config into the new one that's been editing. 
*/ bool monitor_config_accept_new(MonitorConfig *config, MonitorConfig *newConfig) { /* some elements are not supposed to change on a reload */ if (strneq(newConfig->pgSetup.pgdata, config->pgSetup.pgdata)) { log_error("Attempt to change postgresql.pgdata from \"%s\" to \"%s\"", config->pgSetup.pgdata, newConfig->pgSetup.pgdata); return false; } /* changing the hostname online is supported */ if (strneq(newConfig->hostname, config->hostname)) { log_info("Reloading configuration: hostname is now \"%s\"; " "used to be \"%s\"", newConfig->hostname, config->hostname); strlcpy(config->hostname, newConfig->hostname, _POSIX_HOST_NAME_MAX); } /* we can change any SSL related setup options at runtime */ return config_accept_new_ssloptions(&(config->pgSetup), &(newConfig->pgSetup)); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/monitor_config.h000066400000000000000000000037261414244367200237520ustar00rootroot00000000000000/* * src/bin/pg_autoctl/monitor_config.h * Monitor configuration data structure and function definitions * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #ifndef MONITOR_CONFIG_H #define MONITOR_CONFIG_H #include #include #include "config.h" #include "pgctl.h" #include "parson.h" #include "pgsql.h" typedef struct MonitorConfig { /* in-memory configuration related variables */ ConfigFilePaths pathnames; /* pg_autoctl setup */ char hostname[_POSIX_HOST_NAME_MAX]; /* PostgreSQL setup */ char role[NAMEDATALEN]; /* PostgreSQL setup */ PostgresSetup pgSetup; } MonitorConfig; bool monitor_config_set_pathnames_from_pgdata(MonitorConfig *config); void monitor_config_init(MonitorConfig *config, bool missing_pgdata_is_ok, bool pg_is_not_running_is_ok); bool monitor_config_init_from_pgsetup(MonitorConfig *mconfig, PostgresSetup *pgSetup, bool missingPgdataIsOk, bool pgIsNotRunningIsOk); bool monitor_config_read_file(MonitorConfig *config, bool missing_pgdata_is_ok, bool pg_not_running_is_ok); bool monitor_config_write_file(MonitorConfig *config); bool monitor_config_write(FILE *stream, MonitorConfig *config); bool monitor_config_to_json(MonitorConfig *config, JSON_Value *js); void monitor_config_log_settings(MonitorConfig config); bool monitor_config_merge_options(MonitorConfig *config, MonitorConfig *options); bool monitor_config_get_postgres_uri(MonitorConfig *config, char *connectionString, size_t size); bool monitor_config_get_setting(MonitorConfig *config, const char *path, char *value, size_t size); bool monitor_config_set_setting(MonitorConfig *config, const char *path, char *value); bool monitor_config_update_with_absolute_pgdata(MonitorConfig *config); bool monitor_config_accept_new(MonitorConfig *config, MonitorConfig *newConfig); #endif /* MONITOR_CONFIG_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/monitor_pg_init.c000066400000000000000000000230401414244367200241200ustar00rootroot00000000000000/* * src/bin/pg_autoctl/monitor_pg_init.c * Monitor initialisation. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include "postgres_fe.h" #include "cli_common.h" #include "debian.h" #include "defaults.h" #include "ipaddr.h" #include "log.h" #include "monitor.h" #include "monitor_config.h" #include "monitor_pg_init.h" #include "pgctl.h" #include "pghba.h" #include "pgsetup.h" #include "pgsql.h" #include "pidfile.h" #include "primary_standby.h" #include "service_monitor.h" #include "service_monitor_init.h" #include "service_postgres.h" #include "signals.h" /* * Default settings for PostgreSQL instance when running the pg_auto_failover * monitor. */ GUC monitor_default_settings[] = { { "shared_preload_libraries", "'pgautofailover'" }, { "cluster_name", "'pg_auto_failover monitor'" }, { "listen_addresses", "'*'" }, { "port", "5432" }, { "log_destination", "stderr" }, { "logging_collector", "on" }, { "log_directory", "log" }, { "log_min_messages", "info" }, { "log_connections", "off" }, { "log_disconnections", "off" }, { "log_lock_waits", "on" }, { "log_statement", "ddl" }, { "password_encryption", "md5" }, { "ssl", "off" }, { "ssl_ca_file", "" }, { "ssl_crl_file", "" }, { "ssl_cert_file", "" }, { "ssl_key_file", "" }, { "ssl_ciphers", "'" DEFAULT_SSL_CIPHERS "'" }, #ifdef TEST { "unix_socket_directories", "''" }, #endif { NULL, NULL } }; static bool check_monitor_settings(PostgresSetup pgSetup); /* * monitor_pg_init initializes a pg_auto_failover monitor PostgreSQL cluster * from scratch using `pg_ctl initdb`. 
*/ bool monitor_pg_init(Monitor *monitor) { MonitorConfig *config = &(monitor->config); PostgresSetup *pgSetup = &(config->pgSetup); if (pg_setup_pgdata_exists(pgSetup)) { PostgresSetup existingPgSetup = { 0 }; bool missing_pgdata_is_ok = true; bool pg_is_not_running_is_ok = true; if (!pg_setup_init(&existingPgSetup, pgSetup, missing_pgdata_is_ok, pg_is_not_running_is_ok)) { log_fatal("Failed to initialize a monitor node, " "see above for details"); return false; } if (pg_setup_is_running(&existingPgSetup)) { log_error("Installing pg_auto_failover monitor in existing " "PostgreSQL instance at \"%s\" running on port %d " "is not supported.", pgSetup->pgdata, existingPgSetup.pidFile.port); return false; } /* if we have a debian cluster, re-own the configuration files */ if (!keeper_ensure_pg_configuration_files_in_pgdata(&existingPgSetup)) { log_fatal("Failed to setup your Postgres instance " "the PostgreSQL way, see above for details"); return false; } } else { if (!pg_ctl_initdb(pgSetup->pg_ctl, pgSetup->pgdata)) { log_fatal("Failed to initialize a PostgreSQL instance at \"%s\", " "see above for details", pgSetup->pgdata); return false; } } if (!monitor_add_postgres_default_settings(monitor)) { log_fatal("Failed to initialize our Postgres settings, " "see above for details"); return false; } return true; } /* * Install pg_auto_failover monitor in some existing PostgreSQL instance: * * - add postgresql-auto-failover.conf to postgresql.conf * - create user autoctl with createdb login; * - create database pg_auto_failover with owner autoctl; * - create extension pgautofailover; */ bool monitor_install(const char *hostname, PostgresSetup pgSetupOption, bool checkSettings) { PostgresSetup pgSetup = { 0 }; bool missingPgdataIsOk = false; bool pgIsNotRunningIsOk = true; LocalPostgresServer postgres = { 0 }; char connInfo[MAXCONNINFO]; /* We didn't create our target username/dbname yet */ strlcpy(pgSetupOption.username, "", NAMEDATALEN); strlcpy(pgSetupOption.dbname, "", 
NAMEDATALEN); /* * We might have just started a PostgreSQL instance, so we want to recheck * the PostgreSQL setup. */ if (!pg_setup_init(&pgSetup, &pgSetupOption, missingPgdataIsOk, pgIsNotRunningIsOk)) { log_fatal("Failed to initialize a monitor node, see above for details"); exit(EXIT_CODE_PGCTL); } (void) local_postgres_init(&postgres, &pgSetup); if (!ensure_postgres_service_is_running(&postgres)) { log_error("Failed to install pg_auto_failover in the monitor's " "Postgres database, see above for details"); return false; } if (!pgsql_create_user(&postgres.sqlClient, PG_AUTOCTL_MONITOR_DBOWNER, /* password, login, superuser, replication, connlimit */ NULL, true, false, false, -1)) { log_error("Failed to create user \"%s\" on local postgres server", PG_AUTOCTL_MONITOR_DBOWNER); return false; } if (!pgsql_create_database(&postgres.sqlClient, PG_AUTOCTL_MONITOR_DBNAME, PG_AUTOCTL_MONITOR_DBOWNER)) { log_error("Failed to create database %s with owner %s", PG_AUTOCTL_MONITOR_DBNAME, PG_AUTOCTL_MONITOR_DBOWNER); return false; } /* now, connect to the newly created database to create our extension */ strlcpy(pgSetup.dbname, PG_AUTOCTL_MONITOR_DBNAME, NAMEDATALEN); pg_setup_get_local_connection_string(&pgSetup, connInfo); pgsql_init(&postgres.sqlClient, connInfo, PGSQL_CONN_LOCAL); /* * Ensure our extension "pgautofailvover" is available in the server * extension dir used to create the Postgres instance. We only search for * the control file to offer better diagnostics in the logs in case the * following CREATE EXTENSION fails. 
*/ if (!find_extension_control_file(pgSetup.pg_ctl, PG_AUTOCTL_MONITOR_EXTENSION_NAME)) { log_warn("Failed to find extension control file for \"%s\"", PG_AUTOCTL_MONITOR_EXTENSION_NAME); } if (!pgsql_create_extension(&postgres.sqlClient, PG_AUTOCTL_MONITOR_EXTENSION_NAME)) { log_error("Failed to create extension %s", PG_AUTOCTL_MONITOR_EXTENSION_NAME); return false; } /* * When installing the monitor on-top of an already running PostgreSQL, we * want to check that our settings have been applied already, and warn the * user to restart their instance otherwise. */ if (checkSettings) { if (!check_monitor_settings(pgSetup)) { /* that's highly unexpected */ log_fatal("Failed to check pg_auto_failover monitor settings"); return false; } } /* * Now make sure we allow nodes on the same network to connect to * pg_auto_failover database. */ if (!pghba_enable_lan_cidr(&postgres.sqlClient, pgSetup.ssl.active, HBA_DATABASE_DBNAME, PG_AUTOCTL_MONITOR_DBNAME, hostname, PG_AUTOCTL_MONITOR_USERNAME, pg_setup_get_auth_method(&pgSetup), pgSetup.hbaLevel, NULL)) { log_warn("Failed to grant connection to local network."); return false; } log_info("Your pg_auto_failover monitor instance is now ready on port %d.", pgSetup.pgport); return true; } /* * check_monitor_settings returns true if the pgautofailover extension is * already part of the shared_preload_libraries GUC. 
*/ static bool check_monitor_settings(PostgresSetup pgSetup) { LocalPostgresServer postgres = { 0 }; char connInfo[MAXCONNINFO]; bool settingsAreOk = false; pg_setup_get_local_connection_string(&pgSetup, connInfo); pgsql_init(&postgres.sqlClient, connInfo, PGSQL_CONN_LOCAL); if (!pgsql_check_monitor_settings(&(postgres.sqlClient), &settingsAreOk)) { /* errors have already been logged */ return false; } if (settingsAreOk) { log_info("PostgreSQL shared_preload_libraries already includes \"%s\"", PG_AUTOCTL_MONITOR_EXTENSION_NAME); } else { log_warn("PostgreSQL shared_preload_libraries doesn't include \"%s\"", PG_AUTOCTL_MONITOR_EXTENSION_NAME); log_fatal("Current PostgreSQL settings are not compliant " "with pg_auto_failover monitor requirements, please restart " "PostgreSQL at the next opportunity to enable " "pg_auto_failover monitor changes"); } return settingsAreOk; } /* * monitor_add_postgres_default_settings adds the monitor Postgres setup. */ bool monitor_add_postgres_default_settings(Monitor *monitor) { MonitorConfig *config = &(monitor->config); PostgresSetup *pgSetup = &(config->pgSetup); char configFilePath[MAXPGPATH] = { 0 }; /* * We managed to initdb, refresh our configuration file location with * the realpath(3): we might have been given a relative pathname. */ if (!monitor_config_update_with_absolute_pgdata(config)) { /* errors have already been logged */ return false; } /* * We just did the initdb ourselves, so we know where the configuration * file is to be found Also, we didn't start PostgreSQL yet. */ join_path_components(configFilePath, pgSetup->pgdata, "postgresql.conf"); /* * When --ssl-self-signed has been used, now is the time to build a * self-signed certificate for the server. 
We place the certificate and * private key in $PGDATA/server.key and $PGDATA/server.crt */ if (pgSetup->ssl.createSelfSignedCert) { if (!pg_create_self_signed_cert(&(config->pgSetup), config->hostname)) { log_error("Failed to create SSL self-signed certificate, " "see above for details"); return false; } /* update our configuration with ssl server.{key,cert} */ if (!monitor_config_write_file(config)) { /* errors have already been logged */ return false; } } if (!pg_add_auto_failover_default_settings(pgSetup, config->hostname, configFilePath, monitor_default_settings)) { log_error("Failed to add default settings to \"%s\": couldn't " "write the new postgresql.conf, see above for details", configFilePath); return false; } return true; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/monitor_pg_init.h000066400000000000000000000011161414244367200241250ustar00rootroot00000000000000/* * src/bin/pg_autoctl/monitor_pg_init.h * Monitor configuration data structure and function definitions * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef MONITOR_PG_INIT_H #define MONITOR_PG_INIT_H #include #include "monitor.h" #include "monitor_config.h" bool monitor_pg_init(Monitor *monitor); bool monitor_install(const char *hostname, PostgresSetup pgSetupOption, bool checkSettings); bool monitor_add_postgres_default_settings(Monitor *monitor); #endif /* MONITOR_PG_INIT_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/nodestate_utils.c000066400000000000000000000354631414244367200241420ustar00rootroot00000000000000/* * src/bin/pg_autoctl/nodestate_utils.c * Functions for printing node states. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include "file_utils.h" #include "log.h" #include "nodestate_utils.h" #include "string_utils.h" /* * nodestatePrepareHeaders computes the maximum length needed for variable * length columns and prepare the separation strings, filling them with the * right amount of dashes. */ void nodestatePrepareHeaders(CurrentNodeStateArray *nodesArray, PgInstanceKind nodeKind) { int index = 0; nodesArray->headers.nodeKind = nodeKind; nodesArray->headers.maxNameSize = 4; /* "Name" */ nodesArray->headers.maxHostSize = 10; /* "Host:Port" */ nodesArray->headers.maxNodeSize = 5; /* "Node" */ nodesArray->headers.maxLSNSize = 9; /* "TLI: LSN" */ nodesArray->headers.maxStateSize = MAX_NODE_STATE_LEN; nodesArray->headers.maxHealthSize = strlen("read-write *"); /* * Dynamically adjust our display output to the length of the longer * hostname in the result set */ for (index = 0; index < nodesArray->count; index++) { CurrentNodeState *nodeState = &(nodesArray->nodes[index]); (void) nodestateAdjustHeaders(&(nodesArray->headers), &(nodeState->node), nodeState->groupId); } /* prepare a nice dynamic string of '-' as a header separator */ (void) prepareHeaderSeparators(&(nodesArray->headers)); } /* * nodestatePrepareHeaders computes the maximum length needed for variable * length columns and prepare the separation strings, filling them with the * right amount of dashes. */ void nodeAddressArrayPrepareHeaders(NodeAddressHeaders *headers, NodeAddressArray *nodesArray, int groupId, PgInstanceKind nodeKind) { int index = 0; headers->nodeKind = nodeKind; /* * Dynamically adjust our display output to the length of the longer * hostname in the result set */ for (index = 0; index < nodesArray->count; index++) { NodeAddress *node = &(nodesArray->nodes[index]); (void) nodestateAdjustHeaders(headers, node, groupId); } /* prepare a nice dynamic string of '-' as a header separator */ (void) prepareHeaderSeparators(headers); } /* * prepareHeaderSeparators prepares all the separator strings. 
headers sizes * must have been pre-computed. */ void prepareHeaderSeparators(NodeAddressHeaders *headers) { (void) prepareHostNameSeparator(headers->nameSeparatorHeader, headers->maxNameSize); (void) prepareHostNameSeparator(headers->hostSeparatorHeader, headers->maxHostSize); (void) prepareHostNameSeparator(headers->nodeSeparatorHeader, headers->maxNodeSize); (void) prepareHostNameSeparator(headers->lsnSeparatorHeader, headers->maxLSNSize); (void) prepareHostNameSeparator(headers->stateSeparatorHeader, headers->maxStateSize); (void) prepareHostNameSeparator(headers->healthSeparatorHeader, headers->maxHealthSize); } /* * re-compute headers properties from current properties and the new node * characteristics. */ void nodestateAdjustHeaders(NodeAddressHeaders *headers, NodeAddress *node, int groupId) { char hostport[BUFSIZE] = { 0 }; char composedId[BUFSIZE] = { 0 }; char tliLSN[BUFSIZE] = { 0 }; (void) nodestatePrepareNode(headers, node, groupId, hostport, composedId, tliLSN); int nameLen = strlen(node->name); int hostLen = strlen(hostport); int nodeLen = strlen(composedId); int lsnLen = strlen(tliLSN); /* * In order to have a static nice table output even when using * auto-refreshing commands such as `watch(1)` when states are changing, we * always use the max known state length. 
*/ headers->maxStateSize = MAX_NODE_STATE_LEN; /* initialize to mininum values, if needed */ if (headers->maxNameSize == 0) { /* Name */ headers->maxNameSize = strlen("Name"); } if (headers->maxHostSize == 0) { /* Host:Port */ headers->maxHostSize = strlen("Host:Port"); } if (headers->maxNodeSize == 0) { /* groupId/nodeId */ headers->maxNodeSize = 5; } if (headers->maxLSNSize == 0) { /* Unknown LSN is going to be " 1: 0/0" */ headers->maxLSNSize = 9; } if (headers->maxHealthSize == 0) { /* * Connection is one of "read-only", "read-write", or "unknown", * followed by a mark for the health check (*, !, or ?), so we need as * much space as the full sample "read-write *": */ headers->maxHealthSize = strlen("read-write *"); } if (nameLen > headers->maxNameSize) { headers->maxNameSize = nameLen; } if (hostLen > headers->maxHostSize) { headers->maxHostSize = hostLen; } if (nodeLen > headers->maxNodeSize) { headers->maxNodeSize = nodeLen; } if (lsnLen > headers->maxLSNSize) { headers->maxLSNSize = lsnLen; } } /* * nodestatePrintHeader prints the given CurrentNodeStateArray header. 
*/ void nodestatePrintHeader(NodeAddressHeaders *headers) { fformat(stdout, "%*s | %*s | %*s | %*s | %*s | %*s | %*s\n", headers->maxNameSize, "Name", headers->maxNodeSize, "Node", headers->maxHostSize, "Host:Port", headers->maxLSNSize, "TLI: LSN", headers->maxHealthSize, "Connection", headers->maxStateSize, "Reported State", headers->maxStateSize, "Assigned State"); fformat(stdout, "%*s-+-%*s-+-%*s-+-%*s-+-%*s-+-%*s-+-%*s\n", headers->maxNameSize, headers->nameSeparatorHeader, headers->maxNodeSize, headers->nodeSeparatorHeader, headers->maxHostSize, headers->hostSeparatorHeader, headers->maxLSNSize, headers->lsnSeparatorHeader, headers->maxHealthSize, headers->healthSeparatorHeader, headers->maxStateSize, headers->stateSeparatorHeader, headers->maxStateSize, headers->stateSeparatorHeader); } /* * nodestatePrintNodeState prints the node at the given position in the given * nodesArray, using the nodesArray pre-computed sizes for the dynamic columns. */ void nodestatePrintNodeState(NodeAddressHeaders *headers, CurrentNodeState *nodeState) { char hostport[BUFSIZE] = { 0 }; char composedId[BUFSIZE] = { 0 }; char tliLSN[BUFSIZE] = { 0 }; char connection[BUFSIZE] = { 0 }; char healthChar = nodestateHealthToChar(nodeState->health); (void) nodestatePrepareNode(headers, &(nodeState->node), nodeState->groupId, hostport, composedId, tliLSN); if (healthChar == ' ') { sformat(connection, BUFSIZE, "%s", nodestateConnectionType(nodeState)); } else { sformat(connection, BUFSIZE, "%s %c", nodestateConnectionType(nodeState), healthChar); } fformat(stdout, "%*s | %*s | %*s | %*s | %*s | %*s | %*s\n", headers->maxNameSize, nodeState->node.name, headers->maxNodeSize, composedId, headers->maxHostSize, hostport, headers->maxLSNSize, tliLSN, headers->maxHealthSize, connection, headers->maxStateSize, NodeStateToString(nodeState->reportedState), headers->maxStateSize, NodeStateToString(nodeState->goalState)); } /* * nodestatePrepareNode prepares the "host:port" and the "Node" computed * 
columns used to display a node. The hostport and composedId parameters must * be pre-allocated string buffers. */ void nodestatePrepareNode(NodeAddressHeaders *headers, NodeAddress *node, int groupId, char *hostport, char *composedId, char *tliLSN) { sformat(hostport, BUFSIZE, "%s:%d", node->host, node->port); sformat(tliLSN, BUFSIZE, "%3d: %s", node->tli, node->lsn); switch (headers->nodeKind) { case NODE_KIND_STANDALONE: { sformat(composedId, BUFSIZE, "%" PRId64, node->nodeId); break; } default: { sformat(composedId, BUFSIZE, "%d/%" PRId64, groupId, node->nodeId); break; } } } /* * prepareHostNameSeparator fills in the pre-allocated given string with the * expected amount of dashes to use as a separator line in our tabular output. */ void prepareHostNameSeparator(char nameSeparatorHeader[], int size) { for (int i = 0; i <= size; i++) { if (i < size) { nameSeparatorHeader[i] = '-'; } else { nameSeparatorHeader[i] = '\0'; break; } } } /* * nodestateAsJSON populates a given JSON_Value with an JSON object that mimics * the output from SELECT * FROM pgautofailover.current_state() by taking the * information bits from the given nodeState. 
*/
bool
nodestateAsJSON(CurrentNodeState *nodeState, JSON_Value *js)
{
	JSON_Object *jsobj = json_value_get_object(js);

	/* same field names as SELECT * FROM pgautofailover.current_state() */
	json_object_set_number(jsobj, "node_id", (double) nodeState->node.nodeId);
	json_object_set_number(jsobj, "group_id", (double) nodeState->groupId);
	json_object_set_string(jsobj, "nodename", nodeState->node.name);
	json_object_set_string(jsobj, "nodehost", nodeState->node.host);
	json_object_set_number(jsobj, "nodeport", (double) nodeState->node.port);

	json_object_set_string(jsobj, "current_group_state",
						   NodeStateToString(nodeState->reportedState));

	json_object_set_string(jsobj, "assigned_group_state",
						   NodeStateToString(nodeState->goalState));

	json_object_set_number(jsobj, "timeline", (double) nodeState->node.tli);

	json_object_set_string(jsobj, "Minimum Recovery Ending LSN",
						   nodeState->node.lsn);

	json_object_set_string(jsobj, "reachable",
						   nodestateHealthToString(nodeState->health));

	json_object_set_string(jsobj, "conntype",
						   nodestateConnectionType(nodeState));

	return true;
}


/*
 * Transform the health column from a monitor into a string.
 */
char *
nodestateHealthToString(int health)
{
	switch (health)
	{
		case -1:
		{
			return "unknown";
		}

		case 0:
		{
			return "no";
		}

		case 1:
		{
			return "yes";
		}

		default:
		{
			log_error("BUG in nodestateHealthToString: health = %d", health);
			return "unknown";
		}
	}
}


/*
 * Transform the health column from a monitor into a single char.
 */
char
nodestateHealthToChar(int health)
{
	switch (health)
	{
		case -1:
		{
			return '?';
		}

		case 0:
		{
			return '!';
		}

		case 1:
		{
			return ' ';
		}

		default:
		{
			/* report the actual function name in the BUG diagnostic */
			log_error("BUG in nodestateHealthToChar: health = %d", health);
			return '-';
		}
	}
}


/*
 * nodestateConnectionType returns one of "read-write" or "read-only".
*/ char * nodestateConnectionType(CurrentNodeState *nodeState) { switch (nodeState->reportedState) { case SINGLE_STATE: case PRIMARY_STATE: case WAIT_PRIMARY_STATE: case JOIN_PRIMARY_STATE: case PREPARE_MAINTENANCE_STATE: case APPLY_SETTINGS_STATE: { return "read-write"; } case SECONDARY_STATE: case CATCHINGUP_STATE: case PREP_PROMOTION_STATE: case STOP_REPLICATION_STATE: case WAIT_MAINTENANCE_STATE: case FAST_FORWARD_STATE: case JOIN_SECONDARY_STATE: case REPORT_LSN_STATE: { return "read-only"; } /* in those states Postgres is known to be stopped/down */ case NO_STATE: case INIT_STATE: case DROPPED_STATE: case WAIT_STANDBY_STATE: case DEMOTED_STATE: case DEMOTE_TIMEOUT_STATE: case DRAINING_STATE: case MAINTENANCE_STATE: { return "none"; } case ANY_STATE: { return "unknown"; } /* default: is intentionally left out to have compiler check */ } return "unknown"; } /* * nodestate_log logs a CurrentNodeState, usually that comes from a * notification message we parse. */ void nodestate_log(CurrentNodeState *nodeState, int logLevel, int64_t nodeId) { if (nodeState->node.nodeId == nodeId) { log_level(logLevel, "New state for this node " "(node %" PRId64 ", \"%s\") (%s:%d): %s ➜ %s", nodeState->node.nodeId, nodeState->node.name, nodeState->node.host, nodeState->node.port, NodeStateToString(nodeState->reportedState), NodeStateToString(nodeState->goalState)); } else { log_level(logLevel, "New state for node %" PRId64 " \"%s\" (%s:%d): %s ➜ %s", nodeState->node.nodeId, nodeState->node.name, nodeState->node.host, nodeState->node.port, NodeStateToString(nodeState->reportedState), NodeStateToString(nodeState->goalState)); } } /* * printCurrentState loops over pgautofailover.current_state() results and prints * them, one per line. 
*/ void printNodeArray(NodeAddressArray *nodesArray) { NodeAddressHeaders headers = { 0 }; /* We diplsay nodes all from the same group and don't have their groupId */ (void) nodeAddressArrayPrepareHeaders(&headers, nodesArray, 0, NODE_KIND_STANDALONE); (void) printNodeHeader(&headers); for (int index = 0; index < nodesArray->count; index++) { NodeAddress *node = &(nodesArray->nodes[index]); printNodeEntry(&headers, node); } fformat(stdout, "\n"); } /* * printNodeHeader pretty prints a header for a node list. */ void printNodeHeader(NodeAddressHeaders *headers) { fformat(stdout, "%*s | %*s | %*s | %21s | %8s\n", headers->maxNameSize, "Name", headers->maxNodeSize, "Node", headers->maxHostSize, "Host:Port", "TLI: LSN", "Primary?"); fformat(stdout, "%*s-+-%*s-+-%*s-+-%21s-+-%8s\n", headers->maxNameSize, headers->nameSeparatorHeader, headers->maxNodeSize, headers->nodeSeparatorHeader, headers->maxHostSize, headers->hostSeparatorHeader, "------------------", "--------"); } /* * printNodeEntry pretty prints a node. */ void printNodeEntry(NodeAddressHeaders *headers, NodeAddress *node) { char hostport[BUFSIZE] = { 0 }; char composedId[BUFSIZE] = { 0 }; char tliLSN[BUFSIZE] = { 0 }; (void) nodestatePrepareNode(headers, node, 0, hostport, composedId, tliLSN); fformat(stdout, "%*s | %*s | %*s | %21s | %8s\n", headers->maxNameSize, node->name, headers->maxNodeSize, composedId, headers->maxHostSize, hostport, tliLSN, node->isPrimary ? "yes" : "no"); } /* * nodestateFilterArrayGroup filters the given nodesArray to only the nodes * that are in the same group as the given node name. 
*/ bool nodestateFilterArrayGroup(CurrentNodeStateArray *nodesArray, const char *name) { int groupId = -1; CurrentNodeStateArray nodesInSameGroup = { 0 }; /* first, find the groupId of the target node name */ for (int index = 0; index < nodesArray->count; index++) { CurrentNodeState *nodeState = &(nodesArray->nodes[index]); if (strcmp(nodeState->node.name, name) == 0) { groupId = nodeState->groupId; break; } } /* return false when the node name was not found */ if (groupId == -1) { /* turn the given nodesArray into a all-zero empty array */ memset(nodesArray, 0, sizeof(CurrentNodeStateArray)); return false; } /* now, build a new nodesArray with only the nodes in the same group */ for (int index = 0; index < nodesArray->count; index++) { CurrentNodeState *nodeState = &(nodesArray->nodes[index]); if (nodeState->groupId == groupId) { nodesInSameGroup.nodes[nodesInSameGroup.count] = *nodeState; ++nodesInSameGroup.count; } } /* * Finally, override the nodesArray parameter with the new contents. Note * that we want to preserve the headers. */ NodeAddressHeaders headers = nodesArray->headers; *nodesArray = nodesInSameGroup; nodesArray->headers = headers; return true; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/nodestate_utils.h000066400000000000000000000055611414244367200241430ustar00rootroot00000000000000/* * src/bin/pg_autoctl/nodestate_utils.h * Functions for printing node states. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef NODESTATE_H #define NODESTATE_H #include #include "pgsql.h" /* * CurrentNodeState gathers information we retrieve through the monitor * pgautofailover.current_state API, and that we can also form from other * pieces such as local configuration + local state, or monitor notifications. 
*/ typedef struct CurrentNodeState { NodeAddress node; char formation[NAMEDATALEN]; char citusClusterName[NAMEDATALEN]; int groupId; PgInstanceKind pgKind; NodeState reportedState; NodeState goalState; int candidatePriority; bool replicationQuorum; int health; double healthLag; double reportLag; } CurrentNodeState; /* * CurrentNodeStateHeaders caches the information we need to print a nice user * formatted table from an array of NodeAddress. */ typedef struct NodeAddressHeaders { PgInstanceKind nodeKind; int maxNameSize; int maxHostSize; int maxNodeSize; int maxLSNSize; int maxStateSize; int maxHealthSize; char nameSeparatorHeader[BUFSIZE]; char hostSeparatorHeader[BUFSIZE]; char nodeSeparatorHeader[BUFSIZE]; char lsnSeparatorHeader[BUFSIZE]; char stateSeparatorHeader[BUFSIZE]; char healthSeparatorHeader[BUFSIZE]; } NodeAddressHeaders; typedef struct CurrentNodeStateArray { int count; CurrentNodeState nodes[NODE_ARRAY_MAX_COUNT]; NodeAddressHeaders headers; } CurrentNodeStateArray; void nodestatePrepareHeaders(CurrentNodeStateArray *nodesArray, PgInstanceKind nodeKind); void nodeAddressArrayPrepareHeaders(NodeAddressHeaders *headers, NodeAddressArray *nodesArray, int groupId, PgInstanceKind nodeKind); void nodestateAdjustHeaders(NodeAddressHeaders *headers, NodeAddress *node, int groupId); void prepareHeaderSeparators(NodeAddressHeaders *headers); void nodestatePrintHeader(NodeAddressHeaders *headers); void nodestatePrintNodeState(NodeAddressHeaders *headers, CurrentNodeState *nodeState); void nodestatePrepareNode(NodeAddressHeaders *headers, NodeAddress *node, int groupId, char *hostport, char *composedId, char *tliLSN); void prepareHostNameSeparator(char nameSeparatorHeader[], int size); bool nodestateAsJSON(CurrentNodeState *nodeState, JSON_Value *js); char * nodestateHealthToString(int health); char nodestateHealthToChar(int health); char * nodestateConnectionType(CurrentNodeState *nodeState); void nodestate_log(CurrentNodeState *nodeState, int logLevel, 
int64_t nodeId); void printNodeArray(NodeAddressArray *nodesArray); void printNodeHeader(NodeAddressHeaders *headers); void printNodeEntry(NodeAddressHeaders *headers, NodeAddress *node); bool nodestateFilterArrayGroup(CurrentNodeStateArray *nodesArray, const char *name); #endif /* NODESTATE_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/parsing.c000066400000000000000000000652161414244367200223760ustar00rootroot00000000000000/* * src/bin/pg_autoctl/parsing.c * API for parsing the output of some PostgreSQL server commands. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include #include #include "parson.h" #include "log.h" #include "nodestate_utils.h" #include "parsing.h" #include "string_utils.h" static bool parse_controldata_field_dbstate(const char *controlDataString, DBState *state); static bool parse_controldata_field_uint32(const char *controlDataString, const char *fieldName, uint32_t *dest); static bool parse_controldata_field_uint64(const char *controlDataString, const char *fieldName, uint64_t *dest); static bool parse_controldata_field_lsn(const char *controlDataString, const char *fieldName, char lsn[]); static bool parse_bool_with_len(const char *value, size_t len, bool *result); static int nodeAddressCmpByNodeId(const void *a, const void *b); #define RE_MATCH_COUNT 10 /* * Simple Regexp matching that returns the first matching element. */ char * regexp_first_match(const char *string, const char *regex) { regex_t compiledRegex; regmatch_t m[RE_MATCH_COUNT]; if (string == NULL) { return NULL; } int status = regcomp(&compiledRegex, regex, REG_EXTENDED | REG_NEWLINE); if (status != 0) { /* * regerror() returns how many bytes are actually needed to host the * error message, and truncates the error message when it doesn't fit * in given size. If the message has been truncated, then we add an * ellispis to our log entry. 
* * We could also dynamically allocate memory for the error message, but * the error might be "out of memory" already... */ char message[BUFSIZE]; size_t bytes = regerror(status, &compiledRegex, message, BUFSIZE); log_error("Failed to compile regex \"%s\": %s%s", regex, message, bytes < BUFSIZE ? "..." : ""); regfree(&compiledRegex); return NULL; } /* * regexec returns 0 if the regular expression matches; otherwise, it * returns a nonzero value. */ int matchStatus = regexec(&compiledRegex, string, RE_MATCH_COUNT, m, 0); regfree(&compiledRegex); /* We're interested into 1. re matches 2. captured at least one group */ if (matchStatus != 0 || m[0].rm_so == -1 || m[1].rm_so == -1) { return NULL; } else { regoff_t start = m[1].rm_so; regoff_t finish = m[1].rm_eo; int length = finish - start + 1; char *result = (char *) malloc(length * sizeof(char)); if (result == NULL) { log_error(ALLOCATION_FAILED_ERROR); return NULL; } strlcpy(result, string + start, length); return result; } return NULL; } /* * Parse the version number output from pg_ctl --version: * pg_ctl (PostgreSQL) 10.3 */ bool parse_version_number(const char *version_string, char *pg_version_string, size_t size, int *pg_version) { char *match = regexp_first_match(version_string, "([0-9.]+)"); if (match == NULL) { log_error("Failed to parse Postgres version number \"%s\"", version_string); return false; } /* first, copy the version number in our expected result string buffer */ strlcpy(pg_version_string, match, size); if (!parse_pg_version_string(pg_version_string, pg_version)) { /* errors have already been logged */ free(match); return false; } free(match); return true; } /* * parse_dotted_version_string parses a major.minor dotted version string such * as "12.6" into a single number in the same format as the pg_control_version, * such as 1206. 
*/ bool parse_dotted_version_string(const char *pg_version_string, int *pg_version) { /* now, parse the numbers into an integer, ala pg_control_version */ bool dotFound = false; char major[INTSTRING_MAX_DIGITS] = { 0 }; char minor[INTSTRING_MAX_DIGITS] = { 0 }; int majorIdx = 0; int minorIdx = 0; if (pg_version_string == NULL) { log_debug("BUG: parse_pg_version_string got NULL"); return false; } for (int i = 0; pg_version_string[i] != '\0'; i++) { if (pg_version_string[i] == '.') { if (dotFound) { log_error("Failed to parse Postgres version number \"%s\"", pg_version_string); return false; } dotFound = true; continue; } if (dotFound) { minor[minorIdx++] = pg_version_string[i]; } else { major[majorIdx++] = pg_version_string[i]; } } /* Postgres alpha/beta versions report version "14" instead of "14.0" */ if (!dotFound) { strlcpy(minor, "0", INTSTRING_MAX_DIGITS); } int maj = 0; int min = 0; if (!stringToInt(major, &maj) || !stringToInt(minor, &min)) { log_error("Failed to parse Postgres version number \"%s\"", pg_version_string); return false; } /* transform "12.6" into 1206, that is 12 * 100 + 6 */ *pg_version = (maj * 100) + min; return true; } /* * parse_pg_version_string parses a Postgres version string such as "12.6" into * a single number in the same format as the pg_control_version, such as 1206. */ bool parse_pg_version_string(const char *pg_version_string, int *pg_version) { return parse_dotted_version_string(pg_version_string, pg_version); } /* * parse_pgaf_version_string parses a pg_auto_failover version string such as * "1.4" into a single number in the same format as the pg_control_version, * such as 104. 
*/ bool parse_pgaf_extension_version_string(const char *version_string, int *version) { return parse_dotted_version_string(version_string, version); } /* * Parse the first 3 lines of output from pg_controldata: * * pg_control version number: 1002 * Catalog version number: 201707211 * Database system identifier: 6534312872085436521 * */ bool parse_controldata(PostgresControlData *pgControlData, const char *control_data_string) { if (!parse_controldata_field_dbstate(control_data_string, &(pgControlData->state)) || !parse_controldata_field_uint32(control_data_string, "pg_control version number", &(pgControlData->pg_control_version)) || !parse_controldata_field_uint32(control_data_string, "Catalog version number", &(pgControlData->catalog_version_no)) || !parse_controldata_field_uint64(control_data_string, "Database system identifier", &(pgControlData->system_identifier)) || !parse_controldata_field_lsn(control_data_string, "Latest checkpoint location", pgControlData->latestCheckpointLSN) || !parse_controldata_field_uint32(control_data_string, "Latest checkpoint's TimeLineID", &(pgControlData->timeline_id))) { log_error("Failed to parse pg_controldata output"); return false; } return true; } #define streq(x, y) ((x != NULL) && (y != NULL) && (strcmp(x, y) == 0)) /* * parse_controldata_field_dbstate matches pg_controldata output for Database * cluster state and fills in the value string as an enum value. 
*/ static bool parse_controldata_field_dbstate(const char *controlDataString, DBState *state) { char regex[BUFSIZE] = { 0 }; sformat(regex, BUFSIZE, "Database cluster state: *(.*)$"); char *match = regexp_first_match(controlDataString, regex); if (match == NULL) { return false; } if (streq(match, "starting up")) { *state = DB_STARTUP; } else if (streq(match, "shut down")) { *state = DB_SHUTDOWNED; } else if (streq(match, "shut down in recovery")) { *state = DB_SHUTDOWNED_IN_RECOVERY; } else if (streq(match, "shutting down")) { *state = DB_SHUTDOWNING; } else if (streq(match, "in crash recovery")) { *state = DB_IN_CRASH_RECOVERY; } else if (streq(match, "in archive recovery")) { *state = DB_IN_ARCHIVE_RECOVERY; } else if (streq(match, "in production")) { *state = DB_IN_PRODUCTION; } else { log_error("Failed to parse database cluster state \"%s\"", match); free(match); return false; } free(match); return true; } /* * parse_controldata_field_uint32 matches pg_controldata output for a field * name and gets its value as an uint64_t. It returns false when something went * wrong, and true when the value can be used. */ static bool parse_controldata_field_uint32(const char *controlDataString, const char *fieldName, uint32_t *dest) { char regex[BUFSIZE]; sformat(regex, BUFSIZE, "^%s: *([0-9]+)$", fieldName); char *match = regexp_first_match(controlDataString, regex); if (match == NULL) { return false; } if (!stringToUInt32(match, dest)) { log_error("Failed to parse number \"%s\": %m", match); free(match); return false; } free(match); return true; } /* * parse_controldata_field_uint64 matches pg_controldata output for a field * name and gets its value as an uint64_t. It returns false when something went * wrong, and true when the value can be used. 
*/ static bool parse_controldata_field_uint64(const char *controlDataString, const char *fieldName, uint64_t *dest) { char regex[BUFSIZE]; sformat(regex, BUFSIZE, "^%s: *([0-9]+)$", fieldName); char *match = regexp_first_match(controlDataString, regex); if (match == NULL) { return false; } if (!stringToUInt64(match, dest)) { log_error("Failed to parse number \"%s\": %m", match); free(match); return false; } free(match); return true; } /* * parse_controldata_field_lsn matches pg_controldata output for a field name * and gets its value as a string, in an area that must be pre-allocated with * at least PG_LSN_MAXLENGTH bytes. */ static bool parse_controldata_field_lsn(const char *controlDataString, const char *fieldName, char lsn[]) { char regex[BUFSIZE]; sformat(regex, BUFSIZE, "^%s: *([0-9A-F]+/[0-9A-F]+)$", fieldName); char *match = regexp_first_match(controlDataString, regex); if (match == NULL) { return false; } strlcpy(lsn, match, PG_LSN_MAXLENGTH); free(match); return true; } /* * parse_notification_message parses pgautofailover state change notifications, * which are sent in the JSON format. 
*/
bool
parse_state_notification_message(CurrentNodeState *nodeState,
								 const char *message)
{
	JSON_Value *json = json_parse_string(message);
	JSON_Object *jsobj = json_value_get_object(json);

	log_trace("parse_state_notification_message: %s", message);

	if (json_type(json) != JSONObject)
	{
		log_error("Failed to parse JSON notification message: \"%s\"", message);
		json_value_free(json);
		return false;
	}

	char *str = (char *) json_object_get_string(jsobj, "type");

	/* guard against a missing "type" key before calling strcmp() */
	if (str == NULL || strcmp(str, "state") != 0)
	{
		log_error("Failed to parse JSON notification state message: "
				  "JSON object type is not \"state\" as expected");
		json_value_free(json);
		return false;
	}

	str = (char *) json_object_get_string(jsobj, "formation");

	if (str == NULL)
	{
		log_error("Failed to parse formation in JSON "
				  "notification message \"%s\"",
				  message);
		json_value_free(json);
		return false;
	}
	strlcpy(nodeState->formation, str, sizeof(nodeState->formation));

	double number = json_object_get_number(jsobj, "groupId");
	nodeState->groupId = (int) number;

	/* nodeId is an int64_t: don't truncate it through an (int) cast */
	number = json_object_get_number(jsobj, "nodeId");
	nodeState->node.nodeId = (int64_t) number;

	str = (char *) json_object_get_string(jsobj, "name");

	if (str == NULL)
	{
		log_error("Failed to parse node name in JSON "
				  "notification message \"%s\"",
				  message);
		json_value_free(json);
		return false;
	}
	strlcpy(nodeState->node.name, str, sizeof(nodeState->node.name));

	str = (char *) json_object_get_string(jsobj, "host");

	if (str == NULL)
	{
		log_error("Failed to parse node host in JSON "
				  "notification message \"%s\"",
				  message);
		json_value_free(json);
		return false;
	}
	strlcpy(nodeState->node.host, str, sizeof(nodeState->node.host));

	number = json_object_get_number(jsobj, "port");
	nodeState->node.port = (int) number;

	str = (char *) json_object_get_string(jsobj, "reportedState");

	if (str == NULL)
	{
		log_error("Failed to parse reportedState in JSON "
				  "notification message \"%s\"",
				  message);
		json_value_free(json);
		return false;
	}
	nodeState->reportedState = NodeStateFromString(str);

	str = (char *)
json_object_get_string(jsobj, "goalState"); if (str == NULL) { log_error("Failed to parse goalState in JSON " "notification message \"%s\"", message); json_value_free(json); return false; } nodeState->goalState = NodeStateFromString(str); str = (char *) json_object_get_string(jsobj, "health"); if (streq(str, "unknown")) { nodeState->health = -1; } else if (streq(str, "bad")) { nodeState->health = 0; } else if (streq(str, "good")) { nodeState->health = 1; } else { log_error("Failed to parse health in JSON " "notification message \"%s\"", message); json_value_free(json); return false; } json_value_free(json); return true; } /* * Try to interpret value as boolean value. Valid values are: true, * false, yes, no, on, off, 1, 0; as well as unique prefixes thereof. * If the string parses okay, return true, else false. * If okay and result is not NULL, return the value in *result. * * Copied from PostgreSQL sources * file : src/backend/utils/adt/bool.c */ static bool parse_bool_with_len(const char *value, size_t len, bool *result) { switch (*value) { case 't': case 'T': { if (pg_strncasecmp(value, "true", len) == 0) { if (result) { *result = true; } return true; } break; } case 'f': case 'F': { if (pg_strncasecmp(value, "false", len) == 0) { if (result) { *result = false; } return true; } break; } case 'y': case 'Y': { if (pg_strncasecmp(value, "yes", len) == 0) { if (result) { *result = true; } return true; } break; } case 'n': case 'N': { if (pg_strncasecmp(value, "no", len) == 0) { if (result) { *result = false; } return true; } break; } case 'o': case 'O': { /* 'o' is not unique enough */ if (pg_strncasecmp(value, "on", (len > 2 ? len : 2)) == 0) { if (result) { *result = true; } return true; } else if (pg_strncasecmp(value, "off", (len > 2 ? 
len : 2)) == 0) { if (result) { *result = false; } return true; } break; } case '1': { if (len == 1) { if (result) { *result = true; } return true; } break; } case '0': { if (len == 1) { if (result) { *result = false; } return true; } break; } default: { break; } } if (result) { *result = false; /* suppress compiler warning */ } return false; } /* * parse_bool parses boolean text value (true/false/on/off/yes/no/1/0) and * puts the boolean value back in the result field if it is not NULL. * The function returns true on successful parse, returns false if any parse * error is encountered. */ bool parse_bool(const char *value, bool *result) { return parse_bool_with_len(value, strlen(value), result); } /* * parse_pguri_info_key_vals decomposes elements of a Postgres connection * string (URI) into separate arrays of keywords and values as expected by * PQconnectdbParams. */ bool parse_pguri_info_key_vals(const char *pguri, KeyVal *overrides, URIParams *uriParameters, bool checkForCompleteURI) { char *errmsg; PQconninfoOption *conninfo, *option; bool foundHost = false; bool foundUser = false; bool foundPort = false; bool foundDBName = false; int paramIndex = 0; conninfo = PQconninfoParse(pguri, &errmsg); if (conninfo == NULL) { log_error("Failed to parse pguri \"%s\": %s", pguri, errmsg); PQfreemem(errmsg); return false; } for (option = conninfo; option->keyword != NULL; option++) { char *value = NULL; int ovIndex = 0; /* * If the keyword is in our overrides array, use the value from the * override values. Yeah that's O(n*m) but here m is expected to be * something very small, like 3 (typically: sslmode, sslrootcert, * sslcrl). 
*/ for (ovIndex = 0; ovIndex < overrides->count; ovIndex++) { if (strcmp(overrides->keywords[ovIndex], option->keyword) == 0) { value = overrides->values[ovIndex]; } } /* not found in the override, keep the original, or skip */ if (value == NULL) { if (option->val == NULL || strcmp(option->val, "") == 0) { continue; } else { value = option->val; } } if (strcmp(option->keyword, "host") == 0 || strcmp(option->keyword, "hostaddr") == 0) { foundHost = true; strlcpy(uriParameters->hostname, option->val, MAXCONNINFO); } else if (strcmp(option->keyword, "port") == 0) { foundPort = true; strlcpy(uriParameters->port, option->val, MAXCONNINFO); } else if (strcmp(option->keyword, "user") == 0) { foundUser = true; strlcpy(uriParameters->username, option->val, MAXCONNINFO); } else if (strcmp(option->keyword, "dbname") == 0) { foundDBName = true; strlcpy(uriParameters->dbname, option->val, MAXCONNINFO); } else if (!IS_EMPTY_STRING_BUFFER(value)) { /* make a copy in our key/val arrays */ strlcpy(uriParameters->parameters.keywords[paramIndex], option->keyword, MAXCONNINFO); strlcpy(uriParameters->parameters.values[paramIndex], value, MAXCONNINFO); ++uriParameters->parameters.count; ++paramIndex; } } PQconninfoFree(conninfo); /* * Display an error message per missing field, and only then return false * if we're missing any one of those. */ if (checkForCompleteURI) { if (!foundHost) { log_error("Failed to find hostname in the pguri \"%s\"", pguri); } if (!foundPort) { log_error("Failed to find port in the pguri \"%s\"", pguri); } if (!foundUser) { log_error("Failed to find username in the pguri \"%s\"", pguri); } if (!foundDBName) { log_error("Failed to find dbname in the pguri \"%s\"", pguri); } return foundHost && foundPort && foundUser && foundDBName; } else { return true; } } /* * buildPostgresURIfromPieces builds a Postgres connection string from keywords * and values, in a user friendly way. 
The pguri parameter should point to a * memory area that has been allocated by the caller and has at least * MAXCONNINFO bytes. */ bool buildPostgresURIfromPieces(URIParams *uriParams, char *pguri) { int index = 0; sformat(pguri, MAXCONNINFO, "postgres://%s@%s:%s/%s?", uriParams->username, uriParams->hostname, uriParams->port, uriParams->dbname); for (index = 0; index < uriParams->parameters.count; index++) { if (index == 0) { sformat(pguri, MAXCONNINFO, "%s%s=%s", pguri, uriParams->parameters.keywords[index], uriParams->parameters.values[index]); } else { sformat(pguri, MAXCONNINFO, "%s&%s=%s", pguri, uriParams->parameters.keywords[index], uriParams->parameters.values[index]); } } return true; } /* * parse_pguri_ssl_settings parses SSL settings from a Postgres connection * string. Given the following connection string * * "postgres://autoctl_node@localhost:5500/pg_auto_failover?sslmode=prefer" * * we then have an ssl->active = 1, ssl->sslMode = SSL_MODE_PREFER, etc. */ bool parse_pguri_ssl_settings(const char *pguri, SSLOptions *ssl) { URIParams params = { 0 }; KeyVal overrides = { 0 }; bool checkForCompleteURI = true; /* initialize SSL Params values */ if (!parse_pguri_info_key_vals(pguri, &overrides, ¶ms, checkForCompleteURI)) { /* errors have already been logged */ return false; } for (int index = 0; index < params.parameters.count; index++) { char *key = params.parameters.keywords[index]; char *val = params.parameters.values[index]; if (streq(key, "sslmode")) { ssl->sslMode = pgsetup_parse_sslmode(val); strlcpy(ssl->sslModeStr, val, sizeof(ssl->sslModeStr)); if (ssl->sslMode > SSL_MODE_DISABLE) { ssl->active = true; } } else if (streq(key, "sslrootcert")) { strlcpy(ssl->caFile, val, sizeof(ssl->caFile)); } else if (streq(key, "sslcrl")) { strlcpy(ssl->crlFile, val, sizeof(ssl->crlFile)); } else if (streq(key, "sslcert")) { strlcpy(ssl->serverCert, val, sizeof(ssl->serverCert)); } else if (streq(key, "sslkey")) { strlcpy(ssl->serverKey, val, 
sizeof(ssl->serverKey)); } } /* cook-in defaults when the parsed URL contains no SSL settings */ if (ssl->sslMode == SSL_MODE_UNKNOWN) { ssl->active = true; ssl->sslMode = SSL_MODE_PREFER; strlcpy(ssl->sslModeStr, pgsetup_sslmode_to_string(ssl->sslMode), sizeof(ssl->sslModeStr)); } return true; } /* * nodeAddressCmpByNodeId sorts two given nodeAddress by comparing their * nodeId. We use this function to be able to pg_qsort() an array of nodes, * such as when parsing from a JSON file. */ static int nodeAddressCmpByNodeId(const void *a, const void *b) { NodeAddress *nodeA = (NodeAddress *) a; NodeAddress *nodeB = (NodeAddress *) b; return nodeA->nodeId - nodeB->nodeId; } /* * parseLSN is based on the Postgres code for pg_lsn_in_internal found at * src/backend/utils/adt/pg_lsn.c in the Postgres source repository. In the * pg_auto_failover context we don't need to typedef uint64 XLogRecPtr; so we * just use uint64_t internally. */ #define MAXPG_LSNCOMPONENT 8 bool parseLSN(const char *str, uint64_t *lsn) { int len1, len2; uint32 id, off; /* Sanity check input format. */ len1 = strspn(str, "0123456789abcdefABCDEF"); if (len1 < 1 || len1 > MAXPG_LSNCOMPONENT || str[len1] != '/') { return false; } len2 = strspn(str + len1 + 1, "0123456789abcdefABCDEF"); if (len2 < 1 || len2 > MAXPG_LSNCOMPONENT || str[len1 + 1 + len2] != '\0') { return false; } /* Decode result. */ id = (uint32) strtoul(str, NULL, 16); off = (uint32) strtoul(str + len1 + 1, NULL, 16); *lsn = ((uint64) id << 32) | off; return true; } /* * parseNodesArrayFromFile parses a Nodes Array from a JSON file, that contains * an array of JSON object with the following properties: node_id, node_lsn, * node_host, node_name, node_port, and potentially node_is_primary. 
 */
bool
parseNodesArray(const char *nodesJSON, NodeAddressArray *nodesArray, int64_t nodeId)
{
	/*
	 * Template used by json_validate to make sure every array element is an
	 * object with all the required properties and types; that in turn lets
	 * the parsing loop below skip per-property NULL checks.
	 */
	JSON_Value *template = json_parse_string(
		"[{"
		"\"node_id\": 0,"
		"\"node_lsn\": \"\","
		"\"node_name\": \"\","
		"\"node_host\": \"\","
		"\"node_port\": 0,"
		"\"node_is_primary\": false"
		"}]");

	/* write index into nodesArray->nodes: lags i when we skip ourselves */
	int nodesArrayIndex = 0;

	/* we accept at most one node flagged node_is_primary */
	int primaryCount = 0;

	JSON_Value *json = json_parse_string(nodesJSON);

	/* validate the JSON input as an array of object with required fields */
	if (json_validate(template, json) == JSONFailure)
	{
		log_error("Failed to parse nodes array which is expected "
				  "to contain a JSON Array of Objects with properties "
				  "[{node_id:number, node_name:string, "
				  "node_host:string, node_port:number, node_lsn:string, "
				  "node_is_primary:boolean}, ...]");

		json_value_free(template);
		json_value_free(json);
		return false;
	}

	JSON_Array *jsArray = json_value_get_array(json);
	int len = json_array_get_count(jsArray);

	/* refuse inputs larger than our fixed-size NodeAddressArray */
	if (NODE_ARRAY_MAX_COUNT < len)
	{
		log_error("Failed to parse nodes array which contains "
				  "%d nodes: pg_autoctl supports up to %d nodes",
				  len, NODE_ARRAY_MAX_COUNT);

		json_value_free(template);
		json_value_free(json);
		return false;
	}

	/* start from the full count; decremented below when we skip ourselves */
	nodesArray->count = len;

	for (int i = 0; i < len; i++)
	{
		NodeAddress *node = &(nodesArray->nodes[nodesArrayIndex]);
		JSON_Object *jsObj = json_array_get_object(jsArray, i);

		int jsNodeId = (int) json_object_get_number(jsObj, "node_id");

		uint64_t lsn = 0;

		/* we install the keeper.otherNodes array, so skip ourselves */
		if (jsNodeId == nodeId)
		{
			--(nodesArray->count);
			continue;
		}

		node->nodeId = jsNodeId;

		/* the template validation above guarantees these strings exist */
		strlcpy(node->name,
				json_object_get_string(jsObj, "node_name"),
				sizeof(node->name));

		strlcpy(node->host,
				json_object_get_string(jsObj, "node_host"),
				sizeof(node->host));

		node->port = (int) json_object_get_number(jsObj, "node_port");

		strlcpy(node->lsn,
				json_object_get_string(jsObj, "node_lsn"),
				sizeof(node->lsn));

		/* parse the LSN only to reject malformed values early */
		if (!parseLSN(node->lsn, &lsn))
		{
			log_error("Failed to parse nodes array LSN value \"%s\"",
					  node->lsn);

			json_value_free(template);
			json_value_free(json);
			return false;
		}

		node->isPrimary = json_object_get_boolean(jsObj, "node_is_primary");

		if (node->isPrimary)
		{
			++primaryCount;

			if (primaryCount > 1)
			{
				log_error("Failed to parse nodes array: more than one node "
						  "is listed with \"node_is_primary\" true.");

				json_value_free(template);
				json_value_free(json);
				return false;
			}
		}

		++nodesArrayIndex;
	}

	json_value_free(template);
	json_value_free(json);

	/* now ensure the array is sorted by nodeId */
	(void) pg_qsort(nodesArray->nodes,
					nodesArray->count,
					sizeof(NodeAddress),
					nodeAddressCmpByNodeId);

	/* check that every node id is unique in our array (array is sorted) */
	for (int i = 0; i < (nodesArray->count - 1); i++)
	{
		int currentNodeId = nodesArray->nodes[i].nodeId;
		int nextNodeId = nodesArray->nodes[i + 1].nodeId;

		if (currentNodeId == nextNodeId)
		{
			log_error("Failed to parse nodes array: more than one node "
					  "is listed with the same nodeId %d",
					  currentNodeId);
			return false;
		}
	}

	return true;
}


/*
 * uri_contains_password takes a Postgres connection string and checks to see
 * if it contains a parameter called password. Returns true if a password
 * keyword is present in the connection string.
 */
static bool
uri_contains_password(const char *pguri)
{
	char *errmsg;
	PQconninfoOption *conninfo, *option;

	conninfo = PQconninfoParse(pguri, &errmsg);

	if (conninfo == NULL)
	{
		log_error("Failed to parse pguri: %s", errmsg);

		PQfreemem(errmsg);
		return false;
	}

	/*
	 * Look for a populated password connection parameter
	 */
	for (option = conninfo; option->keyword != NULL; option++)
	{
		if (strcmp(option->keyword, "password") == 0 &&
			option->val != NULL &&
			!IS_EMPTY_STRING_BUFFER(option->val))
		{
			PQconninfoFree(conninfo);
			return true;
		}
	}

	PQconninfoFree(conninfo);
	return false;
}


/*
 * parse_and_scrub_connection_string takes a Postgres connection string and
 * populates scrubbedPguri with the password replaced with **** for logging.
* The scrubbedPguri parameter should point to a memory area that has been * allocated by the caller and has at least MAXCONNINFO bytes. */ bool parse_and_scrub_connection_string(const char *pguri, char *scrubbedPguri) { URIParams uriParams = { 0 }; KeyVal overrides = { 0 }; if (uri_contains_password(pguri)) { overrides = (KeyVal) { .count = 1, .keywords = { "password" }, .values = { "****" } }; } bool checkForCompleteURI = false; if (!parse_pguri_info_key_vals(pguri, &overrides, &uriParams, checkForCompleteURI)) { return false; } buildPostgresURIfromPieces(&uriParams, scrubbedPguri); return true; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/parsing.h000066400000000000000000000055071414244367200224000ustar00rootroot00000000000000/* * src/bin/pg_autoctl/parsing.c * API for parsing the output of some PostgreSQL server commands. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef PARSING_H #define PARSING_H #include #include "monitor.h" #include "nodestate_utils.h" #include "pgctl.h" char * regexp_first_match(const char *string, const char *re); bool parse_version_number(const char *version_string, char *pg_version_string, size_t size, int *pg_version); bool parse_dotted_version_string(const char *pg_version_string, int *pg_version); bool parse_pg_version_string(const char *pg_version_string, int *pg_version); bool parse_pgaf_extension_version_string(const char *version_string, int *version); bool parse_controldata(PostgresControlData *pgControlData, const char *control_data_string); bool parse_state_notification_message(CurrentNodeState *nodeState, const char *message); bool parse_bool(const char *value, bool *result); #define boolToString(value) (value) ? "true" : "false" /* * To parse Postgres URI we need to store keywords and values in separate * arrays of strings, because that's the libpq way of doing things. 
* * keywords and values are arrays of string and the arrays must be large enough * to fit all the connection parameters (of which we count 36 at the moment on * the Postgres documentation). * * See https://www.postgresql.org/docs/current/libpq-connect.html * * So here we use 64 entries each of MAXCONNINFO, to ensure we have enough room * to store all the parts of a typicallay MAXCONNINFO bounded full URI. That * amounts to 64kB of memory, so that's not even a luxury. */ typedef struct KeyVal { int count; char keywords[64][MAXCONNINFO]; char values[64][MAXCONNINFO]; } KeyVal; /* * In our own internal processing of Postgres URIs, we want to have some of the * URL parts readily accessible by name rather than mixed in the KeyVal * structure. * * That's mostly becase we want to produce an URI with the following form: * * postgres://user@host:port/dbname?opt=val */ typedef struct URIParams { char username[MAXCONNINFO]; char hostname[MAXCONNINFO]; char port[MAXCONNINFO]; char dbname[MAXCONNINFO]; KeyVal parameters; } URIParams; bool parse_pguri_info_key_vals(const char *pguri, KeyVal *overrides, URIParams *uriParameters, bool checkForCompleteURI); bool buildPostgresURIfromPieces(URIParams *uriParams, char *pguri); bool parse_pguri_ssl_settings(const char *pguri, SSLOptions *ssl); bool parse_and_scrub_connection_string(const char *pguri, char *scrubbedPguri); bool parseLSN(const char *str, uint64_t *lsn); bool parseNodesArray(const char *nodesJSON, NodeAddressArray *nodesArray, int64_t nodeId); #endif /* PARSING_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/pgctl.c000066400000000000000000002137701414244367200220440ustar00rootroot00000000000000/* * src/bin/pg_autoctl/pgctl.c * API for controling PostgreSQL, using its binary tooling (pg_ctl, * pg_controldata, pg_basebackup and such). * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include #include #include "postgres_fe.h" #include "pqexpbuffer.h" #include "defaults.h" #include "env_utils.h" #include "file_utils.h" #include "log.h" #include "parsing.h" #include "pgctl.h" #include "pgsql.h" #include "pgsetup.h" #include "pgtuning.h" #include "signals.h" #include "string_utils.h" #define RUN_PROGRAM_IMPLEMENTATION #include "runprogram.h" #define AUTOCTL_CONF_INCLUDE_COMMENT \ " # Auto-generated by pg_auto_failover, do not remove\n" #define AUTOCTL_CONF_INCLUDE_LINE "include '" AUTOCTL_DEFAULTS_CONF_FILENAME "'" #define AUTOCTL_SB_CONF_INCLUDE_LINE "include '" AUTOCTL_STANDBY_CONF_FILENAME "'" static bool pg_include_config(const char *configFilePath, const char *configIncludeLine, const char *configIncludeComment); static bool ensure_default_settings_file_exists(const char *configFilePath, GUC *settings, PostgresSetup *pgSetup, const char *hostname, bool includeTuning); static bool prepare_guc_settings_from_pgsetup(const char *configFilePath, PQExpBuffer config, GUC *settings, PostgresSetup *pgSetup, const char *hostname, bool includeTuning); static void log_program_output(Program prog, int outLogLevel, int errorLogLevel); static bool prepare_recovery_settings(const char *pgdata, ReplicationSource *replicationSource, char *primaryConnInfo, char *primarySlotName, char *targetLSN, char *targetAction, char *targetTimeline); static bool escape_recovery_conf_string(char *destination, int destinationSize, const char *recoveryConfString); static bool prepare_primary_conninfo(char *primaryConnInfo, int primaryConnInfoSize, const char *primaryHost, int primaryPort, const char *replicationUsername, const char *dbname, const char *replicationPassword, const char *applicationName, SSLOptions sslOptions, bool escape); static bool prepare_conninfo_sslmode(PQExpBuffer buffer, SSLOptions sslOptions); static bool pg_write_recovery_conf(const char *pgdata, ReplicationSource *replicationSource); static bool 
pg_write_standby_signal(const char *pgdata, ReplicationSource *replicationSource); /* * Get pg_ctl --version output in pgSetup->pg_version. */ bool pg_ctl_version(PostgresSetup *pgSetup) { Program prog = run_program(pgSetup->pg_ctl, "--version", NULL); char pg_version_string[PG_VERSION_STRING_MAX] = { 0 }; int pg_version = 0; if (prog.returnCode != 0) { errno = prog.error; log_error("Failed to run \"pg_ctl --version\" using program \"%s\": %m", pgSetup->pg_ctl); free_program(&prog); return false; } if (!parse_version_number(prog.stdOut, pg_version_string, PG_VERSION_STRING_MAX, &pg_version)) { /* errors have already been logged */ free_program(&prog); return false; } free_program(&prog); strlcpy(pgSetup->pg_version, pg_version_string, PG_VERSION_STRING_MAX); return true; } /* * set_pg_ctl_from_PG_CONFIG sets given pgSetup->pg_ctl to the pg_ctl binary * installed in the bindir of the target Postgres installation: * * $(${PG_CONFIG} --bindir)/pg_ctl */ bool set_pg_ctl_from_config_bindir(PostgresSetup *pgSetup, const char *pg_config) { char pg_ctl[MAXPGPATH] = { 0 }; if (!file_exists(pg_config)) { log_debug("set_pg_ctl_from_config_bindir: file not found: \"%s\"", pg_config); return false; } Program prog = run_program(pg_config, "--bindir", NULL); char *lines[1]; if (prog.returnCode != 0) { errno = prog.error; log_error("Failed to run \"pg_config --bindir\" using program \"%s\": %m", pg_config); free_program(&prog); return false; } if (splitLines(prog.stdOut, lines, 1) != 1) { log_error("Unable to parse output from pg_config --bindir"); free_program(&prog); return false; } char *bindir = lines[0]; join_path_components(pg_ctl, bindir, "pg_ctl"); /* we're now done with the Program and its output */ free_program(&prog); if (!file_exists(pg_ctl)) { log_error("Failed to find pg_ctl at \"%s\" from PG_CONFIG at \"%s\"", pgSetup->pg_ctl, pg_config); return false; } strlcpy(pgSetup->pg_ctl, pg_ctl, sizeof(pgSetup->pg_ctl)); return true; } /* * Read some of the information from 
pg_controldata output. */ bool pg_controldata(PostgresSetup *pgSetup, bool missing_ok) { char globalControlPath[MAXPGPATH] = { 0 }; char pg_controldata_path[MAXPGPATH] = { 0 }; if (pgSetup->pgdata[0] == '\0' || pgSetup->pg_ctl[0] == '\0') { log_error("BUG: pg_controldata: missing pgSetup pgdata or pg_ctl"); return false; } /* globalControlFilePath = $PGDATA/global/pg_control */ join_path_components(globalControlPath, pgSetup->pgdata, "global/pg_control"); /* * Refrain from doing too many pg_controldata checks, only proceed when the * PGDATA/global/pg_control file exists on-disk: that's the first check * that pg_controldata does anyway. */ if (!file_exists(globalControlPath)) { return false; } /* now find the pg_controldata binary */ path_in_same_directory(pgSetup->pg_ctl, "pg_controldata", pg_controldata_path); log_debug("%s %s", pg_controldata_path, pgSetup->pgdata); /* We parse the output of pg_controldata, make sure it's as expected */ setenv("LANG", "C", 1); Program prog = run_program(pg_controldata_path, pgSetup->pgdata, NULL); if (prog.returnCode == 0) { if (prog.stdOut == NULL) { /* happens sometimes, and I don't know why */ log_warn("Got empty output from `%s %s`, trying again in 1s", pg_controldata_path, pgSetup->pgdata); sleep(1); return pg_controldata(pgSetup, missing_ok); } if (!parse_controldata(&pgSetup->control, prog.stdOut)) { log_error("%s %s", pg_controldata_path, pgSetup->pgdata); log_warn("Failed to parse pg_controldata output:\n%s", prog.stdOut); free_program(&prog); return false; } free_program(&prog); return true; } else { int errorLogLevel = missing_ok ? LOG_DEBUG : LOG_ERROR; (void) log_program_output(prog, LOG_INFO, errorLogLevel); log_level(errorLogLevel, "Failed to run \"%s\" on \"%s\", see above for details", pg_controldata_path, pgSetup->pgdata); free_program(&prog); return missing_ok; } } /* * set_pg_ctl_from_PG_CONFIG sets the path to pg_ctl following the exported * environment variable PG_CONFIG, when it is found in the environment. 
* * Postgres developer environments often define PG_CONFIG in the environment to * build extensions for a specific version of Postgres. Let's use the hint here * too. */ bool set_pg_ctl_from_PG_CONFIG(PostgresSetup *pgSetup) { char PG_CONFIG[MAXPGPATH] = { 0 }; if (!env_exists("PG_CONFIG")) { /* then we don't use PG_CONFIG to find pg_ctl */ return false; } if (!get_env_copy("PG_CONFIG", PG_CONFIG, sizeof(PG_CONFIG))) { /* errors have already been logged */ return false; } if (!file_exists(PG_CONFIG)) { log_error("Failed to find a file for PG_CONFIG environment value \"%s\"", PG_CONFIG); return false; } if (!set_pg_ctl_from_config_bindir(pgSetup, PG_CONFIG)) { /* errors have already been logged */ return false; } if (!pg_ctl_version(pgSetup)) { log_fatal("Failed to get version info from %s --version", pgSetup->pg_ctl); return false; } log_debug("Found pg_ctl for PostgreSQL %s at %s following PG_CONFIG", pgSetup->pg_version, pgSetup->pg_ctl); return true; } /* * set_pg_ctl_from_pg_config sets the path to pg_ctl by using pg_config * --bindir when there is a single pg_config found in the PATH. * * When using debian/ubuntu packaging then pg_config is installed as part as * the postgresql-common package in /usr/bin, whereas pg_ctl is installed in a * major version dependent location such as /usr/lib/postgresql/12/bin, and * those locations are not included in the PATH. * * So when we can't find pg_ctl anywhere in the PATH, we look for pg_config * instead, and then use pg_config --bindir to discover the pg_ctl we can use. 
 */
bool
set_pg_ctl_from_pg_config(PostgresSetup *pgSetup)
{
	SearchPath all_pg_configs = { 0 };
	SearchPath pg_configs = { 0 };

	/* collect every pg_config entry found in PATH */
	if (!search_path("pg_config", &all_pg_configs))
	{
		return false;
	}

	/* several PATH entries may be symlinks to the same binary: deduplicate */
	if (!search_path_deduplicate_symlinks(&all_pg_configs, &pg_configs))
	{
		log_error("Failed to resolve symlinks found in PATH entries, "
				  "see above for details");
		return false;
	}

	switch (pg_configs.found)
	{
		case 0:
		{
			log_warn("Failed to find either pg_ctl or pg_config in PATH");
			return false;
		}

		case 1:
		{
			/* a single pg_config: use its --bindir to locate pg_ctl */
			if (!set_pg_ctl_from_config_bindir(pgSetup, pg_configs.matches[0]))
			{
				/* errors have already been logged */
				return false;
			}

			if (!pg_ctl_version(pgSetup))
			{
				log_fatal("Failed to get version info from %s --version",
						  pgSetup->pg_ctl);
				return false;
			}

			log_debug("Found pg_ctl for PostgreSQL %s at %s from pg_config "
					  "found in PATH at \"%s\"",
					  pgSetup->pg_version,
					  pgSetup->pg_ctl,
					  pg_configs.matches[0]);

			return true;
		}

		default:
		{
			/*
			 * Multiple pg_config entries: we refuse to pick one on the
			 * user's behalf, but list them all (with versions) to help the
			 * user disambiguate.
			 */
			log_info("Found more than one pg_config entry in current PATH:");

			for (int i = 0; i < pg_configs.found; i++)
			{
				PostgresSetup currentPgSetup = { 0 };

				strlcpy(currentPgSetup.pg_ctl,
						pg_configs.matches[i],
						sizeof(currentPgSetup.pg_ctl));

				if (!pg_ctl_version(&currentPgSetup))
				{
					/*
					 * Because of this it's possible that there's now only a
					 * single working version of pg_ctl found in PATH. If
					 * that's the case we will still not use that by default,
					 * since the users intention is unclear. They might have
					 * wanted to use the version of pg_ctl that we could not
					 * parse the version string for. So we warn and continue,
					 * the user should make their intention clear by using the
					 * --pg_ctl option (or changing PATH).
					 */
					log_warn("Failed to get version info from %s --version",
							 currentPgSetup.pg_ctl);
					continue;
				}

				log_info("Found \"%s\" for pg version %s",
						 currentPgSetup.pg_ctl,
						 currentPgSetup.pg_version);
			}

			log_info("HINT: export PG_CONFIG to a specific pg_config entry");

			return false;
		}
	}

	/* not reached: every switch branch above returns */
	return false;
}


/*
 * Find "pg_ctl" programs in the PATH.
 * If a single one exists, set its absolute
 * location in pg_ctl, and the PostgreSQL version number in pg_version.
 *
 * Returns true when a single usable pg_ctl has been found and installed in
 * the given pgSetup, false otherwise.
 */
bool
config_find_pg_ctl(PostgresSetup *pgSetup)
{
	SearchPath all_pg_ctls = { 0 };
	SearchPath pg_ctls = { 0 };

	/* reset any previous discovery result */
	pgSetup->pg_ctl[0] = '\0';
	pgSetup->pg_version[0] = '\0';

	/*
	 * Postgres developer environments often define PG_CONFIG in the
	 * environment to build extensions for a specific version of Postgres.
	 * Let's use the hint here too.
	 */
	if (set_pg_ctl_from_PG_CONFIG(pgSetup))
	{
		return true;
	}

	/* no PG_CONFIG. let's use the more classic approach with PATH instead */
	if (!search_path("pg_ctl", &all_pg_ctls))
	{
		return false;
	}

	/* several PATH entries may be symlinks to the same binary: deduplicate */
	if (!search_path_deduplicate_symlinks(&all_pg_ctls, &pg_ctls))
	{
		log_error("Failed to resolve symlinks found in PATH entries, "
				  "see above for details");
		return false;
	}

	if (pg_ctls.found == 1)
	{
		/* exactly one pg_ctl in PATH: use it, and grab its version */
		char *program = pg_ctls.matches[0];

		strlcpy(pgSetup->pg_ctl, program, MAXPGPATH);

		if (!pg_ctl_version(pgSetup))
		{
			log_fatal("Failed to get version info from \"%s\" --version",
					  pgSetup->pg_ctl);
			return false;
		}

		log_debug("Found pg_ctl for PostgreSQL %s at \"%s\"",
				  pgSetup->pg_version, pgSetup->pg_ctl);

		return true;
	}
	else
	{
		/*
		 * Then, first look for pg_config --bindir with pg_config in PATH,
		 * we might have a single entry there, as is the case on a typical
		 * debian/ubuntu packaging, in /usr/bin/pg_config installed from
		 * the postgresql-common package.
		 */
		PostgresSetup pgSetupFromPgConfig = { 0 };

		if (pg_ctls.found == 0)
		{
			log_debug("Failed to find pg_ctl in PATH, looking for pg_config");
		}
		else
		{
			log_debug("Found %d entries for pg_ctl in PATH, "
					  "looking for pg_config", pg_ctls.found);
		}

		if (set_pg_ctl_from_pg_config(&pgSetupFromPgConfig))
		{
			/* the pg_config fallback found a single usable pg_ctl: copy it */
			strlcpy(pgSetup->pg_ctl,
					pgSetupFromPgConfig.pg_ctl,
					sizeof(pgSetup->pg_ctl));

			strlcpy(pgSetup->pg_version,
					pgSetupFromPgConfig.pg_version,
					sizeof(pgSetup->pg_version));

			return true;
		}

		/*
		 * We failed to find a single pg_config in $PATH, error out and
		 * complain about the situation with enough details that the user
		 * can understand our struggle in picking a Postgres major version
		 * for them.
		 */
		log_info("Found more than one pg_ctl entry in current PATH, "
				 "and failed to find a single pg_config entry in current PATH");

		for (int i = 0; i < pg_ctls.found; i++)
		{
			PostgresSetup currentPgSetup = { 0 };

			strlcpy(currentPgSetup.pg_ctl, pg_ctls.matches[i], MAXPGPATH);

			if (!pg_ctl_version(&currentPgSetup))
			{
				/*
				 * Because of this it's possible that there's now only a
				 * single working version of pg_ctl found in PATH. If
				 * that's the case we will still not use that by default,
				 * since the users intention is unclear. They might have
				 * wanted to use the version of pg_ctl that we could not
				 * parse the version string for. So we warn and continue,
				 * the user should make their intention clear by using the
				 * --pg_ctl option (or setting PG_CONFIG, or PATH).
				 */
				log_warn("Failed to get version info from \"%s\" --version",
						 currentPgSetup.pg_ctl);
				continue;
			}

			log_info("Found \"%s\" for pg version %s",
					 currentPgSetup.pg_ctl,
					 currentPgSetup.pg_version);
		}

		log_error("Found several pg_ctl in PATH, please provide --pgctl");
		return false;
	}
}


/*
 * find_pg_config_from_pg_ctl finds the path to pg_config from the known path
 * to pg_ctl. If that exists, we first use the pg_config binary found in the
 * same directory as the pg_ctl binary itself.
 *
 * Otherwise, we have a look at the PG_CONFIG environment variable.
* * Finally, we search in the PATH list for all the matches, and for each of * them we run pg_config --bindir, and if that's the directory where we have * our known pg_ctl, that's our pg_config. * * Rationale: when using debian, the postgresql-common package installs a * single entry for pg_config in /usr/bin/pg_config, and that's the system * default. * * A version specific file path found in /usr/lib/postgresql/11/bin/pg_config * when installing Postgres 11 is installed from the package * postgresql-server-dev-11. * * There is no single default entry for pg_ctl, that said, so we are using the * specific path /usr/lib/postgresql/11/bin/pg_config here. * * So depending on what packages have been deployed on this specific debian * instance, we might or might not find a pg_config binary in the same * directory as pg_ctl. * * Note that we could register the full path to whatever pg_config version we * use at pg_autoctl create time, but in most cases that is going to be * /usr/bin/pg_config, and it will point to a new pg_ctl (version 13 for * instance) when you apt-get upgrade your debian testing distribution and it * just migrated from Postgres 11 to Postgres 13 (bullseye cycle just did that * in december 2020). * * Either package libpq-dev or postgresql-server-dev-11 (or another version) * must be isntalled for this to work. */ bool find_pg_config_from_pg_ctl(const char *pg_ctl, char *pg_config, size_t size) { char pg_config_path[MAXPGPATH] = { 0 }; /* * 1. try pg_ctl directory */ path_in_same_directory(pg_ctl, "pg_config", pg_config_path); if (file_exists(pg_config_path)) { log_debug("find_pg_config_from_pg_ctl: \"%s\" " "in same directory as pg_ctl", pg_config_path); strlcpy(pg_config, pg_config_path, size); return true; } /* * 2. 
try PG_CONFIG from the environment, and check pg_config --bindir */ if (env_exists("PG_CONFIG")) { PostgresSetup pgSetup = { 0 }; char PG_CONFIG[MAXPGPATH] = { 0 }; /* check that the pg_config we found relates to the given pg_ctl */ if (get_env_copy("PG_CONFIG", PG_CONFIG, sizeof(PG_CONFIG)) && file_exists(PG_CONFIG) && set_pg_ctl_from_config_bindir(&pgSetup, PG_CONFIG) && strcmp(pgSetup.pg_ctl, pg_ctl) == 0) { log_debug("find_pg_config_from_pg_ctl: \"%s\" " "from PG_CONFIG environment variable", pg_config_path); strlcpy(pg_config, pg_config_path, size); return true; } } /* * 3. search our PATH for pg_config entries and keep the first one that * relates to our known pg_ctl. */ SearchPath all_pg_configs = { 0 }; SearchPath pg_configs = { 0 }; if (!search_path("pg_config", &all_pg_configs)) { return false; } if (!search_path_deduplicate_symlinks(&all_pg_configs, &pg_configs)) { log_error("Failed to resolve symlinks found in PATH entries, " "see above for details"); return false; } for (int i = 0; i < pg_configs.found; i++) { PostgresSetup pgSetup = { 0 }; if (set_pg_ctl_from_config_bindir(&pgSetup, pg_configs.matches[i]) && strcmp(pgSetup.pg_ctl, pg_ctl) == 0) { log_debug("find_pg_config_from_pg_ctl: \"%s\" " "from PATH search", pg_configs.matches[i]); strlcpy(pg_config, pg_configs.matches[i], size); return true; } } return false; } /* * find_extension_control_file ensures that the extension is present in the * given Postgres installation. 
This does the equivalent of: * ls -l $(pg_config --sharedir)/extension/pg_stat_statements.control */ bool find_extension_control_file(const char *pg_ctl, const char *extName) { char pg_config_path[MAXPGPATH] = { 0 }; char extension_path[MAXPGPATH] = { 0 }; char *share_dir; char extension_control_file_name[MAXPGPATH] = { 0 }; char *lines[1]; log_debug("Checking if the %s extension is installed", extName); if (!find_pg_config_from_pg_ctl(pg_ctl, pg_config_path, MAXPGPATH)) { log_warn("Failed to find pg_config from pg_ctl at \"%s\"", pg_ctl); return false; } Program prog = run_program(pg_config_path, "--sharedir", NULL); if (prog.returnCode == 0) { if (!prog.stdOut) { log_error("Got empty output from pg_config --sharedir"); free_program(&prog); return false; } if (splitLines(prog.stdOut, lines, 1) != 1) { log_error("Unable to parse output from pg_config --sharedir"); free_program(&prog); return false; } share_dir = lines[0]; join_path_components(extension_path, share_dir, "extension"); sformat(extension_control_file_name, MAXPGPATH, "%s.control", extName); join_path_components(extension_path, extension_path, extension_control_file_name); if (!file_exists(extension_path)) { log_error("Failed to find extension control file \"%s\"", extension_path); free_program(&prog); return false; } } else { (void) log_program_output(prog, LOG_INFO, LOG_ERROR); log_error("Failed to run \"%s\", see above for details", pg_config_path); free_program(&prog); return false; } free_program(&prog); return true; } /* * pg_add_auto_failover_default_settings ensures the pg_auto_failover default * settings are included in postgresql.conf. For simplicity, this function * reads the whole contents of postgresql.conf into memory. 
*/ bool pg_add_auto_failover_default_settings(PostgresSetup *pgSetup, const char *hostname, const char *configFilePath, GUC *settings) { bool includeTuning = true; char pgAutoFailoverDefaultsConfigPath[MAXPGPATH]; /* * Write the default settings to postgresql-auto-failover.conf. * * postgresql-auto-failover.conf needs to be placed alongside * postgresql.conf for the include to work. Determine the path by finding * the parent directory of postgresql.conf. */ path_in_same_directory(configFilePath, AUTOCTL_DEFAULTS_CONF_FILENAME, pgAutoFailoverDefaultsConfigPath); if (!ensure_default_settings_file_exists(pgAutoFailoverDefaultsConfigPath, settings, pgSetup, hostname, includeTuning)) { return false; } return pg_include_config(configFilePath, AUTOCTL_CONF_INCLUDE_LINE, AUTOCTL_CONF_INCLUDE_COMMENT); } /* * pg_auto_failover_default_settings_file_exists returns true when our expected * postgresql-auto-failover.conf file exists in PGDATA. */ bool pg_auto_failover_default_settings_file_exists(PostgresSetup *pgSetup) { char pgAutoFailoverDefaultsConfigPath[MAXPGPATH] = { 0 }; char *contents = NULL; long size = 0L; join_path_components(pgAutoFailoverDefaultsConfigPath, pgSetup->pgdata, AUTOCTL_DEFAULTS_CONF_FILENAME); /* make sure the file exists and is not empty (race conditions) */ if (!read_file_if_exists(pgAutoFailoverDefaultsConfigPath, &contents, &size)) { return false; } /* we don't actually need the contents here */ free(contents); bool fileExistsWithContent = size > 0; return fileExistsWithContent; } /* * pg_include_config adds an include line to postgresql.conf to include the * given configuration file, with a comment refering pg_auto_failover. 
*/ static bool pg_include_config(const char *configFilePath, const char *configIncludeLine, const char *configIncludeComment) { char *currentConfContents = NULL; long currentConfSize = 0L; /* read the current postgresql.conf contents */ if (!read_file(configFilePath, ¤tConfContents, ¤tConfSize)) { return false; } /* find the include 'postgresql-auto-failover.conf' line */ char *includeLine = strstr(currentConfContents, configIncludeLine); if (includeLine != NULL && (includeLine == currentConfContents || includeLine[-1] == '\n')) { log_debug("%s found in \"%s\"", configIncludeLine, configFilePath); /* defaults settings are already included */ free(currentConfContents); return true; } log_debug("Adding %s to \"%s\"", configIncludeLine, configFilePath); /* build the new postgresql.conf contents */ PQExpBuffer newConfContents = createPQExpBuffer(); if (newConfContents == NULL) { log_error("Failed to allocate memory"); free(currentConfContents); return false; } appendPQExpBufferStr(newConfContents, configIncludeLine); appendPQExpBufferStr(newConfContents, configIncludeComment); appendPQExpBufferStr(newConfContents, currentConfContents); /* done with the old postgresql.conf contents */ free(currentConfContents); /* memory allocation could have failed while building string */ if (PQExpBufferBroken(newConfContents)) { log_error("Failed to allocate memory"); destroyPQExpBuffer(newConfContents); return false; } /* write the new postgresql.conf */ if (!write_file(newConfContents->data, newConfContents->len, configFilePath)) { destroyPQExpBuffer(newConfContents); return false; } destroyPQExpBuffer(newConfContents); return true; } /* * ensure_default_settings_file_exists writes the postgresql-auto-failover.conf * file to the database directory. 
*/ static bool ensure_default_settings_file_exists(const char *configFilePath, GUC *settings, PostgresSetup *pgSetup, const char *hostname, bool includeTuning) { PQExpBuffer defaultConfContents = createPQExpBuffer(); if (defaultConfContents == NULL) { log_error("Failed to allocate memory"); return false; } if (!prepare_guc_settings_from_pgsetup(configFilePath, defaultConfContents, settings, pgSetup, hostname, includeTuning)) { /* errors have already been logged */ destroyPQExpBuffer(defaultConfContents); return false; } if (file_exists(configFilePath)) { char *currentDefaultConfContents = NULL; long currentDefaultConfSize = 0L; if (!read_file(configFilePath, ¤tDefaultConfContents, ¤tDefaultConfSize)) { /* technically, we could still try writing, but this is pretty * suspicious */ destroyPQExpBuffer(defaultConfContents); return false; } if (strcmp(currentDefaultConfContents, defaultConfContents->data) == 0) { /* file is there and has the same contents, nothing to do */ log_debug("Default settings file \"%s\" exists", configFilePath); free(currentDefaultConfContents); destroyPQExpBuffer(defaultConfContents); return true; } log_info("Contents of \"%s\" have changed, overwriting", configFilePath); free(currentDefaultConfContents); } else { log_debug("Configuration file \"%s\" doesn't exists yet, creating", configFilePath); } if (!write_file(defaultConfContents->data, defaultConfContents->len, configFilePath)) { destroyPQExpBuffer(defaultConfContents); return false; } log_debug("Wrote file \"%s\" with content:\n%s", configFilePath, defaultConfContents->data); destroyPQExpBuffer(defaultConfContents); return true; } /* * prepare_guc_settings_from_pgsetup replaces some of the given GUC settings * with dynamic values found in the pgSetup argument, and prepare them in the * expected format for a postgresql.conf file in the given PQExpBuffer. 
 *
 * While most of our settings are handled in a static way and thus known at
 * compile time, some of them can be provided by our users, such as
 * listen_addresses, port, and SSL related configuration parameters.
 */

/* NULL-safe string equality, used throughout the GUC dispatch below */
#define streq(x, y) ((x != NULL) && (y != NULL) && (strcmp(x, y) == 0))

static bool
prepare_guc_settings_from_pgsetup(const char *configFilePath,
								  PQExpBuffer config,
								  GUC *settings,
								  PostgresSetup *pgSetup,
								  const char *hostname,
								  bool includeTuning)
{
	char tuning[BUFSIZE] = { 0 };
	int settingIndex = 0;

	appendPQExpBufferStr(config, "# Settings by pg_auto_failover\n");

	/* replace placeholder values with actual pgSetup values */
	for (settingIndex = 0;
		 settings[settingIndex].name != NULL;
		 settingIndex++)
	{
		GUC *setting = &settings[settingIndex];

		/*
		 * Settings for "listen_addresses" and "port" are replaced with the
		 * respective values present in pgSetup allowing those to be dynamic.
		 *
		 * At the moment our "needs quote" heuristic is pretty simple.
		 * There's the one parameter within those that we hardcode from
		 * pg_auto_failover that needs quoting, and that's
		 * listen_addresses.
		 *
		 * The reason why POSTGRES_DEFAULT_LISTEN_ADDRESSES is not quoting
		 * the value directly in the constant is that we are using that
		 * value both in the configuration file and at the pg_ctl start
		 * --options "-h *" command line.
		 *
		 * At the command line, using --options "-h '*'" would give:
		 * could not create listen socket for "'*'"
		 */
		if (streq(setting->name, "listen_addresses"))
		{
			appendPQExpBuffer(config, "%s = '%s'\n",
							  setting->name,
							  pgSetup->listen_addresses);
		}
		else if (streq(setting->name, "password_encryption"))
		{
			/*
			 * Set password_encryption if the authMethod is password based.
			 */
			if (streq(pgSetup->authMethod, "md5") ||
				streq(pgSetup->authMethod, "scram-sha-256"))
			{
				appendPQExpBuffer(config, "%s = '%s'\n",
								  setting->name,
								  pgSetup->authMethod);
			}
			else if (streq(pgSetup->authMethod, "password"))
			{
				/*
				 * The "password" auth method supports only the "md5" and
				 * "scram-sha-256" password encryption settings.
				 * Default the encryption setting to "scram-sha-256" in this
				 * case, as it is the more secure alternative.
				 */
				appendPQExpBuffer(config, "%s = '%s'\n",
								  setting->name,
								  "scram-sha-256");
			}
		}
		else if (streq(setting->name, "port"))
		{
			appendPQExpBuffer(config, "%s = %d\n",
							  setting->name,
							  pgSetup->pgport);
		}
		else if (streq(setting->name, "ssl"))
		{
			appendPQExpBuffer(config, "%s = %s\n",
							  setting->name,
							  pgSetup->ssl.active == 0 ? "off" : "on");
		}
		else if (streq(setting->name, "ssl_ca_file"))
		{
			/* SSL file settings are only emitted when a path is given */
			if (!IS_EMPTY_STRING_BUFFER(pgSetup->ssl.caFile))
			{
				appendPQExpBuffer(config, "%s = '%s'\n",
								  setting->name,
								  pgSetup->ssl.caFile);
			}
		}
		else if (streq(setting->name, "ssl_crl_file"))
		{
			if (!IS_EMPTY_STRING_BUFFER(pgSetup->ssl.crlFile))
			{
				appendPQExpBuffer(config, "%s = '%s'\n",
								  setting->name,
								  pgSetup->ssl.crlFile);
			}
		}
		else if (streq(setting->name, "ssl_cert_file"))
		{
			if (!IS_EMPTY_STRING_BUFFER(pgSetup->ssl.serverCert))
			{
				appendPQExpBuffer(config, "%s = '%s'\n",
								  setting->name,
								  pgSetup->ssl.serverCert);
			}
		}
		else if (streq(setting->name, "ssl_key_file"))
		{
			if (!IS_EMPTY_STRING_BUFFER(pgSetup->ssl.serverKey))
			{
				appendPQExpBuffer(config, "%s = '%s'\n",
								  setting->name,
								  pgSetup->ssl.serverKey);
			}
		}
		else if (streq(setting->name, "recovery_target_lsn"))
		{
			/* 'immediate' is spelled recovery_target = 'immediate' */
			if (streq(setting->value, "'immediate'"))
			{
				appendPQExpBuffer(config, "recovery_target = 'immediate'\n");
			}
			else
			{
				appendPQExpBuffer(config, "%s = %s\n",
								  setting->name,
								  setting->value);
			}
		}
		else if (streq(setting->name, "citus.node_conninfo"))
		{
			appendPQExpBuffer(config, "%s = '", setting->name);

			/* add sslmode, sslrootcert, and sslcrl if needed */
			if (!prepare_conninfo_sslmode(config, pgSetup->ssl))
			{
				/* errors have already been logged */
				return false;
			}

			appendPQExpBufferStr(config, "'\n");
		}
		else if (streq(setting->name, "citus.use_secondary_nodes"))
		{
			if (!IS_EMPTY_STRING_BUFFER(pgSetup->citusClusterName))
			{
				appendPQExpBuffer(config, "%s = 'always'\n", setting->name);
			}
		}
		else if (streq(setting->name, "citus.cluster_name"))
		{
			if (!IS_EMPTY_STRING_BUFFER(pgSetup->citusClusterName))
			{
				appendPQExpBuffer(config, "%s = '%s'\n",
								  setting->name,
								  pgSetup->citusClusterName);
			}
		}
		else if (streq(setting->name, "citus.local_hostname"))
		{
			if (hostname != NULL && !IS_EMPTY_STRING_BUFFER(hostname))
			{
				appendPQExpBuffer(config, "%s = '%s'\n",
								  setting->name,
								  hostname);
			}
		}
		else if (setting->value != NULL &&
				 !IS_EMPTY_STRING_BUFFER(setting->value))
		{
			/* generic case: emit the setting verbatim */
			appendPQExpBuffer(config, "%s = %s\n",
							  setting->name,
							  setting->value);
		}
		else if (setting->value == NULL ||
				 IS_EMPTY_STRING_BUFFER(setting->value))
		{
			/*
			 * Our GUC entry has a NULL (or empty) value. Skip the setting.
			 *
			 * In cases that's expected, such as when removing primary_conninfo
			 * from the recovery.conf settings so that we disconnect from the
			 * primary node being demoted.
			 *
			 * Still log about it, in case it might happen when it's not
			 * expected.
			 */
			log_debug("GUC setting \"%s\" has a NULL value", setting->name);
		}
		else
		{
			/* the GUC setting in the array has not been processed */
			log_error("BUG: GUC settings \"%s\" has not been processed",
					  setting->name);
			return false;
		}
	}

	if (includeTuning)
	{
		/* append the basic tuning computed for this system, when possible */
		if (!pgtuning_prepare_guc_settings(postgres_tuning,
										   tuning,
										   sizeof(tuning)))
		{
			log_warn("Failed to compute Postgres basic tuning for this system");
		}

		appendPQExpBuffer(config, "\n%s\n", tuning);
	}

	/* memory allocation could have failed while building string */
	if (PQExpBufferBroken(config))
	{
		log_error("Failed to allocate memory while preparing config file \"%s\"",
				  configFilePath);
		destroyPQExpBuffer(config);
		return false;
	}

	return true;
}


/*
 * Call pg_basebackup, using a temporary directory for the duration of the data
 * transfer.
 */
bool
pg_basebackup(const char *pgdata,
			  const char *pg_ctl,
			  ReplicationSource *replicationSource)
{
	int returnCode;
	char pg_basebackup[MAXPGPATH];
	NodeAddress *primaryNode = &(replicationSource->primaryNode);
	char primaryConnInfo[MAXCONNINFO] = { 0 };
	char *args[16];
	int argsIndex = 0;

	char command[BUFSIZE];

	log_debug("mkdir -p \"%s\"", replicationSource->backupDir);

	/* start from an empty backup directory */
	if (!ensure_empty_dir(replicationSource->backupDir, 0700))
	{
		/* errors have already been logged. */
		return false;
	}

	/* call pg_basebackup, found next to the given pg_ctl */
	path_in_same_directory(pg_ctl, "pg_basebackup", pg_basebackup);

	setenv("PGCONNECT_TIMEOUT", POSTGRES_CONNECT_TIMEOUT, 1);

	/* the password is passed via the environment, not the conninfo string */
	if (!IS_EMPTY_STRING_BUFFER(replicationSource->password))
	{
		setenv("PGPASSWORD", replicationSource->password, 1);
	}

	setenv("PGAPPNAME", replicationSource->applicationName, 1);

	if (!prepare_primary_conninfo(primaryConnInfo,
								  MAXCONNINFO,
								  primaryNode->host,
								  primaryNode->port,
								  replicationSource->userName,
								  NULL, /* no database */
								  NULL, /* no password here */
								  replicationSource->applicationName,
								  replicationSource->sslOptions,
								  false)) /* do not escape this one */
	{
		/* errors have already been logged. */
		return false;
	}

	args[argsIndex++] = (char *) pg_basebackup;
	args[argsIndex++] = "-w";
	args[argsIndex++] = "-d";
	args[argsIndex++] = primaryConnInfo;
	args[argsIndex++] = "--pgdata";
	args[argsIndex++] = replicationSource->backupDir;
	args[argsIndex++] = "-U";
	args[argsIndex++] = replicationSource->userName;
	args[argsIndex++] = "--verbose";
	args[argsIndex++] = "--progress";
	args[argsIndex++] = "--max-rate";
	args[argsIndex++] = replicationSource->maximumBackupRate;
	args[argsIndex++] = "--wal-method=stream";

	/* we don't use a replication slot e.g. when upstream is a standby */
	if (!IS_EMPTY_STRING_BUFFER(replicationSource->slotName))
	{
		args[argsIndex++] = "--slot";
		args[argsIndex++] = replicationSource->slotName;
	}

	args[argsIndex] = NULL;

	/*
	 * We do not want to call setsid() when running this program, as the
	 * pg_basebackup subprogram is not intended to be its own session leader,
	 * but remain a sub-process in the same group as pg_autoctl.
	 */
	Program program = { 0 };

	(void) initialize_program(&program, args, false);

	program.processBuffer = &processBufferCallback;

	/* log the exact command line we're using */
	int commandSize = snprintf_program_command_line(&program, command, BUFSIZE);

	if (commandSize >= BUFSIZE)
	{
		/* we only display the first BUFSIZE bytes of the real command */
		log_info("%s...", command);
	}
	else
	{
		log_info("%s", command);
	}

	(void) execute_subprogram(&program);

	returnCode = program.returnCode;
	free_program(&program);

	if (returnCode != 0)
	{
		log_error("Failed to run pg_basebackup: exit code %d", returnCode);
		return false;
	}

	/* replace $pgdata with the backup directory */
	if (directory_exists(pgdata))
	{
		if (!rmtree(pgdata, true))
		{
			log_error("Failed to remove directory \"%s\": %m", pgdata);
			return false;
		}
	}

	log_debug("mv \"%s\" \"%s\"", replicationSource->backupDir, pgdata);

	if (rename(replicationSource->backupDir, pgdata) != 0)
	{
		log_error(
			"Failed to install pg_basebackup dir "
			" \"%s\" in \"%s\": %m",
			replicationSource->backupDir, pgdata);
		return false;
	}

	return true;
}


/*
 * pg_rewind runs the pg_rewind program to rewind the given database directory
 * to a state where it can follow the given primary. We need the ability to
 * connect to the node.
 */
bool
pg_rewind(const char *pgdata,
		  const char *pg_ctl,
		  ReplicationSource *replicationSource)
{
	int returnCode;
	char pg_rewind[MAXPGPATH] = { 0 };
	NodeAddress *primaryNode = &(replicationSource->primaryNode);
	char primaryConnInfo[MAXCONNINFO] = { 0 };
	char *args[7];
	int argsIndex = 0;

	char command[BUFSIZE];

	/* call pg_rewind, found next to the given pg_ctl */
	path_in_same_directory(pg_ctl, "pg_rewind", pg_rewind);

	setenv("PGCONNECT_TIMEOUT", POSTGRES_CONNECT_TIMEOUT, 1);

	/* the password is passed via the environment, not the conninfo string */
	if (!IS_EMPTY_STRING_BUFFER(replicationSource->password))
	{
		setenv("PGPASSWORD", replicationSource->password, 1);
	}

	if (!prepare_primary_conninfo(primaryConnInfo,
								  MAXCONNINFO,
								  primaryNode->host,
								  primaryNode->port,
								  replicationSource->userName,
								  "postgres", /* pg_rewind needs a database */
								  NULL, /* no password here */
								  replicationSource->applicationName,
								  replicationSource->sslOptions,
								  false)) /* do not escape this one */
	{
		/* errors have already been logged. */
		return false;
	}

	args[argsIndex++] = (char *) pg_rewind;
	args[argsIndex++] = "--target-pgdata";
	args[argsIndex++] = (char *) pgdata;
	args[argsIndex++] = "--source-server";
	args[argsIndex++] = primaryConnInfo;
	args[argsIndex++] = "--progress";
	args[argsIndex] = NULL;

	/*
	 * We do not want to call setsid() when running this program, as the
	 * pg_rewind subprogram is not intended to be its own session leader, but
	 * remain a sub-process in the same group as pg_autoctl.
	 */
	Program program = { 0 };

	(void) initialize_program(&program, args, false);

	program.processBuffer = &processBufferCallback;

	/* log the exact command line we're using */
	int commandSize = snprintf_program_command_line(&program, command, BUFSIZE);

	if (commandSize >= BUFSIZE)
	{
		/* we only display the first BUFSIZE bytes of the real command */
		log_info("%s...", command);
	}
	else
	{
		log_info("%s", command);
	}

	(void) execute_subprogram(&program);

	returnCode = program.returnCode;
	free_program(&program);

	if (returnCode != 0)
	{
		log_error("Failed to run pg_rewind: exit code %d", returnCode);
		return false;
	}

	return true;
}


/* log_program_output logs the output of the given program. */
static void
log_program_output(Program prog, int outLogLevel, int errorLogLevel)
{
	/* stdout lines are logged at outLogLevel, line by line */
	if (prog.stdOut != NULL)
	{
		char *outLines[BUFSIZE];
		int lineCount = splitLines(prog.stdOut, outLines, BUFSIZE);
		int lineNumber = 0;

		for (lineNumber = 0; lineNumber < lineCount; lineNumber++)
		{
			log_level(outLogLevel, "%s", outLines[lineNumber]);
		}
	}

	/* stderr lines are logged at errorLogLevel, line by line */
	if (prog.stdErr != NULL)
	{
		char *errorLines[BUFSIZE];
		int lineCount = splitLines(prog.stdErr, errorLines, BUFSIZE);
		int lineNumber = 0;

		for (lineNumber = 0; lineNumber < lineCount; lineNumber++)
		{
			log_level(errorLogLevel, "%s", errorLines[lineNumber]);
		}
	}
}


/*
 * pg_ctl_initdb initializes a PostgreSQL directory from scratch by calling
 * "pg_ctl initdb", and returns true when this was successful. Beware that it
 * will inherit from the environment, such as LC_COLLATE and LC_ALL etc.
 *
 * No provision is made to control (sanitize?) that environment.
*/ bool pg_ctl_initdb(const char *pg_ctl, const char *pgdata) { /* initdb takes time, so log about the operation BEFORE doing it */ log_info("Initialising a PostgreSQL cluster at \"%s\"", pgdata); log_info("%s initdb -s -D %s --option '--auth=trust'", pg_ctl, pgdata); Program program = run_program(pg_ctl, "initdb", "--silent", "--pgdata", pgdata, /* avoid warning message */ "--option", "'--auth=trust'", NULL); bool success = program.returnCode == 0; if (program.returnCode != 0) { (void) log_program_output(program, LOG_INFO, LOG_ERROR); log_fatal("Failed to initialize Postgres cluster at \"%s\", " "see above for details", pgdata); } else { /* we might still have important information to read there */ (void) log_program_output(program, LOG_INFO, LOG_WARN); } free_program(&program); return success; } /* * pg_ctl_postgres runs the "postgres" command-line in the current process, * with the same options as we would use in pg_ctl_start. pg_ctl_postgres does * not fork a Postgres process in the background, we keep the control over the * postmaster process. Think exec() rather then fork(). * * This function will take over the current standard output and standard error * file descriptor, closing them and then giving control to them to Postgres * itself. This function is meant to be called in the child process of a fork() * call done by the caller. 
*/ bool pg_ctl_postgres(const char *pg_ctl, const char *pgdata, int pgport, char *listen_addresses, bool listen) { char postgres[MAXPGPATH]; char logfile[MAXPGPATH]; char *args[12]; int argsIndex = 0; char env_pg_regress_sock_dir[MAXPGPATH]; char command[BUFSIZE]; /* call postgres directly */ path_in_same_directory(pg_ctl, "postgres", postgres); /* prepare startup.log file in PGDATA */ join_path_components(logfile, pgdata, "startup.log"); args[argsIndex++] = (char *) postgres; args[argsIndex++] = "-D"; args[argsIndex++] = (char *) pgdata; args[argsIndex++] = "-p"; args[argsIndex++] = (char *) intToString(pgport).strValue; if (listen) { if (IS_EMPTY_STRING_BUFFER(listen_addresses)) { log_error("BUG: pg_ctl_postgres is given an empty listen_addresses " "with argument listen set to true"); return false; } args[argsIndex++] = "-h"; args[argsIndex++] = (char *) listen_addresses; } else { args[argsIndex++] = "-h"; args[argsIndex++] = ""; } if (env_exists("PG_REGRESS_SOCK_DIR")) { if (!get_env_copy("PG_REGRESS_SOCK_DIR", env_pg_regress_sock_dir, MAXPGPATH)) { /* errors have already been logged */ return false; } args[argsIndex++] = "-k"; args[argsIndex++] = (char *) env_pg_regress_sock_dir; } args[argsIndex] = NULL; /* * We do not want to call setsid() when running this program, as the * postgres subprogram is not intended to be its own session leader, but * remain a sub-process in the same group as pg_autoctl. 
*/ Program program = { 0 }; (void) initialize_program(&program, args, false); /* we want to redirect the output to logfile */ int logFileDescriptor = open(logfile, FOPEN_FLAGS_W, 0644); if (logFileDescriptor == -1) { log_error("Failed to open file \"%s\": %m", logfile); } program.capture = false; /* redirect output, don't capture */ program.stdOutFd = logFileDescriptor; program.stdErrFd = logFileDescriptor; /* log the exact command line we're using */ int commandSize = snprintf_program_command_line(&program, command, BUFSIZE); if (commandSize >= BUFSIZE) { /* we only display the first BUFSIZE bytes of the real command */ log_info("%s...", command); } else { log_info("%s", command); } (void) execute_program(&program); return program.returnCode == 0; } /* * pg_log_startup logs the PGDATA/startup.log file contents so that our users * have enough information about why Postgres failed to start when that * happens. */ bool pg_log_startup(const char *pgdata, int logLevel) { char pgLogDirPath[MAXPGPATH] = { 0 }; char pgStartupPath[MAXPGPATH] = { 0 }; char *fileContents; long fileSize; /* logLevel to use when introducing which file path logs come from */ int pathLogLevel = logLevel <= LOG_DEBUG ? LOG_DEBUG : LOG_WARN; struct stat pgStartupStat; struct dirent *logFileDirEntry = NULL; /* prepare startup.log file in PGDATA */ join_path_components(pgStartupPath, pgdata, "startup.log"); if (read_file(pgStartupPath, &fileContents, &fileSize) && fileSize > 0) { char *lines[BUFSIZE]; int lineCount = splitLines(fileContents, lines, BUFSIZE); int lineNumber = 0; log_level(pathLogLevel, "Postgres logs from \"%s\":", pgStartupPath); for (lineNumber = 0; lineNumber < lineCount; lineNumber++) { log_level(logLevel, "%s", lines[lineNumber]); } free(fileContents); } /* * Add in the most recent Postgres log file if it's been created after the * startup.log file, it might contain very useful information, such as a * FATAL line(s). 
* * Given that we setup Postgres to use the logging_collector, we expect * there to be a single Postgres log file in the "log" directory that was * created later than the "startup.log" file, and we expect the file to be * rather short. * * Also we setup log_directory to be "log" so that's where we are looking * into. */ /* prepare PGDATA/log directory path */ join_path_components(pgLogDirPath, pgdata, "log"); if (!directory_exists(pgLogDirPath)) { /* then there's no other log files to process here */ return true; } /* get the time of last modification of the startup.log file */ if (lstat(pgStartupPath, &pgStartupStat) != 0) { log_error("Failed to get file information for \"%s\": %m", pgStartupPath); return false; } int64_t pgStartupMtime = ST_MTIME_S(pgStartupStat); /* open and scan through the Postgres log directory */ DIR *logDir = opendir(pgLogDirPath); if (logDir == NULL) { log_error("Failed to open Postgres log directory \"%s\": %m", pgLogDirPath); return false; } while ((logFileDirEntry = readdir(logDir)) != NULL) { char pgLogFilePath[MAXPGPATH] = { 0 }; struct stat pgLogFileStat; /* our logFiles are regular files, skip . and .. and others */ if (logFileDirEntry->d_type != DT_REG) { continue; } /* build the absolute file path for the logfile */ join_path_components(pgLogFilePath, pgLogDirPath, logFileDirEntry->d_name); /* get the file information for the current logFile */ if (lstat(pgLogFilePath, &pgLogFileStat) != 0) { log_error("Failed to get file information for \"%s\": %m", pgLogFilePath); return false; } int64_t pgLogFileMtime = ST_MTIME_S(pgLogFileStat); /* * Compare modification times and only add to our logs the content * from the Postgres log file that was created after the * startup.log file. 
*/ if (pgLogFileMtime >= pgStartupMtime) { char *fileContents; long fileSize; log_level(pathLogLevel, "Postgres logs from \"%s\":", pgLogFilePath); if (read_file(pgLogFilePath, &fileContents, &fileSize) && fileSize > 0) { char *lines[BUFSIZE]; int lineCount = splitLines(fileContents, lines, BUFSIZE); int lineNumber = 0; for (lineNumber = 0; lineNumber < lineCount; lineNumber++) { if (strstr(lines[lineNumber], "FATAL") != NULL) { log_fatal("%s", lines[lineNumber]); } else if (strstr(lines[lineNumber], "ERROR") != NULL) { log_error("%s", lines[lineNumber]); } else { log_level(logLevel, "%s", lines[lineNumber]); } } free(fileContents); } } } closedir(logDir); /* now add the contents of the recovery configuration */ (void) pg_log_recovery_setup(pgdata, logLevel); return true; } /* * pg_log_recovery_setup logs the current Postgres recovery settings from * either the recovery.conf file or the standby setup. In case things go wrong * in the Postgres version detection mechanism, or upgrades, or clean-up, this * logs all the configuration files found rather than only those we expect we * should find. */ bool pg_log_recovery_setup(const char *pgdata, int logLevel) { char *filenames[] = { "recovery.conf", "standby.signal", AUTOCTL_STANDBY_CONF_FILENAME, NULL }; for (int i = 0; filenames[i] != NULL; i++) { char recoveryConfPath[MAXPGPATH] = { 0 }; char *fileContents; long fileSize; join_path_components(recoveryConfPath, pgdata, filenames[i]); if (file_exists(recoveryConfPath)) { if (!read_file(recoveryConfPath, &fileContents, &fileSize)) { /* errors have already been logged */ continue; } if (fileSize > 0) { log_debug("Configuration file \"%s\":\n%s", recoveryConfPath, fileContents); } else { log_debug("Configuration file \"%s\" is empty", recoveryConfPath); } free(fileContents); } } return true; } /* * pg_ctl_stop tries to stop a PostgreSQL server by running a "pg_ctl stop" * command. 
If the server was stopped successfully, or if the server is not * running at all, it returns true. */ bool pg_ctl_stop(const char *pg_ctl, const char *pgdata) { const bool log_output = true; log_info("%s --pgdata %s --wait stop --mode fast", pg_ctl, pgdata); Program program = run_program(pg_ctl, "--pgdata", pgdata, "--wait", "stop", "--mode", "fast", NULL); /* * Case 1. "pg_ctl stop" was successful, so we could stop the PostgreSQL * server successfully. */ if (program.returnCode == 0) { free_program(&program); return true; } /* * Case 2. The data directory doesn't exist. So we assume PostgreSQL is * not running, so stopping the PostgreSQL server was successful. */ bool pgdata_exists = directory_exists(pgdata); if (!pgdata_exists) { log_info("pgdata \"%s\" does not exist, consider this as PostgreSQL " "not running", pgdata); free_program(&program); return true; } /* * Case 3. "pg_ctl stop" returns non-zero return code when PostgreSQL is not * running at all. So we double-check with "pg_ctl status", and return * success if the PostgreSQL server is not running. Otherwise, we return * failure. * * See https://www.postgresql.org/docs/current/static/app-pg-ctl.html */ int status = pg_ctl_status(pg_ctl, pgdata, log_output); if (status == PG_CTL_STATUS_NOT_RUNNING) { log_info("pg_ctl stop failed, but PostgreSQL is not running anyway"); free_program(&program); return true; } log_info("Stopping PostgreSQL server failed. pg_ctl status returned: %d", status); if (log_output) { (void) log_program_output(program, LOG_INFO, LOG_ERROR); } free_program(&program); return false; } /* * pg_ctl_status gets the status of the PostgreSQL server by running * "pg_ctl status". Output of this command is logged if log_output is true. * Return code of this command is returned. */ int pg_ctl_status(const char *pg_ctl, const char *pgdata, bool log_output) { Program program = run_program(pg_ctl, "status", "-D", pgdata, NULL); int returnCode = program.returnCode; log_level(log_output ? 
LOG_INFO : LOG_DEBUG, "%s status -D %s [%d]", pg_ctl, pgdata, returnCode); if (log_output) { (void) log_program_output(program, LOG_INFO, LOG_ERROR); } free_program(&program); return returnCode; } /* * pg_ctl_promote promotes a standby by running "pg_ctl promote" */ bool pg_ctl_promote(const char *pg_ctl, const char *pgdata) { Program program = run_program(pg_ctl, "promote", "-D", pgdata, "--no-wait", NULL); int returnCode = program.returnCode; log_debug("%s promote -D %s --no-wait", pg_ctl, pgdata); if (program.stdErr != NULL) { log_error("%s", program.stdErr); } if (returnCode != 0) { /* pg_ctl promote will have logged errors */ free_program(&program); return false; } free_program(&program); return true; } /* * pg_setup_standby_mode sets up standby mode by either writing a recovery.conf * file or adding the configuration items to postgresql.conf and then creating * a standby.signal file in PGDATA. */ bool pg_setup_standby_mode(uint32_t pg_control_version, const char *pgdata, const char *pg_ctl, ReplicationSource *replicationSource) { if (pg_control_version < 1000) { log_fatal("pg_auto_failover does not support PostgreSQL before " "Postgres 10, we have pg_control version number %d from " "pg_controldata \"%s\"", pg_control_version, pgdata); return false; } /* * Check our primary_conninfo connection string by attempting to connect in * replication mode and issuing a IDENTIFY_SYSTEM command. */ if (!IS_EMPTY_STRING_BUFFER(replicationSource->primaryNode.host) && !pgctl_identify_system(replicationSource)) { log_error("Failed to setup standby mode: can't connect to the primary. " "See above for details"); return false; } if (pg_control_version < 1200) { /* * Before Postgres 12 we used to place recovery configuration in a * specific file recovery.conf, located alongside postgresql.conf. * Controling whether the server would start in PITR or standby mode * was controlled by a setting in the recovery.conf file. 
*/ return pg_write_recovery_conf(pgdata, replicationSource); } else { /* * Starting in Postgres 12 we need to add our recovery configuration to * the main postgresql.conf file and create an empty standby.signal * file to trigger starting the server in standby mode. */ return pg_write_standby_signal(pgdata, replicationSource); } } /* * pg_write_recovery_conf writes a recovery.conf file to a postgres data * directory with the given primary connection info and replication slot name. */ static bool pg_write_recovery_conf(const char *pgdata, ReplicationSource *replicationSource) { char recoveryConfPath[MAXPGPATH]; /* prepare storage areas for parameters */ char primaryConnInfo[MAXCONNINFO] = { 0 }; char primarySlotName[MAXCONNINFO] = { 0 }; char targetLSN[PG_LSN_MAXLENGTH] = { 0 }; char targetAction[NAMEDATALEN] = { 0 }; char targetTimeline[NAMEDATALEN] = { 0 }; GUC recoverySettingsStandby[] = { { "standby_mode", "'on'" }, { "primary_conninfo", (char *) primaryConnInfo }, { "primary_slot_name", (char *) primarySlotName }, { "recovery_target_timeline", (char *) targetTimeline }, { NULL, NULL } }; GUC recoverySettingsTargetLSN[] = { { "standby_mode", "'on'" }, { "primary_conninfo", (char *) primaryConnInfo }, { "primary_slot_name", (char *) primarySlotName }, { "recovery_target_timeline", (char *) targetTimeline }, { "recovery_target_lsn", (char *) targetLSN }, { "recovery_target_inclusive", "'true'" }, { "recovery_target_action", (char *) targetAction }, { NULL, NULL } }; GUC *recoverySettings = IS_EMPTY_STRING_BUFFER(replicationSource->targetLSN) ? 
recoverySettingsStandby : recoverySettingsTargetLSN; bool includeTuning = false; join_path_components(recoveryConfPath, pgdata, "recovery.conf"); log_info("Writing recovery configuration to \"%s\"", recoveryConfPath); if (!prepare_recovery_settings(pgdata, replicationSource, primaryConnInfo, primarySlotName, targetLSN, targetAction, targetTimeline)) { /* errors have already been logged */ return false; } return ensure_default_settings_file_exists(recoveryConfPath, recoverySettings, NULL, NULL, includeTuning); } /* * pg_write_standby_signal writes the ${PGDATA}/standby.signal file that is in * use starting with Postgres 12 for starting a standby server. The file only * needs to exists, and the setup is to be found in the main Postgres * configuration file. */ static bool pg_write_standby_signal(const char *pgdata, ReplicationSource *replicationSource) { char standbyConfigFilePath[MAXPGPATH] = { 0 }; char signalFilePath[MAXPGPATH] = { 0 }; char configFilePath[MAXPGPATH] = { 0 }; /* prepare storage areas for parameters */ char primaryConnInfo[MAXCONNINFO] = { 0 }; char primarySlotName[MAXCONNINFO] = { 0 }; char targetLSN[PG_LSN_MAXLENGTH] = { 0 }; char targetAction[NAMEDATALEN] = { 0 }; char targetTimeline[NAMEDATALEN] = { 0 }; GUC recoverySettingsStandby[] = { { "primary_conninfo", (char *) primaryConnInfo }, { "primary_slot_name", (char *) primarySlotName }, { "recovery_target_timeline", (char *) targetTimeline }, { NULL, NULL } }; GUC recoverySettingsTargetLSN[] = { { "primary_conninfo", (char *) primaryConnInfo }, { "primary_slot_name", (char *) primarySlotName }, { "recovery_target_timeline", (char *) targetTimeline }, { "recovery_target_lsn", (char *) targetLSN }, { "recovery_target_inclusive", "'true'" }, { "recovery_target_action", targetAction }, { NULL, NULL } }; GUC *recoverySettings = IS_EMPTY_STRING_BUFFER(replicationSource->targetLSN) ? 
recoverySettingsStandby : recoverySettingsTargetLSN; bool includeTuning = false; log_trace("pg_write_standby_signal"); if (!prepare_recovery_settings(pgdata, replicationSource, primaryConnInfo, primarySlotName, targetLSN, targetAction, targetTimeline)) { /* errors have already been logged */ return false; } /* set our configuration file paths, all found in PGDATA */ join_path_components(signalFilePath, pgdata, "standby.signal"); join_path_components(configFilePath, pgdata, "postgresql.conf"); join_path_components(standbyConfigFilePath, pgdata, AUTOCTL_STANDBY_CONF_FILENAME); /* * First install the standby.signal file, so that if there's a problem * later and Postgres is started, it is started as a standby, with missing * configuration. */ /* only logs about this the first time */ if (!file_exists(signalFilePath)) { log_info("Creating the standby signal file at \"%s\", " "and replication setup at \"%s\"", signalFilePath, standbyConfigFilePath); } if (!write_file("", 0, signalFilePath)) { /* write_file logs I/O error */ return false; } /* * Now write the standby settings to postgresql-auto-failover-standby.conf * and include that file from postgresql.conf. * * we pass NULL as pgSetup because we know it won't be used... */ if (!ensure_default_settings_file_exists(standbyConfigFilePath, recoverySettings, NULL, NULL, includeTuning)) { return false; } /* * We successfully created the standby.signal file, so Postgres will start * as a standby. If we fail to install the standby settings, then we return * false here and let the main loop try again. At least Postgres won't * start as a cloned single accepting writes. 
*/ if (!pg_include_config(configFilePath, AUTOCTL_SB_CONF_INCLUDE_LINE, AUTOCTL_CONF_INCLUDE_COMMENT)) { log_error("Failed to prepare \"%s\" with standby settings", standbyConfigFilePath); return false; } return true; } /* * prepare_recovery_settings prepares the settings that we need to install in * either recovery.conf or our own postgresql-auto-failover-standby.conf * depending on the Postgres major version. */ static bool prepare_recovery_settings(const char *pgdata, ReplicationSource *replicationSource, char *primaryConnInfo, char *primarySlotName, char *targetLSN, char *targetAction, char *targetTimeline) { bool escape = true; NodeAddress *primaryNode = &(replicationSource->primaryNode); /* when reaching REPORT_LSN we set recovery with no primary conninfo */ if (!IS_EMPTY_STRING_BUFFER(primaryNode->host)) { log_debug("prepare_recovery_settings: " "primary node %" PRId64 " \"%s\" (%s:%d)", primaryNode->nodeId, primaryNode->name, primaryNode->host, primaryNode->port); if (!prepare_primary_conninfo(primaryConnInfo, MAXCONNINFO, primaryNode->host, primaryNode->port, replicationSource->userName, NULL, /* no database */ replicationSource->password, replicationSource->applicationName, replicationSource->sslOptions, escape)) { /* errors have already been logged. */ return false; } } else { log_debug("prepare_recovery_settings: no primary node!"); } /* * We don't always have a replication slot name to use when connecting to a * standby node. 
*/ if (!IS_EMPTY_STRING_BUFFER(replicationSource->slotName)) { sformat(primarySlotName, MAXCONNINFO, "'%s'", replicationSource->slotName); } /* The default target timeline is 'latest' */ if (IS_EMPTY_STRING_BUFFER(replicationSource->targetTimeline)) { sformat(targetTimeline, NAMEDATALEN, "'latest'"); } else { sformat(targetTimeline, NAMEDATALEN, "'%s'", replicationSource->targetTimeline); } /* We use the targetLSN only when doing a WAL fast_forward operation */ if (!IS_EMPTY_STRING_BUFFER(replicationSource->targetLSN)) { sformat(targetLSN, PG_LSN_MAXLENGTH, "'%s'", replicationSource->targetLSN); } /* The default target Action is 'pause' */ if (IS_EMPTY_STRING_BUFFER(replicationSource->targetAction)) { sformat(targetAction, NAMEDATALEN, "'pause'"); } else { sformat(targetAction, NAMEDATALEN, "'%s'", replicationSource->targetAction); } return true; } /* * pg_cleanup_standby_mode cleans-up the replication settings for the local * instance of Postgres found at pgdata. * * - remove either recovery.conf or standby.signal * * - when using Postgres 12 also make postgresql-auto-failover-standby.conf an * empty file, so that we can still include it, but it has no effect. 
*/ bool pg_cleanup_standby_mode(uint32_t pg_control_version, const char *pg_ctl, const char *pgdata, PGSQL *pgsql) { if (pg_control_version < 1200) { char recoveryConfPath[MAXPGPATH]; join_path_components(recoveryConfPath, pgdata, "recovery.conf"); log_debug("pg_cleanup_standby_mode: rm \"%s\"", recoveryConfPath); if (!unlink_file(recoveryConfPath)) { /* errors have already been logged */ return false; } } else { char standbyConfigFilePath[MAXPGPATH]; char signalFilePath[MAXPGPATH]; join_path_components(signalFilePath, pgdata, "standby.signal"); join_path_components(standbyConfigFilePath, pgdata, AUTOCTL_STANDBY_CONF_FILENAME); log_debug("pg_cleanup_standby_mode: rm \"%s\"", signalFilePath); if (!unlink_file(signalFilePath)) { /* errors have already been logged */ return false; } /* empty out the standby configuration file */ log_debug("pg_cleanup_standby_mode: > \"%s\"", standbyConfigFilePath); if (!write_file("", 0, standbyConfigFilePath)) { /* write_file logs I/O error */ return false; } } return true; } /* * escape_recovery_conf_string escapes a string that is used in a recovery.conf * file by converting single quotes into two single quotes. * * The result is written to destination and the length of the result. 
*/ static bool escape_recovery_conf_string(char *destination, int destinationSize, const char *recoveryConfString) { int charIndex = 0; int length = strlen(recoveryConfString); int escapedStringLength = 0; /* we are going to add at least 3 chars: two quotes and a NUL character */ if (destinationSize < (length + 3)) { log_error("BUG: failed to escape recovery parameter value \"%s\" " "in a buffer of %d bytes", recoveryConfString, destinationSize); return false; } destination[escapedStringLength++] = '\''; for (charIndex = 0; charIndex < length; charIndex++) { char currentChar = recoveryConfString[charIndex]; if (currentChar == '\'') { destination[escapedStringLength++] = '\''; if (destinationSize < escapedStringLength) { log_error( "BUG: failed to escape recovery parameter value \"%s\" " "in a buffer of %d bytes, stopped at index %d", recoveryConfString, destinationSize, charIndex); return false; } } destination[escapedStringLength++] = currentChar; if (destinationSize < escapedStringLength) { log_error("BUG: failed to escape recovery parameter value \"%s\" " "in a buffer of %d bytes, stopped at index %d", recoveryConfString, destinationSize, charIndex); return false; } } destination[escapedStringLength++] = '\''; destination[escapedStringLength] = '\0'; return true; } /* * prepare_primary_conninfo prepares a connection string to the primary server. * The connection string may be used unquoted in a command line calling either * pg_basebackup ro pg_rewind, or may be used quoted in the primary_conninfo * setting for PostgreSQL. * * Also, pg_rewind needs a database to connect to. 
*/ static bool prepare_primary_conninfo(char *primaryConnInfo, int primaryConnInfoSize, const char *primaryHost, int primaryPort, const char *replicationUsername, const char *dbname, const char *replicationPassword, const char *applicationName, SSLOptions sslOptions, bool escape) { int size = 0; char escaped[BUFSIZE]; if (IS_EMPTY_STRING_BUFFER(primaryHost)) { log_debug("prepare_primary_conninfo: missing primary hostname"); bzero((void *) primaryConnInfo, primaryConnInfoSize); return true; } PQExpBuffer buffer = createPQExpBuffer(); if (buffer == NULL) { log_error("Failed to allocate memory"); return false; } /* application_name shows up in pg_stat_replication on the primary */ appendPQExpBuffer(buffer, "application_name=%s", applicationName); appendPQExpBuffer(buffer, " host=%s", primaryHost); appendPQExpBuffer(buffer, " port=%d", primaryPort); appendPQExpBuffer(buffer, " user=%s", replicationUsername); if (dbname != NULL) { appendPQExpBuffer(buffer, " dbname=%s", dbname); } if (replicationPassword != NULL && !IS_EMPTY_STRING_BUFFER(replicationPassword)) { appendPQExpBuffer(buffer, " password=%s", replicationPassword); } appendPQExpBufferStr(buffer, " "); if (!prepare_conninfo_sslmode(buffer, sslOptions)) { /* errors have already been logged */ destroyPQExpBuffer(buffer); return false; } /* memory allocation could have failed while building string */ if (PQExpBufferBroken(buffer)) { log_error("Failed to allocate memory"); destroyPQExpBuffer(buffer); return false; } if (escape) { if (!escape_recovery_conf_string(escaped, BUFSIZE, buffer->data)) { /* errors have already been logged. 
*/ destroyPQExpBuffer(buffer); return false; } /* now copy the buffer into primaryConnInfo for the caller */ size = sformat(primaryConnInfo, primaryConnInfoSize, "%s", escaped); if (size == -1 || size > primaryConnInfoSize) { log_error("BUG: the escaped primary_conninfo requires %d bytes and " "pg_auto_failover only support up to %d bytes", size, primaryConnInfoSize); destroyPQExpBuffer(buffer); return false; } } else { strlcpy(primaryConnInfo, buffer->data, primaryConnInfoSize); } destroyPQExpBuffer(buffer); return true; } /* * prepare_conninfo_sslmode adds the sslmode setting to the buffer, which is * used as a connection string. */ static bool prepare_conninfo_sslmode(PQExpBuffer buffer, SSLOptions sslOptions) { if (sslOptions.sslMode == SSL_MODE_UNKNOWN) { if (sslOptions.active) { /* that's a bug really */ log_error("SSL is active in the configuration, " "but sslmode is unknown"); return false; } return true; } appendPQExpBuffer(buffer, "sslmode=%s", pgsetup_sslmode_to_string(sslOptions.sslMode)); if (sslOptions.sslMode >= SSL_MODE_VERIFY_CA) { /* ssl revocation list might not be provided, it's ok */ if (!IS_EMPTY_STRING_BUFFER(sslOptions.crlFile)) { appendPQExpBuffer(buffer, " sslrootcert=%s sslcrl=%s", sslOptions.caFile, sslOptions.crlFile); } else { appendPQExpBuffer(buffer, " sslrootcert=%s", sslOptions.caFile); } } return true; } /* * pgctl_identify_system connects with replication=1 to our target node and run * the IDENTIFY_SYSTEM command to check that HBA is ready. 
*/ bool pgctl_identify_system(ReplicationSource *replicationSource) { NodeAddress *primaryNode = &(replicationSource->primaryNode); char primaryConnInfo[MAXCONNINFO] = { 0 }; char primaryConnInfoReplication[MAXCONNINFO] = { 0 }; PGSQL replicationClient = { 0 }; if (!prepare_primary_conninfo(primaryConnInfo, MAXCONNINFO, primaryNode->host, primaryNode->port, replicationSource->userName, NULL, /* no database */ replicationSource->password, replicationSource->applicationName, replicationSource->sslOptions, false)) /* no need for escaping */ { /* errors have already been logged. */ return false; } /* * Per https://www.postgresql.org/docs/12/protocol-replication.html: * * To initiate streaming replication, the frontend sends the replication * parameter in the startup message. A Boolean value of true (or on, yes, * 1) tells the backend to go into physical replication walsender mode, * wherein a small set of replication commands, shown below, can be issued * instead of SQL statements. */ int len = sformat(primaryConnInfoReplication, MAXCONNINFO, "%s replication=1", primaryConnInfo); if (len >= MAXCONNINFO) { log_warn("Failed to call IDENTIFY_SYSTEM: primary_conninfo too large"); return false; } if (!pgsql_init(&replicationClient, primaryConnInfoReplication, PGSQL_CONN_UPSTREAM)) { /* errors have already been logged */ return false; } if (!pgsql_identify_system(&replicationClient, &(replicationSource->system))) { /* errors have already been logged */ return false; } return true; } /* * pg_is_running returns true if PostgreSQL is running. 
*/ bool pg_is_running(const char *pg_ctl, const char *pgdata) { return pg_ctl_status(pg_ctl, pgdata, false) == 0; } /* * pg_create_self_signed_cert creates self-signed certificates for the local * Postgres server and places the private key in $PGDATA/server.key and the * public certificate in $PGDATA/server.cert * * We simply follow Postgres documentation at: * https://www.postgresql.org/docs/current/ssl-tcp.html#SSL-CERTIFICATE-CREATION * * openssl req -new -x509 -days 365 -nodes -text -out server.crt \ * -keyout server.key -subj "/CN=dbhost.yourdomain.com" */ bool pg_create_self_signed_cert(PostgresSetup *pgSetup, const char *hostname) { char subject[BUFSIZE] = { 0 }; char openssl[MAXPGPATH] = { 0 }; if (!search_path_first("openssl", openssl, LOG_ERROR)) { /* errors have already been logged */ return false; } /* ensure PGDATA has been normalized */ if (!normalize_filename(pgSetup->pgdata, pgSetup->pgdata, MAXPGPATH)) { return false; } int size = sformat(pgSetup->ssl.serverKey, MAXPGPATH, "%s/server.key", pgSetup->pgdata); if (size == -1 || size > MAXPGPATH) { log_error("BUG: the ssl server key file path requires %d bytes and " "pg_auto_failover only support up to %d bytes", size, MAXPGPATH); return false; } size = sformat(pgSetup->ssl.serverCert, MAXPGPATH, "%s/server.crt", pgSetup->pgdata); if (size == -1 || size > MAXPGPATH) { log_error("BUG: the ssl server key file path requires %d bytes and " "pg_auto_failover only support up to %d bytes", size, MAXPGPATH); return false; } size = sformat(subject, BUFSIZE, "/CN=%s", hostname); if (size == -1 || size > BUFSIZE) { log_error("BUG: the ssl subject \"/CN=%s\" requires %d bytes and" "pg_auto_failover only support up to %d bytes", hostname, size, BUFSIZE); return false; } log_info(" %s req -new -x509 -days 365 -nodes -text " "-out %s -keyout %s -subj \"%s\"", openssl, pgSetup->ssl.serverCert, pgSetup->ssl.serverKey, subject); Program program = run_program(openssl, "req", "-new", "-x509", "-days", "365", "-nodes", 
"-text", "-out", pgSetup->ssl.serverCert, "-keyout", pgSetup->ssl.serverKey, "-subj", subject, NULL); if (program.returnCode != 0) { (void) log_program_output(program, LOG_INFO, LOG_ERROR); log_error("openssl failed with return code: %d", program.returnCode); free_program(&program); return false; } (void) log_program_output(program, LOG_DEBUG, LOG_DEBUG); free_program(&program); /* * Then do: chmod og-rwx server.key */ if (chmod(pgSetup->ssl.serverKey, S_IRUSR | S_IWUSR) != 0) { log_error("Failed to chmod og-rwx \"%s\": %m", pgSetup->ssl.serverKey); return false; } return true; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/pgctl.h000066400000000000000000000051731414244367200220450ustar00rootroot00000000000000/* * src/bin/pg_autoctl/pgctl.h * API for controling PostgreSQL, using its binary tooling (pg_ctl, * pg_controldata, pg_basebackup and such). * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef PGCTL_H #define PGCTL_H #include #include #include #include "postgres_fe.h" #include "utils/pidfile.h" #include "defaults.h" #include "file_utils.h" #include "pgsetup.h" #include "pgsql.h" #define AUTOCTL_DEFAULTS_CONF_FILENAME "postgresql-auto-failover.conf" #define AUTOCTL_STANDBY_CONF_FILENAME "postgresql-auto-failover-standby.conf" #define PG_CTL_STATUS_NOT_RUNNING 3 bool pg_controldata(PostgresSetup *pgSetup, bool missing_ok); bool set_pg_ctl_from_PG_CONFIG(PostgresSetup *pgSetup); bool set_pg_ctl_from_pg_config(PostgresSetup *pgSetup); bool config_find_pg_ctl(PostgresSetup *pgSetup); bool find_extension_control_file(const char *pg_ctl, const char *extName); bool pg_ctl_version(PostgresSetup *pgSetup); bool set_pg_ctl_from_config_bindir(PostgresSetup *pgSetup, const char *pg_config); bool find_pg_config_from_pg_ctl(const char *pg_ctl, char *pg_config, size_t size); bool pg_add_auto_failover_default_settings(PostgresSetup *pgSetup, const char *hostname, const char *configFilePath, GUC *settings); bool 
pg_auto_failover_default_settings_file_exists(PostgresSetup *pgSetup); bool pg_basebackup(const char *pgdata, const char *pg_ctl, ReplicationSource *replicationSource); bool pg_rewind(const char *pgdata, const char *pg_ctl, ReplicationSource *replicationSource); bool pg_ctl_initdb(const char *pg_ctl, const char *pgdata); bool pg_ctl_postgres(const char *pg_ctl, const char *pgdata, int pgport, char *listen_addresses, bool listen); bool pg_log_startup(const char *pgdata, int logLevel); bool pg_log_recovery_setup(const char *pgdata, int logLevel); bool pg_ctl_stop(const char *pg_ctl, const char *pgdata); int pg_ctl_status(const char *pg_ctl, const char *pgdata, bool log_output); bool pg_ctl_promote(const char *pg_ctl, const char *pgdata); bool pg_setup_standby_mode(uint32_t pg_control_version, const char *pg_ctl, const char *pgdata, ReplicationSource *replicationSource); bool pg_cleanup_standby_mode(uint32_t pg_control_version, const char *pg_ctl, const char *pgdata, PGSQL *pgsql); bool pgctl_identify_system(ReplicationSource *replicationSource); bool pg_is_running(const char *pg_ctl, const char *pgdata); bool pg_create_self_signed_cert(PostgresSetup *pgSetup, const char *hostname); #endif /* PGCTL_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/pghba.c000066400000000000000000000507431414244367200220130ustar00rootroot00000000000000/* * src/bin/pg_autoctl/pghba.c * Functions for manipulating pg_hba.conf * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include "postgres_fe.h" #include "pqexpbuffer.h" #include "defaults.h" #include "file_utils.h" #include "ipaddr.h" #include "parsing.h" #include "pgctl.h" #include "pghba.h" #include "pgsetup.h" #include "log.h" #define HBA_LINE_COMMENT " # Auto-generated by pg_auto_failover" static bool pghba_append_rule_to_buffer(PQExpBuffer buffer, bool ssl, HBADatabaseType databaseType, const char *database, const char *username, const char *host, const char *authenticationScheme); static void append_database_field(PQExpBuffer destination, HBADatabaseType databaseType, const char *databaseName); static void append_hostname_or_cidr(PQExpBuffer destination, const char *host); static int escape_hba_string(char *destination, const char *hbaString); /* * pghba_append_rule_to_buffer creates a new HBA rule with the given database, * username, host and authentication scheme in the given buffer. */ static bool pghba_append_rule_to_buffer(PQExpBuffer buffer, bool ssl, HBADatabaseType databaseType, const char *database, const char *username, const char *host, const char *authenticationScheme) { if (ssl) { appendPQExpBufferStr(buffer, "hostssl "); } else { appendPQExpBufferStr(buffer, "host "); } append_database_field(buffer, databaseType, database); appendPQExpBufferStr(buffer, " "); if (username) { char escapedUsername[BUFSIZE] = { 0 }; (void) escape_hba_string(escapedUsername, username); appendPQExpBufferStr(buffer, escapedUsername); appendPQExpBufferStr(buffer, " "); } else { appendPQExpBufferStr(buffer, "all "); } append_hostname_or_cidr(buffer, host); appendPQExpBuffer(buffer, " %s", authenticationScheme); /* memory allocation could have failed while building string */ if (PQExpBufferBroken(buffer)) { log_error("Failed to allocate memory"); destroyPQExpBuffer(buffer); return false; } return true; } /* * pghba_ensure_host_rule_exists ensures that a host rule exists in the * pg_hba file with the given database, username, host and 
authentication * scheme. */ bool pghba_ensure_host_rule_exists(const char *hbaFilePath, bool ssl, HBADatabaseType databaseType, const char *database, const char *username, const char *host, const char *authenticationScheme, HBAEditLevel hbaLevel) { char *currentHbaContents = NULL; long currentHbaSize = 0L; PQExpBuffer hbaLineBuffer = createPQExpBuffer(); char ipaddr[BUFSIZE] = { 0 }; if (hbaLineBuffer == NULL) { log_error("Failed to allocate memory"); return false; } /* * When using a hostname in the HBA host field, Postgres is very picky * about the matching rules. We have an opportunity here to check the same * DNS and reverse DNS rules as Postgres, and warn our users when we see * something that we know Postgres won't be happy with. * * HBA & DNS is hard. */ bool useHostname = false; if (!pghba_check_hostname(host, ipaddr, sizeof(ipaddr), &useHostname)) { /* errors have already been logged (DNS failure) */ } if (!useHostname) { log_warn("Using IP address \"%s\" in HBA file " "instead of hostname \"%s\"", ipaddr, host); } if (!pghba_append_rule_to_buffer(hbaLineBuffer, ssl, databaseType, database, username, useHostname ? host : ipaddr, authenticationScheme)) { /* errors have already been logged */ /* done with the new HBA line buffer */ destroyPQExpBuffer(hbaLineBuffer); return false; } log_debug("Ensuring the HBA file \"%s\" contains the line: %s", hbaFilePath, hbaLineBuffer->data); if (!read_file(hbaFilePath, ¤tHbaContents, ¤tHbaSize)) { /* read_file logs an error */ /* done with the new HBA line buffer */ destroyPQExpBuffer(hbaLineBuffer); return false; } char *includeLine = strstr(currentHbaContents, hbaLineBuffer->data); /* * If the rule was found and it starts on a new line. We can * skip adding it. 
*/ if (includeLine != NULL && (includeLine == currentHbaContents || includeLine[-1] == '\n')) { log_debug("Line already exists in %s, skipping %s", hbaFilePath, hbaLineBuffer->data); destroyPQExpBuffer(hbaLineBuffer); free(currentHbaContents); return true; } /* * When the option --skip-pg-hba has been used, we still WARN about the HBA * rule that we need, so that users can review their HBA settings and * provisioning. */ if (hbaLevel <= HBA_EDIT_SKIP) { log_warn("Skipping HBA edits (per --skip-pg-hba) for rule: %s", hbaLineBuffer->data); destroyPQExpBuffer(hbaLineBuffer); free(currentHbaContents); return true; } /* build the new postgresql.conf contents */ PQExpBuffer newHbaContents = createPQExpBuffer(); if (newHbaContents == NULL) { log_error("Failed to allocate memory"); destroyPQExpBuffer(hbaLineBuffer); free(currentHbaContents); return false; } appendPQExpBufferStr(newHbaContents, currentHbaContents); appendPQExpBufferStr(newHbaContents, hbaLineBuffer->data); appendPQExpBufferStr(newHbaContents, HBA_LINE_COMMENT "\n"); /* done with the old pg_hba.conf contents */ free(currentHbaContents); /* done with the new HBA line buffer */ destroyPQExpBuffer(hbaLineBuffer); /* memory allocation could have failed while building string */ if (PQExpBufferBroken(newHbaContents)) { log_error("Failed to allocate memory"); destroyPQExpBuffer(newHbaContents); return false; } /* write the new pg_hba.conf */ if (!write_file(newHbaContents->data, newHbaContents->len, hbaFilePath)) { /* write_file logs an error */ destroyPQExpBuffer(newHbaContents); return false; } destroyPQExpBuffer(newHbaContents); log_debug("Wrote new %s", hbaFilePath); return true; } /* * pghba_ensure_host_rules_exist ensures that we have all the rules needed for * the given array of nodes, as retrived from the monitor for our formation and * group, presumably. 
* * Each node in the array needs two rules: * * host(ssl) replication "pgautofailover_replicator" hostname/ip trust * host(ssl) "dbname" "pgautofailover_replicator" hostname/ip trust */ bool pghba_ensure_host_rules_exist(const char *hbaFilePath, NodeAddressArray *nodesArray, bool ssl, const char *database, const char *username, const char *authenticationScheme, HBAEditLevel hbaLevel) { PQExpBuffer newHbaContents = createPQExpBuffer(); char *currentHbaContents = NULL; long currentHbaSize = 0L; char *includeLine = NULL; int nodeIndex = 0; int hbaLinesAdded = 0; PQExpBuffer hbaLineReplicationBuffer = NULL; PQExpBuffer hbaLineDatabaseBuffer = NULL; if (newHbaContents == NULL) { log_error("Failed to allocate memory"); return false; } if (!read_file(hbaFilePath, ¤tHbaContents, ¤tHbaSize)) { /* read_file logs an error */ destroyPQExpBuffer(newHbaContents); return false; } /* always begin with the existing HBA file */ appendPQExpBufferStr(newHbaContents, currentHbaContents); for (nodeIndex = 0; nodeIndex < nodesArray->count; nodeIndex++) { NodeAddress *node = &(nodesArray->nodes[nodeIndex]); int hbaLinesIndex = 0; PQExpBuffer hbaLines[3] = { 0 }; bool useHostname = true; char ipaddr[BUFSIZE] = { 0 }; /* done with the new HBA line buffers (and safe to call on NULL) */ destroyPQExpBuffer(hbaLineReplicationBuffer); destroyPQExpBuffer(hbaLineDatabaseBuffer); /* we need new buffers now */ hbaLineReplicationBuffer = createPQExpBuffer(); hbaLineDatabaseBuffer = createPQExpBuffer(); if (hbaLineReplicationBuffer == NULL || hbaLineDatabaseBuffer == NULL) { log_error("Failed to allocate memory"); /* done with the old pg_hba.conf contents */ free(currentHbaContents); destroyPQExpBuffer(newHbaContents); /* done with the new HBA line buffers */ destroyPQExpBuffer(hbaLineReplicationBuffer); destroyPQExpBuffer(hbaLineDatabaseBuffer); return false; } if (hbaLevel >= HBA_EDIT_MINIMAL) { /* * When using a hostname in the HBA host field, Postgres is very * picky about the matching rules. 
We have an opportunity here to * check the same DNS and reverse DNS rules as Postgres, and warn * our users when we see something that we know Postgres won't be * happy with. * * HBA & DNS is hard. */ if (!pghba_check_hostname(node->host, ipaddr, sizeof(ipaddr), &useHostname)) { /* errors have already been logged (DNS failure) */ } if (!useHostname) { log_warn("Using IP address \"%s\" in HBA file " "instead of hostname \"%s\"", ipaddr, node->host); } } log_debug("pghba_ensure_host_rules_exist: %" PRId64 " \"%s\" (%s:%d)", node->nodeId, node->name, useHostname ? node->host : ipaddr, node->port); if (!pghba_append_rule_to_buffer(hbaLineReplicationBuffer, ssl, HBA_DATABASE_REPLICATION, NULL, username, useHostname ? node->host : ipaddr, authenticationScheme)) { /* errors have already been logged */ /* done with the old pg_hba.conf contents */ free(currentHbaContents); destroyPQExpBuffer(newHbaContents); /* done with the new HBA line buffers (and safe to call on NULL) */ destroyPQExpBuffer(hbaLineReplicationBuffer); destroyPQExpBuffer(hbaLineDatabaseBuffer); return false; } if (!pghba_append_rule_to_buffer(hbaLineDatabaseBuffer, ssl, HBA_DATABASE_DBNAME, database, username, useHostname ? node->host : ipaddr, authenticationScheme)) { /* errors have already been logged */ /* done with the old pg_hba.conf contents */ free(currentHbaContents); destroyPQExpBuffer(newHbaContents); /* done with the new HBA line buffers (and safe to call on NULL) */ destroyPQExpBuffer(hbaLineReplicationBuffer); destroyPQExpBuffer(hbaLineDatabaseBuffer); return false; } log_info("%s HBA rules for node %" PRId64 " \"%s\" (%s:%d)", hbaLevel < HBA_EDIT_MINIMAL ? "Checking for" : "Ensuring", node->nodeId, node->name, useHostname ? 
node->host : ipaddr, node->port); hbaLines[0] = hbaLineReplicationBuffer; hbaLines[1] = hbaLineDatabaseBuffer; for (hbaLinesIndex = 0; hbaLines[hbaLinesIndex] != NULL; hbaLinesIndex++) { PQExpBuffer hbaLineBuffer = hbaLines[hbaLinesIndex]; log_debug("Ensuring the HBA file \"%s\" contains the line: %s", hbaFilePath, hbaLineBuffer->data); includeLine = strstr(currentHbaContents, hbaLineBuffer->data); /* * If the rule was found and it starts on a new line. We can * skip adding it. */ if (includeLine != NULL && (includeLine == currentHbaContents || includeLine[-1] == '\n')) { log_debug("Line already exists in %s, skipping %s", hbaFilePath, hbaLineBuffer->data); continue; } /* * When the option --skip-pg-hba has been used, we still WARN about * the HBA rule that we need, so that users can review their HBA * settings and provisioning. */ if (hbaLevel < HBA_EDIT_MINIMAL) { log_warn("Skipping HBA edits (per --skip-pg-hba) for rule: %s", hbaLineBuffer->data); } else { /* now append the line to the new HBA file contents */ log_info("Adding HBA rule: %s", hbaLineBuffer->data); appendPQExpBufferStr(newHbaContents, hbaLineBuffer->data); appendPQExpBufferStr(newHbaContents, HBA_LINE_COMMENT "\n"); ++hbaLinesAdded; } } } /* done with the new HBA line buffers (and safe to call on NULL) */ destroyPQExpBuffer(hbaLineReplicationBuffer); destroyPQExpBuffer(hbaLineDatabaseBuffer); /* done with the old pg_hba.conf contents */ free(currentHbaContents); /* memory allocation could have failed while building string */ if (PQExpBufferBroken(newHbaContents)) { log_error("Failed to allocate memory"); destroyPQExpBuffer(newHbaContents); return false; } /* write the new pg_hba.conf, unless --skip-pg-hba has been used */ if (hbaLevel >= HBA_EDIT_MINIMAL && hbaLinesAdded > 0) { log_info("Writing new HBA rules in \"%s\"", hbaFilePath); if (!write_file(newHbaContents->data, newHbaContents->len, hbaFilePath)) { /* write_file logs an error */ destroyPQExpBuffer(newHbaContents); return false; } } 
destroyPQExpBuffer(newHbaContents); log_debug("Wrote new %s", hbaFilePath); return true; } /* * append_database_field writes the database field to destination according to * the databaseType. If the type is HBA_DATABASE_DBNAME then the databaseName * is written in quoted form. */ static void append_database_field(PQExpBuffer destination, HBADatabaseType databaseType, const char *databaseName) { switch (databaseType) { case HBA_DATABASE_ALL: { appendPQExpBufferStr(destination, "all"); break; } case HBA_DATABASE_REPLICATION: { appendPQExpBufferStr(destination, "replication"); break; } case HBA_DATABASE_DBNAME: default: { /* Postgres database names are NAMEDATALEN (64), BUFSIZE is 1024 */ char escapedDatabaseName[BUFSIZE] = { 0 }; (void) escape_hba_string(escapedDatabaseName, databaseName); appendPQExpBufferStr(destination, escapedDatabaseName); break; } } } /* * append_hostname_or_cidr checks whether the host is an IP and if so converts * it to a CIDR and writes it to destination. Otherwise, convert_ip_to_cidr * writes the host directly to the destination. */ static void append_hostname_or_cidr(PQExpBuffer destination, const char *host) { switch (ip_address_type(host)) { case IPTYPE_V4: { appendPQExpBuffer(destination, "%s/32", host); break; } case IPTYPE_V6: { appendPQExpBuffer(destination, "%s/128", host); break; } case IPTYPE_NONE: default: { appendPQExpBufferStr(destination, host); break; } } } /* * escape_hba_string escapes a string that is used in a pg_hba.conf file * and writes it to the destination. escape_hba_string returns the number * of characters written. * * While this is not documented, the code in hba.c (next_token) implements * two double-quotes as a literal double quote. 
*/ static int escape_hba_string(char *destination, const char *hbaString) { int charIndex = 0; int length = strlen(hbaString); int escapedStringLength = 0; destination[escapedStringLength++] = '"'; for (charIndex = 0; charIndex < length; charIndex++) { char currentChar = hbaString[charIndex]; if (currentChar == '"') { destination[escapedStringLength++] = '"'; } destination[escapedStringLength++] = currentChar; } destination[escapedStringLength++] = '"'; destination[escapedStringLength] = '\0'; return escapedStringLength; } /* * pghba_enable_lan_cidr adds our local CIDR network notation (e.g. * 192.168.0.0/23) to the HBA file of the PostgreSQL server, so that any node * in the local network may connect already. * * Failure is a warning only. * * In normal cases, pgdata is NULL and pghba_enable_lan_cidr queries the local * PostgreSQL server for the location of its HBA file. * * When initializing a PostgreSQL cluster in a test environment using * PG_REGRESS_SOCK_DIR="" and --listen options, then we have to add an HBA rule * before starting PostgreSQL, otherwise we don't have a path to connect to it. * In that case we pass in PGDATA and pghba_enable_lan_cidr uses the file * PGDATA/pg_hba.conf as the hbaFilePath: we just did `pg_ctl initdb` after * all, it should be safe. */ bool pghba_enable_lan_cidr(PGSQL *pgsql, bool ssl, HBADatabaseType databaseType, const char *database, const char *hostname, const char *username, const char *authenticationScheme, HBAEditLevel hbaLevel, const char *pgdata) { char hbaFilePath[MAXPGPATH]; char ipAddr[BUFSIZE]; char cidr[BUFSIZE]; /* Compute the CIDR notation for our hostname */ if (!findHostnameLocalAddress(hostname, ipAddr, BUFSIZE)) { int logLevel = hbaLevel <= HBA_EDIT_SKIP ? 
LOG_WARN : LOG_FATAL; log_level(logLevel, "Failed to find IP address for hostname \"%s\", " "see above for details", hostname); /* when --skip-pg-hba is used, we don't mind the failure here */ return hbaLevel == HBA_EDIT_SKIP; } if (!fetchLocalCIDR(ipAddr, cidr, BUFSIZE)) { log_warn("Failed to determine network configuration for " "IP address \"%s\", skipping HBA settings", ipAddr); /* when --skip-pg-hba is used, we don't mind the failure here */ return hbaLevel == HBA_EDIT_SKIP; } log_debug("HBA: adding CIDR from hostname \"%s\"", hostname); log_debug("HBA: local ip address: %s", ipAddr); log_debug("HBA: CIDR address to open: %s", cidr); log_info("Granting connection privileges on %s", cidr); /* The caller gives pgdata when PostgreSQL is not yet running */ if (pgdata == NULL) { if (!pgsql_get_hba_file_path(pgsql, hbaFilePath, MAXPGPATH)) { /* unexpected */ log_error("Failed to obtain the HBA file path from the local " "PostgreSQL server."); return false; } } else { sformat(hbaFilePath, MAXPGPATH, "%s/pg_hba.conf", pgdata); } /* * We still go on when skipping HBA, so that we display a useful message to * the user with the specific rule we are skipping here. */ if (!pghba_ensure_host_rule_exists(hbaFilePath, ssl, databaseType, database, username, cidr, authenticationScheme, hbaLevel)) { log_error("Failed to add the local network to PostgreSQL HBA file: " "couldn't modify the pg_hba file"); return false; } /* * pgdata is given when PostgreSQL is not yet running, don't reload then... */ if (pgdata == NULL && hbaLevel >= HBA_EDIT_MINIMAL && !pgsql_reload_conf(pgsql)) { log_error("Failed to reload PostgreSQL configuration for new HBA rule"); return false; } return true; } /* * hba_check_hostname returns true when the DNS setting looks compatible with * Postgres expectations for an HBA hostname entry. 
* * https://www.postgresql.org/docs/current/auth-pg-hba-conf.html * * If a host name is specified (anything that is not an IP address range or a * special key word is treated as a host name), that name is compared with the * result of a reverse name resolution of the client's IP address (e.g., * reverse DNS lookup, if DNS is used). Host name comparisons are case * insensitive. If there is a match, then a forward name resolution (e.g., * forward DNS lookup) is performed on the host name to check whether any of * the addresses it resolves to are equal to the client's IP address. If both * directions match, then the entry is considered to match. (The host name that * is used in pg_hba.conf should be the one that address-to-name resolution of * the client's IP address returns, otherwise the line won't be matched. Some * host name databases allow associating an IP address with multiple host * names, but the operating system will only return one host name when asked to * resolve an IP address.) */ bool pghba_check_hostname(const char *hostname, char *ipaddr, size_t size, bool *useHostname) { /* * IP addresses do not require any DNS properties/lookups. Also hostname * won't contain a '/' character, but CIDR notations would, such as * 1.2.3.4/32 or ::1/128. We don't want to trust ip_address_type() value of * IPTYPE_NONE when we find a '/' character in the hostname. 
* */ if (strchr(hostname, '/') || ip_address_type(hostname) != IPTYPE_NONE) { *useHostname = true; return true; } bool foundHostnameFromAddress = false; if (!resolveHostnameForwardAndReverse(hostname, ipaddr, size, &foundHostnameFromAddress)) { /* errors have already been logged (DNS failure) */ *useHostname = true; return false; } if (foundHostnameFromAddress) { *useHostname = true; log_debug("pghba_check_hostname: \"%s\" <-> %s", hostname, ipaddr); return true; } *useHostname = false; /* warn users about possible DNS misconfiguration */ log_warn("Failed to resolve hostname \"%s\" to an IP address that " "resolves back to the hostname on a reverse DNS lookup.", hostname); log_warn("Postgres might deny connection attempts from \"%s\", " "even with the new HBA rules.", hostname); log_warn("Hint: correct setup of HBA with host names requires proper " "reverse DNS setup. You might want to use IP addresses."); /* we could successfully check that we should not use the hostname */ return true; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/pghba.h000066400000000000000000000025711414244367200220140ustar00rootroot00000000000000/* * src/bin/pg_autoctl/pghba.h * API for manipulating pg_hba.conf files * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #ifndef PGHBA_H #define PGHBA_H #include "pgsetup.h" #include "pgsql.h" /* supported HBA database values */ typedef enum HBADatabaseType { HBA_DATABASE_ALL, HBA_DATABASE_REPLICATION, HBA_DATABASE_DBNAME } HBADatabaseType; bool pghba_ensure_host_rule_exists(const char *hbaFilePath, bool ssl, HBADatabaseType, const char *database, const char *username, const char *hostname, const char *authenticationScheme, HBAEditLevel hbaLevel); bool pghba_ensure_host_rules_exist(const char *hbaFilePath, NodeAddressArray *nodesArray, bool ssl, const char *database, const char *username, const char *authenticationScheme, HBAEditLevel hbaLevel); bool pghba_enable_lan_cidr(PGSQL *pgsql, bool ssl, HBADatabaseType databaseType, const char *database, const char *hostname, const char *username, const char *authenticationScheme, HBAEditLevel hbaLevel, const char *pgdata); bool pghba_check_hostname(const char *hostname, char *ipaddr, size_t size, bool *useHostname); #endif /* PGHBA_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/pgsetup.c000066400000000000000000001444061414244367200224210ustar00rootroot00000000000000/* * src/bin/pg_autoctl/pgsetup.c * Discovers a PostgreSQL setup by calling pg_controldata and reading * postmaster.pid file, getting clues from the process environment and from * user given hints (options). * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include #include #include #include "parson.h" #include "postgres_fe.h" #include "pqexpbuffer.h" #include "defaults.h" #include "env_utils.h" #include "log.h" #include "parsing.h" #include "pgctl.h" #include "signals.h" #include "string_utils.h" static bool get_pgpid(PostgresSetup *pgSetup, bool pgIsNotRunningIsOk); static PostmasterStatus pmStatusFromString(const char *postmasterStatus); /* * Discover PostgreSQL environment from given clues, or a partial setup. 
* * This routines check the PATH for pg_ctl, and is ok when there's a single * entry in found. It then uses either given PGDATA or the environment value * and runs a pg_controldata to get system identifier and PostgreSQL version * numbers. Then it reads PGDATA/postmaster.pid to get the pid and the port of * the running PostgreSQL server. Then it can connects to it and see if it's in * recovery. */ bool pg_setup_init(PostgresSetup *pgSetup, PostgresSetup *options, bool missing_pgdata_is_ok, bool pg_is_not_running_is_ok) { int errors = 0; /* * Make sure that we keep the options->nodeKind in the pgSetup. */ pgSetup->pgKind = options->pgKind; /* * Also make sure that we keep the pg_controldata results if we have them. */ pgSetup->control = options->control; /* * Also make sure that we keep the hbaLevel to edit. Remember that * --skip-pg-hba is registered in the config as --auth skip. */ if (strcmp(options->authMethod, "skip") == 0) { pgSetup->hbaLevel = HBA_EDIT_SKIP; strlcpy(pgSetup->hbaLevelStr, options->authMethod, NAMEDATALEN); } else { pgSetup->hbaLevel = options->hbaLevel; strlcpy(pgSetup->hbaLevelStr, options->hbaLevelStr, NAMEDATALEN); } /* * Make sure that we keep the SSL options too. 
*/ pgSetup->ssl.active = options->ssl.active; pgSetup->ssl.createSelfSignedCert = options->ssl.createSelfSignedCert; pgSetup->ssl.sslMode = options->ssl.sslMode; strlcpy(pgSetup->ssl.sslModeStr, options->ssl.sslModeStr, SSL_MODE_STRLEN); strlcpy(pgSetup->ssl.caFile, options->ssl.caFile, MAXPGPATH); strlcpy(pgSetup->ssl.crlFile, options->ssl.crlFile, MAXPGPATH); strlcpy(pgSetup->ssl.serverCert, options->ssl.serverCert, MAXPGPATH); strlcpy(pgSetup->ssl.serverKey, options->ssl.serverKey, MAXPGPATH); /* Also make sure we keep the citus specific clusterName option */ strlcpy(pgSetup->citusClusterName, options->citusClusterName, NAMEDATALEN); /* check or find pg_ctl, unless we already have it */ if (IS_EMPTY_STRING_BUFFER(pgSetup->pg_ctl) || IS_EMPTY_STRING_BUFFER(pgSetup->pg_version)) { if (!IS_EMPTY_STRING_BUFFER(options->pg_ctl)) { /* copy over pg_ctl and pg_version */ strlcpy(pgSetup->pg_ctl, options->pg_ctl, MAXPGPATH); strlcpy(pgSetup->pg_version, options->pg_version, PG_VERSION_STRING_MAX); /* we might not have fetched the version yet */ if (IS_EMPTY_STRING_BUFFER(pgSetup->pg_version)) { /* also cache the version in options */ if (!pg_ctl_version(options)) { /* we already logged about it */ return false; } strlcpy(pgSetup->pg_version, options->pg_version, sizeof(pgSetup->pg_version)); log_debug("pg_setup_init: %s version %s", pgSetup->pg_ctl, pgSetup->pg_version); } } else { if (!config_find_pg_ctl(pgSetup)) { /* config_find_pg_ctl already logged errors */ errors++; } } } /* check or find PGDATA */ if (options->pgdata[0] != '\0') { strlcpy(pgSetup->pgdata, options->pgdata, MAXPGPATH); } else { if (!get_env_pgdata(pgSetup->pgdata)) { log_error("Failed to set PGDATA either from the environment " "or from --pgdata"); errors++; } } if (!missing_pgdata_is_ok && !directory_exists(pgSetup->pgdata)) { log_fatal("Database directory \"%s\" not found", pgSetup->pgdata); return false; } else if (!missing_pgdata_is_ok) { char globalControlPath[MAXPGPATH] = { 0 }; /* 
globalControlFilePath = $PGDATA/global/pg_control */ join_path_components(globalControlPath, pgSetup->pgdata, "global/pg_control"); if (!file_exists(globalControlPath)) { log_error("PGDATA exists but is not a Postgres directory, " "see above for details"); return false; } } /* get the real path of PGDATA now */ if (directory_exists(pgSetup->pgdata)) { if (!normalize_filename(pgSetup->pgdata, pgSetup->pgdata, MAXPGPATH)) { /* errors have already been logged */ return false; } } /* check of find username */ if (options->username[0] != '\0') { strlcpy(pgSetup->username, options->username, NAMEDATALEN); } else { /* * If a PGUSER environment variable is defined, take the value from * there. Otherwise we attempt to connect without username. In that * case the username will be determined based on the current user. */ if (!get_env_copy_with_fallback("PGUSER", pgSetup->username, NAMEDATALEN, "")) { /* errors have already been logged */ return false; } } /* check or find dbname */ if (options->dbname[0] != '\0') { strlcpy(pgSetup->dbname, options->dbname, NAMEDATALEN); } else { /* * If a PGDATABASE environment variable is defined, take the value from * there. Otherwise we attempt to connect without a database name, and * the default will use the username here instead. */ if (!get_env_copy_with_fallback("PGDATABASE", pgSetup->dbname, NAMEDATALEN, DEFAULT_DATABASE_NAME)) { /* errors have already been logged */ return false; } } /* * Read the postmaster.pid file to find out pid, port and unix socket * directory of a running PostgreSQL instance. 
*/ bool pgIsReady = pg_setup_is_ready(pgSetup, pg_is_not_running_is_ok); if (!pgIsReady && !pg_is_not_running_is_ok) { /* errors have already been logged */ errors++; } /* * check or find PGHOST * * By order of preference, we use: * --pghost command line option * PGDATA/postmaster.pid * PGHOST from the environment */ if (options->pghost[0] != '\0') { strlcpy(pgSetup->pghost, options->pghost, _POSIX_HOST_NAME_MAX); } else { /* read_pg_pidfile might already have set pghost for us */ if (pgSetup->pghost[0] == '\0') { /* * We can (at least try to) connect without host= in the connection * string, so missing PGHOST and --pghost isn't an error. */ if (!get_env_copy_with_fallback("PGHOST", pgSetup->pghost, _POSIX_HOST_NAME_MAX, "")) { /* errors have already been logged */ return false; } } } /* * In test environment we might disable unix socket directories. In that * case, we need to have an host to connect to, accepting to connect * without host= in the connection string is not going to cut it. */ if (IS_EMPTY_STRING_BUFFER(pgSetup->pghost)) { if (env_found_empty("PG_REGRESS_SOCK_DIR")) { log_error("PG_REGRESS_SOCK_DIR is set to \"\" to disable unix " "socket directories, now --pghost is mandatory, " "but unset."); errors++; } } /* check or find PGPORT * * By order or preference, we use: * --pgport command line option * PGDATA/postmaster.pid * PGPORT from the environment * POSTGRES_PORT from our hard coded defaults (5432, see defaults.h) */ if (options->pgport > 0) { pgSetup->pgport = options->pgport; } else { /* if we have a running cluster, just use its port */ if (pgSetup->pidFile.pid > 0 && pgSetup->pidFile.port > 0) { pgSetup->pgport = pgSetup->pidFile.port; } else { /* * no running cluster, what about using PGPORT then? 
*/ pgSetup->pgport = pgsetup_get_pgport(); } } /* Set proxy port */ if (options->proxyport > 0) { pgSetup->proxyport = options->proxyport; } /* * If --listen is given, then set our listen_addresses to this value */ if (!IS_EMPTY_STRING_BUFFER(options->listen_addresses)) { strlcpy(pgSetup->listen_addresses, options->listen_addresses, MAXPGPATH); } else { /* * The default listen_addresses is '*', because we are dealing with a * cluster setup and 'localhost' isn't going to cut it: the monitor and * the coordinator nodes need to be able to connect to our local node * using a connection string with hostname:port. */ strlcpy(pgSetup->listen_addresses, POSTGRES_DEFAULT_LISTEN_ADDRESSES, MAXPGPATH); } /* * If --auth is given, then set our authMethod to this value * otherwise it remains empty */ if (!IS_EMPTY_STRING_BUFFER(options->authMethod)) { strlcpy(pgSetup->authMethod, options->authMethod, NAMEDATALEN); } pgSetup->settings = options->settings; /* * And we always double-check with PGDATA/postmaster.pid if we have it, and * we should have it in the normal/expected case. */ if (pgIsReady && pgSetup->pidFile.pid > 0 && pgSetup->pgport != pgSetup->pidFile.port) { log_error("Given --pgport %d doesn't match PostgreSQL " "port %d from \"%s/postmaster.pid\"", pgSetup->pgport, pgSetup->pidFile.port, pgSetup->pgdata); errors++; } /* * When we have a PGDATA and Postgres is not running, we need to grab more * information about the local installation: pg_controldata can give us the * pg-_control_version, catalog_version_no, and system_identifier. */ if (errors == 0) { /* * Only run pg_controldata when Postgres is not running, otherwise we * get the same information later from an SQL query, see * pgsql_get_postgres_metadata. 
*/ if (!pg_setup_is_running(pgSetup) && pgSetup->control.pg_control_version == 0) { pg_controldata(pgSetup, missing_pgdata_is_ok); if (pgSetup->control.pg_control_version == 0) { /* we already logged about it */ if (!missing_pgdata_is_ok) { errors++; } } else { log_debug("Found PostgreSQL system %" PRIu64 " at \"%s\", " "version %u, catalog version %u", pgSetup->control.system_identifier, pgSetup->pgdata, pgSetup->control.pg_control_version, pgSetup->control.catalog_version_no); } } } /* * Sometimes `pg_ctl start` returns with success and Postgres is still in * crash recovery replaying WAL files, in the "starting" state rather than * the "ready" state. * * In that case, we wait until Postgres is ready for connections. The whole * pg_autoctl code is expecting to be able to connect to Postgres, so * there's no point in returning now and having the next connection attempt * fail with something like the following: * * ERROR Connection to database failed: FATAL: the database system is * starting up */ if (pgSetup->pidFile.port > 0 && pgSetup->pgport == pgSetup->pidFile.port) { if (!pgIsReady) { if (!pg_is_not_running_is_ok) { log_error("Failed to read Postgres pidfile, " "see above for details"); return false; } } } if (errors > 0) { log_fatal("Failed to discover PostgreSQL setup, " "please fix previous errors."); return false; } return true; } /* * Read the first line of the PGDATA/postmaster.pid file to get Postgres PID. */ static bool get_pgpid(PostgresSetup *pgSetup, bool pgIsNotRunningIsOk) { char *contents = NULL; long fileSize = 0; char pidfile[MAXPGPATH]; char *lines[1]; int pid = -1; /* when !pgIsNotRunningIsOk then log_error(), otherwise log_debug() */ int logLevel = pgIsNotRunningIsOk ? 
LOG_TRACE : LOG_ERROR; join_path_components(pidfile, pgSetup->pgdata, "postmaster.pid"); if (!read_file_if_exists(pidfile, &contents, &fileSize)) { log_level(logLevel, "Failed to open file \"%s\": %m", pidfile); if (!pgIsNotRunningIsOk) { log_info("Is PostgreSQL at \"%s\" up and running?", pgSetup->pgdata); } return false; } if (fileSize == 0) { /* yeah, that happens (race condition, kind of) */ log_debug("The PID file \"%s\" is empty", pidfile); free(contents); return false; } else if (splitLines(contents, lines, 1) != 1 || !stringToInt(lines[0], &pid)) { log_warn("Invalid data in PID file \"%s\"", pidfile); free(contents); return false; } free(contents); contents = NULL; /* postmaster PID (or negative of a standalone backend's PID) */ if (pid < 0) { int standalonePid = -1 * pid; if (kill(standalonePid, 0) == 0) { pgSetup->pidFile.pid = pid; return true; } log_debug("Read a stale standalone pid in \"postmaster.pid\": %d", pid); return false; } else if (pid > 0 && pid <= INT_MAX) { if (kill(pid, 0) == 0) { pgSetup->pidFile.pid = pid; return true; } else { int logLevel = pgIsNotRunningIsOk ? LOG_DEBUG : LOG_WARN; log_level(logLevel, "Read a stale pid in \"postmaster.pid\": %d", pid); return false; } } else { /* that's more like a bug, really */ log_error("Invalid PID \"%d\" read in \"postmaster.pid\"", pid); return false; } } /* * Read the PGDATA/postmaster.pid file to get the port number of the running * server we're asked to keep highly available. */ bool read_pg_pidfile(PostgresSetup *pgSetup, bool pgIsNotRunningIsOk, int maxRetries) { FILE *fp; int lineno; char line[BUFSIZE]; char pidfile[MAXPGPATH]; join_path_components(pidfile, pgSetup->pgdata, "postmaster.pid"); if ((fp = fopen_read_only(pidfile)) == NULL) { /* * Maybe we're attempting to read the file during Postgres start-up * phase and we just got where the file is replaced, when going from * standalone backend to full service. 
*/ if (maxRetries > 0) { log_trace("read_pg_pidfile: \"%s\" does not exist [%d]", pidfile, maxRetries); pg_usleep(250 * 1000); /* wait for 250ms and try again */ return read_pg_pidfile(pgSetup, pgIsNotRunningIsOk, maxRetries - 1); } if (!pgIsNotRunningIsOk) { log_error("Failed to open file \"%s\": %m", pidfile); log_info("Is PostgreSQL at \"%s\" up and running?", pgSetup->pgdata); } return false; } for (lineno = 1; lineno <= LOCK_FILE_LINE_PM_STATUS; lineno++) { if (fgets(line, sizeof(line), fp) == NULL) { /* later lines are added during start-up, will appear later */ if (lineno > LOCK_FILE_LINE_PORT) { /* that's retry-able */ fclose(fp); if (maxRetries == 0) { /* partial read is ok, pgSetup keeps track */ return true; } pg_usleep(250 * 1000); /* sleep for 250ms */ log_trace("read_pg_pidfile: fgets is NULL for lineno %d, retry %d", lineno, maxRetries); return read_pg_pidfile(pgSetup, pgIsNotRunningIsOk, maxRetries - 1); } else { /* don't use %m to print errno, errno is not set by fgets */ log_error("Failed to read line %d from file \"%s\"", lineno, pidfile); fclose(fp); return false; } } int lineLength = strlen(line); /* chomp the ending Newline (\n) */ if (lineLength > 0) { line[lineLength - 1] = '\0'; lineLength = strlen(line); } if (lineno == LOCK_FILE_LINE_PID) { int pid = 0; if (!stringToInt(line, &pid)) { log_error("Postgres pidfile does not contain a valid pid %s", line); return false; } /* a standalone backend pid is negative, we signal the actual pid */ pgSetup->pidFile.pid = abs(pid); if (kill(pgSetup->pidFile.pid, 0) != 0) { log_error("Postgres pidfile contains pid %d, " "which is not running", pgSetup->pidFile.pid); /* well then reset the PID to our unknown value */ pgSetup->pidFile.pid = 0; return false; } if (pid < 0) { /* standalone backend during the start-up process */ break; } } if (lineno == LOCK_FILE_LINE_PORT) { if (!stringToUShort(line, &pgSetup->pidFile.port)) { log_error("Postgres pidfile does not contain a valid port %s", line); return 
false; } } if (lineno == LOCK_FILE_LINE_SOCKET_DIR) { if (lineLength > 0) { int n = strlcpy(pgSetup->pghost, line, _POSIX_HOST_NAME_MAX); if (n >= _POSIX_HOST_NAME_MAX) { log_error("Failed to read unix socket directory \"%s\" " "from file \"%s\": the directory name is %d " "characters long, " "and pg_autoctl only accepts up to %d characters", line, pidfile, n, _POSIX_HOST_NAME_MAX - 1); return false; } } } if (lineno == LOCK_FILE_LINE_PM_STATUS) { if (lineLength > 0) { pgSetup->pm_status = pmStatusFromString(line); } } } fclose(fp); log_trace("read_pg_pidfile: pid %d, port %d, host %s, status \"%s\"", pgSetup->pidFile.pid, pgSetup->pidFile.port, pgSetup->pghost, pmStatusToString(pgSetup->pm_status)); return true; } /* * fprintf_pg_setup prints to given STREAM the current setting found in * pgSetup. */ void fprintf_pg_setup(FILE *stream, PostgresSetup *pgSetup) { int pgversion = 0; (void) parse_pg_version_string(pgSetup->pg_version, &pgversion); fformat(stream, "pgdata: %s\n", pgSetup->pgdata); fformat(stream, "pg_ctl: %s\n", pgSetup->pg_ctl); fformat(stream, "pg_version: \"%s\" (%d)\n", pgSetup->pg_version, pgversion); fformat(stream, "pghost: %s\n", pgSetup->pghost); fformat(stream, "pgport: %d\n", pgSetup->pgport); fformat(stream, "proxyport: %d\n", pgSetup->proxyport); fformat(stream, "pid: %d\n", pgSetup->pidFile.pid); fformat(stream, "is in recovery: %s\n", pgSetup->is_in_recovery ? 
"yes" : "no"); fformat(stream, "Control cluster state: %s\n", dbstateToString(pgSetup->control.state)); fformat(stream, "Control Version: %u\n", pgSetup->control.pg_control_version); fformat(stream, "Catalog Version: %u\n", pgSetup->control.catalog_version_no); fformat(stream, "System Identifier: %" PRIu64 "\n", pgSetup->control.system_identifier); fformat(stream, "Latest checkpoint LSN: %s\n", pgSetup->control.latestCheckpointLSN); fformat(stream, "Postmaster status: %s\n", pmStatusToString(pgSetup->pm_status)); fflush(stream); } /* * pg_setup_as_json copies in the given pre-allocated string the json * representation of the pgSetup. */ bool pg_setup_as_json(PostgresSetup *pgSetup, JSON_Value *js) { JSON_Object *jsobj = json_value_get_object(js); char system_identifier[BUFSIZE]; json_object_set_string(jsobj, "pgdata", pgSetup->pgdata); json_object_set_string(jsobj, "pg_ctl", pgSetup->pg_ctl); json_object_set_string(jsobj, "version", pgSetup->pg_version); json_object_set_string(jsobj, "host", pgSetup->pghost); json_object_set_number(jsobj, "port", (double) pgSetup->pgport); json_object_set_number(jsobj, "proxyport", (double) pgSetup->proxyport); json_object_set_number(jsobj, "pid", (double) pgSetup->pidFile.pid); json_object_set_boolean(jsobj, "in_recovery", pgSetup->is_in_recovery); json_object_dotset_number(jsobj, "control.version", (double) pgSetup->control.pg_control_version); json_object_dotset_number(jsobj, "control.catalog_version", (double) pgSetup->control.catalog_version_no); sformat(system_identifier, BUFSIZE, "%" PRIu64, pgSetup->control.system_identifier); json_object_dotset_string(jsobj, "control.system_identifier", system_identifier); json_object_dotset_string(jsobj, "postmaster.status", pmStatusToString(pgSetup->pm_status)); return true; } /* * pg_setup_get_local_connection_string build a connecting string to connect * to the local postgres server and writes it to connectionString, which should * be at least MAXCONNINFO in size. 
*/ bool pg_setup_get_local_connection_string(PostgresSetup *pgSetup, char *connectionString) { char pg_regress_sock_dir[MAXPGPATH] = { 0 }; bool pg_regress_sock_dir_exists = env_exists("PG_REGRESS_SOCK_DIR"); PQExpBuffer connStringBuffer = createPQExpBuffer(); if (connStringBuffer == NULL) { log_error("Failed to allocate memory"); return false; } appendPQExpBuffer(connStringBuffer, "port=%d dbname=%s", pgSetup->pgport, pgSetup->dbname); if (pg_regress_sock_dir_exists && !get_env_copy("PG_REGRESS_SOCK_DIR", pg_regress_sock_dir, MAXPGPATH)) { /* errors have already been logged */ destroyPQExpBuffer(connStringBuffer); return false; } /* * When PG_REGRESS_SOCK_DIR is set and empty, we force the connection * string to use "localhost" (TCP/IP hostname for IP 127.0.0.1 or ::1, * usually), even when the configuration setup is using a unix directory * setting. */ if (env_found_empty("PG_REGRESS_SOCK_DIR") && (IS_EMPTY_STRING_BUFFER(pgSetup->pghost) || pgSetup->pghost[0] == '/')) { appendPQExpBufferStr(connStringBuffer, " host=localhost"); } else if (!IS_EMPTY_STRING_BUFFER(pgSetup->pghost)) { if (pg_regress_sock_dir_exists && strlen(pg_regress_sock_dir) > 0 && strcmp(pgSetup->pghost, pg_regress_sock_dir) != 0) { /* * It might turn out ok (stray environment), but in case of * connection error, this warning should be useful to debug the * situation. 
*/ log_warn("PG_REGRESS_SOCK_DIR is set to \"%s\", " "and our setup is using \"%s\"", pg_regress_sock_dir, pgSetup->pghost); } appendPQExpBuffer(connStringBuffer, " host=%s", pgSetup->pghost); } if (!IS_EMPTY_STRING_BUFFER(pgSetup->username)) { appendPQExpBuffer(connStringBuffer, " user=%s", pgSetup->username); } if (PQExpBufferBroken(connStringBuffer)) { log_error("Failed to allocate memory"); destroyPQExpBuffer(connStringBuffer); return false; } if (strlcpy(connectionString, connStringBuffer->data, MAXCONNINFO) >= MAXCONNINFO) { log_error("Failed to copy connection string \"%s\" which is %lu bytes " "long, pg_autoctl only supports connection strings up to " " %lu bytes", connStringBuffer->data, (unsigned long) connStringBuffer->len, (unsigned long) MAXCONNINFO); destroyPQExpBuffer(connStringBuffer); return false; } destroyPQExpBuffer(connStringBuffer); return true; } /* * pg_setup_pgdata_exists returns true when PGDATA exists, hosts a * global/pg_control file (so that it looks like a Postgres cluster) and when * the pg_controldata probe was successful. */ bool pg_setup_pgdata_exists(PostgresSetup *pgSetup) { char globalControlPath[MAXPGPATH] = { 0 }; /* make sure our cached value in pgSetup still makes sense */ if (!directory_exists(pgSetup->pgdata)) { return false; } /* globalControlFilePath = $PGDATA/global/pg_control */ join_path_components(globalControlPath, pgSetup->pgdata, "global/pg_control"); if (!file_exists(globalControlPath)) { return false; } /* * Now that we know that PGDATA exists, let's grab the system identifier if * we don't have it already. */ if (pgSetup->control.system_identifier == 0) { bool missingPgdataIsOk = false; /* errors are logged from within pg_controldata */ (void) pg_controldata(pgSetup, missingPgdataIsOk); return pgSetup->control.system_identifier != 0; } return true; } /* * pg_setup_pgdata_exists returns true when the pg_controldata probe was * susccessful. 
*/ bool pg_setup_is_running(PostgresSetup *pgSetup) { bool pgIsNotRunningIsOk = true; return pgSetup->pidFile.pid != 0 /* if we don't have the PID yet, try reading it now */ || (get_pgpid(pgSetup, pgIsNotRunningIsOk) && pgSetup->pidFile.pid > 0); } /* * pg_setup_is_ready returns true when the postmaster.pid file has a "ready" * status in it, which we parse in pgSetup->pm_status. */ bool pg_setup_is_ready(PostgresSetup *pgSetup, bool pgIsNotRunningIsOk) { char globalControlPath[MAXPGPATH] = { 0 }; /* globalControlFilePath = $PGDATA/global/pg_control */ join_path_components(globalControlPath, pgSetup->pgdata, "global/pg_control"); if (!file_exists(globalControlPath)) { return false; } /* * Invalidate in-memory Postmaster status cache. * * This makes sure we enter the main loop and attempt to read the * postmaster.pid file at least once: if Postgres was stopped, then the * file that we've read previously might not exists anymore. */ pgSetup->pm_status = POSTMASTER_STATUS_UNKNOWN; /* * Sometimes `pg_ctl start` returns with success and Postgres is still * in crash recovery replaying WAL files, in the "starting" state * rather than the "ready" state. * * In that case, we wait until Postgres is ready for connections. The * whole pg_autoctl code is expecting to be able to connect to * Postgres, so there's no point in returning now and having the next * connection attempt fail with something like the following: * * ERROR Connection to database failed: FATAL: the database system is * starting up */ while (pgSetup->pm_status != POSTMASTER_STATUS_READY) { int maxRetries = 5; if (!get_pgpid(pgSetup, pgIsNotRunningIsOk)) { /* * We failed to read the Postgres pid file, and infinite * looping might not help here anymore. Better give control * back to the launching process (might be init scripts, * systemd or the like) so that they may log a transient * failure and try again. 
*/ if (!pgIsNotRunningIsOk) { log_error("Failed to get Postgres pid, " "see above for details"); } /* * we failed to get Postgres pid from the first line of its pid * file, so we consider that Postgres is not running, thus not * ready. */ return false; } /* * When starting up we might read the postmaster.pid file too * early, when Postgres is still in its "standalone backend" phase. * Let's give it 250ms before trying again then. */ if (pgSetup->pidFile.pid < 0) { pg_usleep(250 * 1000); continue; } /* * Here, we know that Postgres is running, and we even have its * PID. Time to try and read the rest of the PID file. This might * fail when the file isn't complete yet, in which case we're going * to retry. */ if (!read_pg_pidfile(pgSetup, pgIsNotRunningIsOk, maxRetries)) { log_warn("Failed to read Postgres \"postmaster.pid\" file"); return false; } /* avoid an extra wait if that's possible */ if (pgSetup->pm_status == POSTMASTER_STATUS_READY) { break; } log_debug("postmaster status is \"%s\", retrying in %ds.", pmStatusToString(pgSetup->pm_status), PG_AUTOCTL_KEEPER_RETRY_TIME_MS); pg_usleep(PG_AUTOCTL_KEEPER_RETRY_TIME_MS * 1000); } if (pgSetup->pm_status != POSTMASTER_STATUS_UNKNOWN) { log_trace("pg_setup_is_ready: %s", pmStatusToString(pgSetup->pm_status)); } return pgSetup->pm_status == POSTMASTER_STATUS_READY; } /* * pg_setup_wait_until_is_ready loops over pg_setup_is_running() and returns * when Postgres is ready. The loop tries every 100ms up to the given timeout, * given in seconds. 
*/ bool pg_setup_wait_until_is_ready(PostgresSetup *pgSetup, int timeout, int logLevel) { uint64_t startTime = time(NULL); int attempts = 0; pid_t previousPostgresPid = pgSetup->pidFile.pid; bool pgIsRunning = false; bool pgIsReady = false; bool missingPgdataIsOk = false; bool postgresNotRunningIsOk = true; log_trace("pg_setup_wait_until_is_ready"); for (attempts = 1; !pgIsRunning; attempts++) { uint64_t now = time(NULL); /* sleep 100 ms in between postmaster.pid probes */ pg_usleep(100 * 1000); pgIsRunning = get_pgpid(pgSetup, postgresNotRunningIsOk) && pgSetup->pidFile.pid > 0; /* let's not be THAT verbose about it */ if ((attempts - 1) % 10 == 0) { log_debug("pg_setup_wait_until_is_ready(): postgres %s, " "pid %d (was %d), after %ds and %d attempt(s)", pgIsRunning ? "is running" : "is not running", pgSetup->pidFile.pid, previousPostgresPid, (int) (now - startTime), attempts); } /* we're done if we reach the timeout */ if ((now - startTime) >= timeout) { break; } } /* * Now update our pgSetup from the running database, including versions and * all we can discover. */ if (pgIsRunning && previousPostgresPid != pgSetup->pidFile.pid) { /* * Update our pgSetup view of Postgres once we have made sure it's * running. */ PostgresSetup newPgSetup = { 0 }; if (!pg_setup_init(&newPgSetup, pgSetup, missingPgdataIsOk, postgresNotRunningIsOk)) { /* errors have already been logged */ log_error("pg_setup_wait_until_is_ready: pg_setup_init is false"); return false; } *pgSetup = newPgSetup; /* avoid an extra pg_setup_is_ready call if we're all good already */ pgIsReady = pgSetup->pm_status == POSTMASTER_STATUS_READY; } /* * Ok so we have a postmaster.pid file with a pid > 0 (not a standalone * backend, the service has started). Postgres might still be "starting" * rather than "ready" though, so let's continue our attempts and make sure * that Postgres is ready. 
*/ for (; !pgIsReady; attempts++) { uint64_t now = time(NULL); pgIsReady = pg_setup_is_ready(pgSetup, postgresNotRunningIsOk); /* let's not be THAT verbose about it */ if ((attempts - 1) % 10 == 0) { log_debug("pg_setup_wait_until_is_ready(): pgstatus is %s, " "pid %d (was %d), after %ds and %d attempt(s)", pmStatusToString(pgSetup->pm_status), pgSetup->pidFile.pid, previousPostgresPid, (int) (now - startTime), attempts); } /* we're done if we reach the timeout */ if ((now - startTime) >= timeout) { break; } /* sleep 100 ms in between postmaster.pid probes */ pg_usleep(100 * 1000); } if (!pgIsReady) { /* offer more diagnostic information to the user */ postgresNotRunningIsOk = false; pgIsReady = pg_setup_is_ready(pgSetup, postgresNotRunningIsOk); log_trace("pg_setup_wait_until_is_ready returns %s [%s]", pgIsReady ? "true" : "false", pmStatusToString(pgSetup->pm_status)); return pgIsReady; } /* here we know that pgIsReady is true */ log_level(logLevel, "Postgres is now serving PGDATA \"%s\" on port %d with pid %d", pgSetup->pgdata, pgSetup->pgport, pgSetup->pidFile.pid); return true; } /* * pg_setup_wait_until_is_stopped loops over pg_ctl_status() and returns when * Postgres is stopped. The loop tries every 100ms up to the given timeout, * given in seconds. */ bool pg_setup_wait_until_is_stopped(PostgresSetup *pgSetup, int timeout, int logLevel) { uint64_t startTime = time(NULL); int attempts = 0; int status = -1; pid_t previousPostgresPid = pgSetup->pidFile.pid; bool missingPgdataIsOk = false; bool postgresNotRunningIsOk = true; for (attempts = 1; status != PG_CTL_STATUS_NOT_RUNNING; attempts++) { uint64_t now = time(NULL); /* * If we don't have a postmaster.pid consider that Postgres is not * running. 
*/ if (!get_pgpid(pgSetup, postgresNotRunningIsOk)) { return true; } /* we don't log the output for pg_ctl_status here */ status = pg_ctl_status(pgSetup->pg_ctl, pgSetup->pgdata, false); log_trace("keeper_update_postgres_expected_status(): " "pg_ctl status is %d (we expect %d: not running), " "after %ds and %d attempt(s)", status, PG_CTL_STATUS_NOT_RUNNING, (int) (now - startTime), attempts); if (status == PG_CTL_STATUS_NOT_RUNNING) { return true; } /* we're done if we reach the timeout */ if ((now - startTime) >= timeout) { break; } /* wait for 100 ms and try again */ pg_usleep(100 * 1000); } /* update settings from running database */ if (previousPostgresPid != pgSetup->pidFile.pid) { /* * Update our pgSetup view of Postgres once we have made sure it's * running. */ PostgresSetup newPgSetup = { 0 }; if (!pg_setup_init(&newPgSetup, pgSetup, missingPgdataIsOk, postgresNotRunningIsOk)) { /* errors have already been logged */ return false; } *pgSetup = newPgSetup; log_level(logLevel, "Postgres is now stopped for PGDATA \"%s\"", pgSetup->pgdata); } return status == PG_CTL_STATUS_NOT_RUNNING; } /* * pg_setup_role returns an enum value representing which role the local * PostgreSQL instance currently has. We detect primary and secondary when * Postgres is running, and either recovery or unknown when Postgres is not * running. */ PostgresRole pg_setup_role(PostgresSetup *pgSetup) { char *pgdata = pgSetup->pgdata; if (pg_setup_is_running(pgSetup)) { /* * Here we have either a recovery or a standby node. We don't know for * sure with just that piece of information. * * If we are using Postgres 12+ and there's a standby.signal file in * PGDATA, that's a strong hint that we can't have in previous version * short of parsing recovery.conf. * * Remember that in versions before Postgres 12 the standby_mode was * not exposed as a GUC so we can't inquire about that either. We would * have to parse the recovery.conf file for getting the standby mode. 
* * It's easier to just return POSTGRES_ROLE_RECOVERY in that case, and * let the caller figure out that this might be POSTGRES_ROLE_STANDBY. * At the moment the callers don't need that level of detail anyway. */ if (pgSetup->is_in_recovery) { char recoverySignalPath[MAXPGPATH] = { 0 }; join_path_components(recoverySignalPath, pgdata, "standby.signal"); if (file_exists(recoverySignalPath)) { return POSTGRES_ROLE_STANDBY; } else { /* We are in recovery, we don't know if we are a standby */ return POSTGRES_ROLE_RECOVERY; } } /* * Here it's running and SELECT pg_is_in_recovery() is false, so we * know we are talking about a primary server. */ else { return POSTGRES_ROLE_PRIMARY; } } else { /* * PostgreSQL is not running, we don't know yet... what we know is that * to be a standby the file $PDGATA/recovery.conf needs to be setup (up * to version 11 included), or the file $PGDATA/standby.signal needs to * exists (starting with version 12). A recovery.signal file starting * in Postgres 12 also indicates that we're not a primary server. * * There's no way that a Postgres instance is going to be a recovery or * standby node without one of those files existing: */ char standbyFilesArray[][MAXPGPATH] = { "recovery.conf", "recovery.signal", "standby.signal" }; PostgresRole standbyRoleArray[] = { /* default to recovery, might be a standby */ POSTGRES_ROLE_RECOVERY, /* recovery.conf */ POSTGRES_ROLE_RECOVERY, /* recovery.signal */ POSTGRES_ROLE_STANDBY /* standby.signal */ }; int pos = 0, count = 3; for (pos = 0; pos < count; pos++) { char filePath[MAXPGPATH] = { 0 }; join_path_components(filePath, pgdata, standbyFilesArray[pos]); if (file_exists(filePath)) { return standbyRoleArray[pos]; } } /* * Postgres is not running, and there's no file around in PGDATA that * allows us to have a strong opinion on whether this instance is a * primary or a standby. It might be either. 
*/ return POSTGRES_ROLE_UNKNOWN; } return POSTGRES_ROLE_UNKNOWN; } /* * pg_setup_get_username returns pgSetup->username when it exists, otherwise it * looksup the username in passwd. Lastly it fallsback to the USER environment * variable. When nothing works it returns DEFAULT_USERNAME PGUSER is only used * when creating our configuration for the first time. */ char * pg_setup_get_username(PostgresSetup *pgSetup) { char userEnv[NAMEDATALEN] = { 0 }; /* use a configured username if provided */ if (!IS_EMPTY_STRING_BUFFER(pgSetup->username)) { return pgSetup->username; } log_trace("username not configured"); /* use the passwd file to find the username, same as whoami */ uid_t uid = geteuid(); struct passwd *pw = getpwuid(uid); if (pw) { log_trace("username found in passwd: %s", pw->pw_name); strlcpy(pgSetup->username, pw->pw_name, sizeof(pgSetup->username)); return pgSetup->username; } /* fallback on USER from env if the user cannot be found in passwd */ if (get_env_copy("USER", userEnv, NAMEDATALEN)) { log_trace("username found in USER environment variable: %s", userEnv); strlcpy(pgSetup->username, userEnv, sizeof(pgSetup->username)); return pgSetup->username; } log_trace("username fallback to default: %s", DEFAULT_USERNAME); strlcpy(pgSetup->username, DEFAULT_USERNAME, sizeof(pgSetup->username)); return pgSetup->username; } /* * pg_setup_get_auth_method returns pgSetup->authMethod when it exists, * otherwise it returns DEFAULT_AUTH_METHOD */ char * pg_setup_get_auth_method(PostgresSetup *pgSetup) { if (!IS_EMPTY_STRING_BUFFER(pgSetup->authMethod)) { return pgSetup->authMethod; } log_trace("auth method not configured, falling back to default value : %s", DEFAULT_AUTH_METHOD); return DEFAULT_AUTH_METHOD; } /* * pg_setup_skip_hba_edits returns true when the user had setup pg_autoctl to * skip editing HBA entries. 
*/ bool pg_setup_skip_hba_edits(PostgresSetup *pgSetup) { return pgSetup->hbaLevel == HBA_EDIT_SKIP; } /* * pg_setup_set_absolute_pgdata uses realpath(3) to make sure that we re using * the absolute real pathname for PGDATA in our setup, so that services will * work correctly after keeper/monitor init, even when initializing in a * relative path and starting the service from elsewhere. This function returns * true if the pgdata path has been updated in the setup. */ bool pg_setup_set_absolute_pgdata(PostgresSetup *pgSetup) { return normalize_filename(pgSetup->pgdata, pgSetup->pgdata, MAXPGPATH); } /* * nodeKindFromString returns a PgInstanceKind from a string. */ PgInstanceKind nodeKindFromString(const char *nodeKind) { PgInstanceKind kindArray[] = { NODE_KIND_UNKNOWN, NODE_KIND_UNKNOWN, NODE_KIND_STANDALONE, NODE_KIND_CITUS_COORDINATOR, NODE_KIND_CITUS_WORKER }; char *kindList[] = { "", "unknown", "standalone", "coordinator", "worker", NULL }; for (int listIndex = 0; kindList[listIndex] != NULL; listIndex++) { char *candidate = kindList[listIndex]; if (strcmp(nodeKind, candidate) == 0) { PgInstanceKind pgKind = kindArray[listIndex]; log_trace("nodeKindFromString: \"%s\" ➜ %d", nodeKind, pgKind); return pgKind; } } log_fatal("Failed to parse nodeKind \"%s\"", nodeKind); /* never happens, make compiler happy */ return NODE_KIND_UNKNOWN; } /* * nodeKindToString returns a textual representatin of given PgInstanceKind. * This must be kept in sync with src/monitor/formation_metadata.c function * FormationKindFromNodeKindString. 
*/ char * nodeKindToString(PgInstanceKind kind) { switch (kind) { case NODE_KIND_STANDALONE: { return "standalone"; } case NODE_KIND_CITUS_COORDINATOR: { return "coordinator"; } case NODE_KIND_CITUS_WORKER: { return "worker"; } default: log_fatal("nodeKindToString: unknown node kind %d", kind); return NULL; } /* can't happen, keep compiler happy */ return NULL; } /* * pmStatusFromString parses the Postgres postmaster.pid PM_STATUS line into * our own enum to represent the value. */ static PostmasterStatus pmStatusFromString(const char *postmasterStatus) { if (strcmp(postmasterStatus, PM_STATUS_STARTING) == 0) { return POSTMASTER_STATUS_STARTING; } else if (strcmp(postmasterStatus, PM_STATUS_STOPPING) == 0) { return POSTMASTER_STATUS_STOPPING; } else if (strcmp(postmasterStatus, PM_STATUS_READY) == 0) { return POSTMASTER_STATUS_READY; } else if (strcmp(postmasterStatus, PM_STATUS_STANDBY) == 0) { return POSTMASTER_STATUS_STANDBY; } log_warn("Failed to read Postmaster status: \"%s\"", postmasterStatus); return POSTMASTER_STATUS_UNKNOWN; } /* * pmStatusToString returns a textual representation of given Postmaster status * given as a PmStatus enum. * * We're not using the PM_STATUS_READY etc constants here because those are * blank-padded to always be the same length, and then the warning messages * including "ready " look buggy in a way. */ char * pmStatusToString(PostmasterStatus pm_status) { switch (pm_status) { case POSTMASTER_STATUS_UNKNOWN: { return "unknown"; } case POSTMASTER_STATUS_STARTING: { return "starting"; } case POSTMASTER_STATUS_STOPPING: { return "stopping"; } case POSTMASTER_STATUS_READY: { return "ready"; } case POSTMASTER_STATUS_STANDBY: return "standby"; } /* keep compiler happy */ return "unknown"; } /* * pgsetup_get_pgport returns the port to use either from the PGPORT * environment variable, or from our default hard-coded value of 5432. 
*/ int pgsetup_get_pgport() { char pgport_env[NAMEDATALEN]; int pgport = 0; if (env_exists("PGPORT") && get_env_copy("PGPORT", pgport_env, NAMEDATALEN)) { if (stringToInt(pgport_env, &pgport) && pgport > 0) { return pgport; } else { log_warn("Failed to parse PGPORT value \"%s\", using %d", pgport_env, POSTGRES_PORT); return POSTGRES_PORT; } } else { /* no PGPORT */ return POSTGRES_PORT; } } /* * pgsetup_validate_ssl_settings returns true if our SSL settings are following * one of the three following cases: * * - --no-ssl: ssl is not activated and no file has been provided * - --ssl-self-signed: ssl is activated and no file has been provided * - --ssl-*-files: ssl is activated and all the files have been provided * * Otherwise it logs an error message and return false. */ bool pgsetup_validate_ssl_settings(PostgresSetup *pgSetup) { SSLOptions *ssl = &(pgSetup->ssl); log_trace("pgsetup_validate_ssl_settings"); /* * When using the full SSL options, we validate that the files exists where * given and set the default sslmode to verify-full. 
* * --ssl-ca-file * --ssl-crl-file * --server-cert * --server-key */ if (ssl->active && !ssl->createSelfSignedCert) { /* * When passing files in manually for SSL we need at least cert and a * key */ if (IS_EMPTY_STRING_BUFFER(ssl->serverCert) || IS_EMPTY_STRING_BUFFER(ssl->serverKey)) { log_error("Failed to setup SSL with user-provided certificates: " "options --server-cert and --server-key are required."); return false; } /* check that the given files exist */ if (!file_exists(ssl->serverCert)) { log_error("--server-cert file does not exist at \"%s\"", ssl->serverCert); return false; } if (!file_exists(ssl->serverKey)) { log_error("--server-key file does not exist at \"%s\"", ssl->serverKey); return false; } if (!IS_EMPTY_STRING_BUFFER(ssl->caFile) && !file_exists(ssl->caFile)) { log_error("--ssl-ca-file file does not exist at \"%s\"", ssl->caFile); return false; } if (!IS_EMPTY_STRING_BUFFER(ssl->crlFile) && !file_exists(ssl->crlFile)) { log_error("--ssl-crl-file file does not exist at \"%s\"", ssl->crlFile); return false; } /* install a default value for --ssl-mode, use verify-full */ if (ssl->sslMode == SSL_MODE_UNKNOWN) { ssl->sslMode = SSL_MODE_VERIFY_FULL; strlcpy(ssl->sslModeStr, pgsetup_sslmode_to_string(ssl->sslMode), SSL_MODE_STRLEN); log_info("Using default --ssl-mode \"%s\"", ssl->sslModeStr); } /* check that we have a CA file to use with verif-ca/verify-full */ if (ssl->sslMode >= SSL_MODE_VERIFY_CA && IS_EMPTY_STRING_BUFFER(ssl->caFile)) { log_error("--ssl-ca-file is required when --ssl-mode \"%s\" is used", ssl->sslModeStr); return false; } /* * Normalize the filenames. 
* We already log errors so we can simply return the result */ return normalize_filename(pgSetup->ssl.caFile, pgSetup->ssl.caFile, MAXPGPATH) && normalize_filename(pgSetup->ssl.crlFile, pgSetup->ssl.crlFile, MAXPGPATH) && normalize_filename(pgSetup->ssl.serverCert, pgSetup->ssl.serverCert, MAXPGPATH) && normalize_filename(pgSetup->ssl.serverKey, pgSetup->ssl.serverKey, MAXPGPATH); } /* * When --ssl-self-signed is used, we default to using sslmode=require. * Setting higher than that are wrong, false sense of security. */ if (ssl->createSelfSignedCert) { /* in that case we want an sslMode of require at most */ if (ssl->sslMode > SSL_MODE_REQUIRE) { log_error("--ssl-mode \"%s\" is not compatible with self-signed " "certificates, please provide certificates signed by " "your trusted CA.", pgsetup_sslmode_to_string(ssl->sslMode)); log_info("See https://www.postgresql.org/docs/current/libpq-ssl.html" " for details"); return false; } if (ssl->sslMode == SSL_MODE_UNKNOWN) { /* install a default value for --ssl-mode */ ssl->sslMode = SSL_MODE_REQUIRE; strlcpy(ssl->sslModeStr, pgsetup_sslmode_to_string(ssl->sslMode), SSL_MODE_STRLEN); log_info("Using default --ssl-mode \"%s\"", ssl->sslModeStr); } log_info("Using --ssl-self-signed: pg_autoctl will " "create self-signed certificates, allowing for " "encrypted network traffic"); log_warn("Self-signed certificates provide protection against " "eavesdropping; this setup does NOT protect against " "Man-In-The-Middle attacks nor Impersonation attacks."); log_warn("See https://www.postgresql.org/docs/current/libpq-ssl.html " "for details"); return true; } /* --no-ssl is ok */ if (ssl->active == 0) { log_warn("No encryption is used for network traffic! 
This allows an " "attacker on the network to read all replication data."); log_warn("Using --ssl-self-signed instead of --no-ssl is recommend to " "achieve more security with the same ease of deployment."); log_warn("See https://www.postgresql.org/docs/current/libpq-ssl.html " "for details on how to improve"); /* Install a default value for --ssl-mode */ if (ssl->sslMode == SSL_MODE_UNKNOWN) { ssl->sslMode = SSL_MODE_PREFER; strlcpy(ssl->sslModeStr, pgsetup_sslmode_to_string(ssl->sslMode), SSL_MODE_STRLEN); log_info("Using default --ssl-mode \"%s\"", ssl->sslModeStr); } return true; } return false; } /* * pg_setup_sslmode_to_string parses a string representing the sslmode into an * internal enum value, so that we can easily compare values. */ SSLMode pgsetup_parse_sslmode(const char *sslMode) { SSLMode enumArray[] = { SSL_MODE_DISABLE, SSL_MODE_ALLOW, SSL_MODE_PREFER, SSL_MODE_REQUIRE, SSL_MODE_VERIFY_CA, SSL_MODE_VERIFY_FULL }; char *sslModeArray[] = { "disable", "allow", "prefer", "require", "verify-ca", "verify-full", NULL }; int sslModeArrayIndex = 0; for (sslModeArrayIndex = 0; sslModeArray[sslModeArrayIndex] != NULL; sslModeArrayIndex++) { if (strcmp(sslMode, sslModeArray[sslModeArrayIndex]) == 0) { return enumArray[sslModeArrayIndex]; } } return SSL_MODE_UNKNOWN; } /* * pgsetup_sslmode_to_string returns the string representation of the enum. 
*/ char * pgsetup_sslmode_to_string(SSLMode sslMode) { switch (sslMode) { case SSL_MODE_UNKNOWN: { return "unknown"; } case SSL_MODE_DISABLE: { return "disable"; } case SSL_MODE_ALLOW: { return "allow"; } case SSL_MODE_PREFER: { return "prefer"; } case SSL_MODE_REQUIRE: { return "require"; } case SSL_MODE_VERIFY_CA: { return "verify-ca"; } case SSL_MODE_VERIFY_FULL: return "verify-full"; } /* This is a huge bug */ log_error("BUG: some unknown SSL_MODE enum value was encountered"); return "unknown"; } /* * pg_setup_standby_slot_supported returns true when the target Postgres * instance represented in pgSetup is compatible with using * pg_replication_slot_advance() on a standby node. * * In Postgres 11 and 12, the pg_replication_slot_advance() function has been * buggy and prevented WAL recycling on standby nodes. * * See https://github.com/citusdata/pg_auto_failover/issues/283 for the problem * and https://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=b48df81 * for the solution. * * We need Postgres 11 starting at 11.9, Postgres 12 starting at 12.4, or * Postgres 13 or more recent to make use of pg_replication_slot_advance. 
*/ bool pg_setup_standby_slot_supported(PostgresSetup *pgSetup, int logLevel) { int pg_version = 0; if (!parse_pg_version_string(pgSetup->pg_version, &pg_version)) { /* errors have already been logged */ return false; } int major = pg_version / 100; int minor = pg_version % 100; /* do we have Postgres 10 (or before, though we don't support that) */ if (pg_version < 1100) { log_trace("pg_setup_standby_slot_supported(%d): false", pg_version); return false; } /* Postgres 11.0 up to 11.8 included the bug */ if (pg_version >= 1100 && pg_version < 1109) { log_level(logLevel, "Postgres %d.%d does not support replication slots " "on a standby node", major, minor); return false; } /* Postgres 11.9 and up are good */ if (pg_version >= 1109 && pg_version < 1200) { return true; } /* Postgres 12.0 up to 12.3 included the bug */ if (pg_version >= 1200 && pg_version < 1204) { log_level(logLevel, "Postgres %d.%d does not support replication slots " "on a standby node", major, minor); return false; } /* Postgres 12.4 and up are good */ if (pg_version >= 1204 && pg_version < 1300) { return true; } /* Starting with Postgres 13, all versions are known to have the bug fix */ if (pg_version >= 1300) { return true; } /* should not happen */ log_debug("BUG in pg_setup_standby_slot_supported(%d): " "unknown Postgres version, returning false", pg_version); return false; } /* * pgsetup_parse_hba_level parses a string that represents an HBAEditLevel * value. */ HBAEditLevel pgsetup_parse_hba_level(const char *level) { HBAEditLevel enumArray[] = { HBA_EDIT_SKIP, HBA_EDIT_MINIMAL, HBA_EDIT_LAN }; char *levelArray[] = { "skip", "minimal", "app", NULL }; for (int i = 0; levelArray[i] != NULL; i++) { if (strcmp(level, levelArray[i]) == 0) { return enumArray[i]; } } return HBA_EDIT_UNKNOWN; } /* * pgsetup_hba_level_to_string returns the string representation of an * hbaLevel enum value. 
*/ char * pgsetup_hba_level_to_string(HBAEditLevel hbaLevel) { switch (hbaLevel) { case HBA_EDIT_SKIP: { return "skip"; } case HBA_EDIT_MINIMAL: { return "minimal"; } case HBA_EDIT_LAN: { return "app"; } case HBA_EDIT_UNKNOWN: return "unknown"; } log_error("BUG: hbaLevel %d is unknown", hbaLevel); return "unknown"; } /* * dbstateToString returns a string from a pgControlFile state enum. */ const char * dbstateToString(DBState state) { switch (state) { case DB_STARTUP: { return "starting up"; } case DB_SHUTDOWNED: { return "shut down"; } case DB_SHUTDOWNED_IN_RECOVERY: { return "shut down in recovery"; } case DB_SHUTDOWNING: { return "shutting down"; } case DB_IN_CRASH_RECOVERY: { return "in crash recovery"; } case DB_IN_ARCHIVE_RECOVERY: { return "in archive recovery"; } case DB_IN_PRODUCTION: return "in production"; } return "unrecognized status code"; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/pgsetup.h000066400000000000000000000206461414244367200224250ustar00rootroot00000000000000/* * src/bin/pg_autoctl/pgsetup.h * Discovers a PostgreSQL setup by calling pg_controldata and reading * postmaster.pid file, getting clues from the process environment and from * user given hints (options). * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef PGSETUP_H #define PGSETUP_H #include #include "postgres_fe.h" #include "parson.h" /* * Maximum length of serialized pg_lsn value * It is taken from postgres file pg_lsn.c. * It defines MAXPG_LSNLEN to be 17 and * allocates a buffer 1 byte larger. We * went for 18 to make buffer allocation simpler. */ #define PG_LSN_MAXLENGTH 18 /* * System status indicator. 
From postgres:src/include/catalog/pg_control.h */ typedef enum DBState { DB_STARTUP = 0, DB_SHUTDOWNED, DB_SHUTDOWNED_IN_RECOVERY, DB_SHUTDOWNING, DB_IN_CRASH_RECOVERY, DB_IN_ARCHIVE_RECOVERY, DB_IN_PRODUCTION } DBState; /* * To be able to check if a minor upgrade should be scheduled, and to check for * system WAL compatiblity, we use some parts of the pg_controldata output. * * See postgresql/src/include/catalog/pg_control.h for definitions of the * following fields of the ControlFileData struct. */ typedef struct pg_control_data { uint64_t system_identifier; uint32_t pg_control_version; /* PG_CONTROL_VERSION */ uint32_t catalog_version_no; /* see catversion.h */ DBState state; /* see enum above */ char latestCheckpointLSN[PG_LSN_MAXLENGTH]; uint32_t timeline_id; } PostgresControlData; /* * We don't need the full information set form the pidfile, it onyl allows us * to guess/retrieve the PostgreSQL port number from the PGDATA without having * to ask the user to provide the information. */ typedef struct pg_pidfile { pid_t pid; unsigned short port; } PostgresPIDFile; /* * From pidfile.h we also extract the Postmaster status, one of the following * values: */ typedef enum { POSTMASTER_STATUS_UNKNOWN = 0, POSTMASTER_STATUS_STARTING, POSTMASTER_STATUS_STOPPING, POSTMASTER_STATUS_READY, POSTMASTER_STATUS_STANDBY } PostmasterStatus; /* * When discovering Postgres we try to determine if the local $PGDATA directory * belongs to a primary or a secondary server. If the server is running, it's * easy: connect and ask with the pg_is_in_recovery() SQL function. If the * server is not running, we might be lucky and find a standby setup file and * then we know it's not a primary. * * Otherwise we just don't know. 
*/ typedef enum PostgresRole { POSTGRES_ROLE_UNKNOWN, POSTGRES_ROLE_PRIMARY, POSTGRES_ROLE_RECOVERY, /* Either PITR or Hot Standby */ POSTGRES_ROLE_STANDBY /* We know it's an Hot Standby */ } PostgresRole; /* * pg_auto_failover knows how to manage three kinds of PostgreSQL servers: * * - Standalone PostgreSQL instances * - Citus Coordinator PostgreSQL instances * - Citus Worker PostgreSQL instances * * Each of them may then take on the role of a primary or a standby depending * on circumstances. Citus coordinator and worker instances need to load the * citus extension in shared_preload_libraries, which the keeper ensures. * * At failover time, when dealing with a Citus worker instance, the keeper * fetches its coordinator hostname and port from the monitor and blocks writes * using the citus master_update_node() function call in a prepared * transaction. */ typedef enum PgInstanceKind { NODE_KIND_UNKNOWN = 0, NODE_KIND_STANDALONE, NODE_KIND_CITUS_COORDINATOR, NODE_KIND_CITUS_WORKER } PgInstanceKind; #define IS_CITUS_INSTANCE_KIND(x) \ (x == NODE_KIND_CITUS_COORDINATOR \ || x == NODE_KIND_CITUS_WORKER) #define PG_VERSION_STRING_MAX 12 /* * Monitor keeps a replication settings for each node. */ typedef struct NodeReplicationSettings { char name[_POSIX_HOST_NAME_MAX]; int candidatePriority; /* promotion candidate priority */ bool replicationQuorum; /* true if participates in write quorum */ } NodeReplicationSettings; /* * How much should we edit the Postgres HBA file? * * The default value is HBA_EDIT_MINIMAL and pg_autoctl then add entries for * the monitor to be able to connect to the local node, and an entry for the * other nodes to be able to connect with streaming replication privileges. */ typedef enum { HBA_EDIT_UNKNOWN = 0, HBA_EDIT_SKIP, HBA_EDIT_MINIMAL, HBA_EDIT_LAN, } HBAEditLevel; /* * pg_auto_failover also support SSL settings. 
*/ typedef enum { SSL_MODE_UNKNOWN = 0, SSL_MODE_DISABLE, SSL_MODE_ALLOW, SSL_MODE_PREFER, SSL_MODE_REQUIRE, SSL_MODE_VERIFY_CA, SSL_MODE_VERIFY_FULL } SSLMode; #define SSL_MODE_STRLEN 12 /* longuest is "verify-full" at 11 chars */ typedef struct SSLOptions { int active; /* INI support has int, does not have bool */ bool createSelfSignedCert; SSLMode sslMode; char sslModeStr[SSL_MODE_STRLEN]; char caFile[MAXPGPATH]; char crlFile[MAXPGPATH]; char serverCert[MAXPGPATH]; char serverKey[MAXPGPATH]; } SSLOptions; /* * In the PostgresSetup structure, we use pghost either as socket directory * name or as a hostname. We could use MAXPGPATH rather than * _POSIX_HOST_NAME_MAX chars in that name, but then again the hostname is * part of a connection string that must be held in MAXCONNINFO. * * If you want to change pghost[_POSIX_HOST_NAME_MAX], keep that in mind! */ typedef struct pg_setup { char pgdata[MAXPGPATH]; /* PGDATA */ char pg_ctl[MAXPGPATH]; /* absolute path to pg_ctl */ char pg_version[PG_VERSION_STRING_MAX]; /* pg_ctl --version */ char username[NAMEDATALEN]; /* username, defaults to USER */ char dbname[NAMEDATALEN]; /* dbname, defaults to PGDATABASE */ char pghost[_POSIX_HOST_NAME_MAX]; /* local PGHOST to connect to */ int pgport; /* PGPORT */ char listen_addresses[MAXPGPATH]; /* listen_addresses */ int proxyport; /* Proxy port */ char authMethod[NAMEDATALEN]; /* auth method, defaults to trust */ char hbaLevelStr[NAMEDATALEN]; /* user choice of HBA editing */ HBAEditLevel hbaLevel; /* user choice of HBA editing */ PostmasterStatus pm_status; /* Postmaster status */ bool is_in_recovery; /* select pg_is_in_recovery() */ PostgresControlData control; /* pg_controldata pgdata */ PostgresPIDFile pidFile; /* postmaster.pid information */ PgInstanceKind pgKind; /* standalone/coordinator/worker */ NodeReplicationSettings settings; /* node replication settings */ SSLOptions ssl; /* ssl options */ char citusClusterName[NAMEDATALEN]; /* citus.cluster_name */ } PostgresSetup; 
/* true when the given fixed-size char buffer holds the empty string */
#define IS_EMPTY_STRING_BUFFER(strbuf) (strbuf[0] == '\0')

/* discovery and initialization of a PostgresSetup from disk and options */
bool pg_setup_init(PostgresSetup *pgSetup, PostgresSetup *options,
				   bool missing_pgdata_is_ok, bool pg_is_not_running_is_ok);
bool read_pg_pidfile(PostgresSetup *pgSetup, bool pgIsNotRunningIsOk,
					 int maxRetries);
void fprintf_pg_setup(FILE *stream, PostgresSetup *pgSetup);
bool pg_setup_as_json(PostgresSetup *pgSetup, JSON_Value *js);

bool pg_setup_get_local_connection_string(PostgresSetup *pgSetup,
										  char *connectionString);

/* inspection of the local Postgres instance state */
bool pg_setup_pgdata_exists(PostgresSetup *pgSetup);
bool pg_setup_is_running(PostgresSetup *pgSetup);
PostgresRole pg_setup_role(PostgresSetup *pgSetup);
bool pg_setup_is_ready(PostgresSetup *pgSetup, bool pg_is_not_running_is_ok);
bool pg_setup_wait_until_is_ready(PostgresSetup *pgSetup,
								  int timeout, int logLevel);
bool pg_setup_wait_until_is_stopped(PostgresSetup *pgSetup,
									int timeout, int logLevel);
char * pmStatusToString(PostmasterStatus pm_status);

/* accessors with defaults applied */
char * pg_setup_get_username(PostgresSetup *pgSetup);
char * pg_setup_get_auth_method(PostgresSetup *pgSetup);
bool pg_setup_skip_hba_edits(PostgresSetup *pgSetup);
bool pg_setup_set_absolute_pgdata(PostgresSetup *pgSetup);

/* string conversions for the enums defined above */
PgInstanceKind nodeKindFromString(const char *nodeKind);
char * nodeKindToString(PgInstanceKind kind);
int pgsetup_get_pgport(void);
bool pgsetup_validate_ssl_settings(PostgresSetup *pgSetup);
SSLMode pgsetup_parse_sslmode(const char *sslMode);
char * pgsetup_sslmode_to_string(SSLMode sslMode);
bool pg_setup_standby_slot_supported(PostgresSetup *pgSetup, int logLevel);
HBAEditLevel pgsetup_parse_hba_level(const char *level);
char * pgsetup_hba_level_to_string(HBAEditLevel hbaLevel);
const char * dbstateToString(DBState state);

#endif /* PGSETUP_H */
pg_auto_failover-1.6.3/src/bin/pg_autoctl/pgsql.c000066400000000000000000002520361414244367200220570ustar00rootroot00000000000000/*
 * src/bin/pg_autoctl/pgsql.c
 * API for sending SQL commands to a PostgreSQL server
 *
 * Copyright (c) Microsoft Corporation.
All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include "postgres_fe.h" #include "libpq-fe.h" #include "pqexpbuffer.h" #include "portability/instr_time.h" #include "cli_root.h" #include "defaults.h" #include "log.h" #include "parsing.h" #include "pgsql.h" #include "signals.h" #include "string_utils.h" #define ERRCODE_DUPLICATE_OBJECT "42710" #define ERRCODE_DUPLICATE_DATABASE "42P04" #define ERRCODE_INVALID_OBJECT_DEFINITION "42P17" #define ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE "55000" #define ERRCODE_OBJECT_IN_USE "55006" #define ERRCODE_UNDEFINED_OBJECT "42704" static char * ConnectionTypeToString(ConnectionType connectionType); static void log_connection_error(PGconn *connection, int logLevel); static void pgAutoCtlDefaultNoticeProcessor(void *arg, const char *message); static void pgAutoCtlDebugNoticeProcessor(void *arg, const char *message); static PGconn * pgsql_open_connection(PGSQL *pgsql); static bool pgsql_retry_open_connection(PGSQL *pgsql); static bool is_response_ok(PGresult *result); static bool clear_results(PGSQL *pgsql); static void pgsql_handle_notifications(PGSQL *pgsql); static bool pgsql_alter_system_set(PGSQL *pgsql, GUC setting); static bool pgsql_get_current_setting(PGSQL *pgsql, char *settingName, char **currentValue); static void parsePgMetadata(void *ctx, PGresult *result); static void parsePgReachedTargetLSN(void *ctx, PGresult *result); static void parseReplicationSlotMaintain(void *ctx, PGresult *result); static void parsePgReachedTargetLSN(void *ctx, PGresult *result); static void parseIdentifySystemResult(void *ctx, PGresult *result); static void parseTimelineHistoryResult(void *ctx, PGresult *result); /* * parseSingleValueResult is a ParsePostgresResultCB callback that reads the * first column of the first row of the resultset only, and parses the answer * into the expected C value, one of type QueryResultType. 
*/ void parseSingleValueResult(void *ctx, PGresult *result) { SingleValueResultContext *context = (SingleValueResultContext *) ctx; context->ntuples = PQntuples(result); if (context->ntuples == 1) { char *value = PQgetvalue(result, 0, 0); /* this function is never used when we expect NULL values */ if (PQgetisnull(result, 0, 0)) { context->parsedOk = false; return; } switch (context->resultType) { case PGSQL_RESULT_BOOL: { context->boolVal = strcmp(value, "t") == 0; context->parsedOk = true; break; } case PGSQL_RESULT_INT: { if (!stringToInt(value, &context->intVal)) { context->parsedOk = false; log_error("Failed to parse int result \"%s\"", value); } context->parsedOk = true; break; } case PGSQL_RESULT_BIGINT: { if (!stringToUInt64(value, &context->bigint)) { context->parsedOk = false; log_error("Failed to parse uint64_t result \"%s\"", value); } context->parsedOk = true; break; } case PGSQL_RESULT_STRING: { context->strVal = strdup(value); context->parsedOk = true; break; } } } } /* * fetchedRows is a pgsql_execute_with_params callback function that sets a * SingleValueResultContext->intVal to PQntuples(result), that is how many rows * are fetched by the query. */ void fetchedRows(void *ctx, PGresult *result) { SingleValueResultContext *context = (SingleValueResultContext *) ctx; context->parsedOk = true; context->intVal = PQntuples(result); } /* * pgsql_init initializes a PGSQL struct to connect to the given database * URL or connection string. */ bool pgsql_init(PGSQL *pgsql, char *url, ConnectionType connectionType) { pgsql->connectionType = connectionType; pgsql->connection = NULL; /* set our default retry policy for interactive commands */ (void) pgsql_set_interactive_retry_policy(&(pgsql->retryPolicy)); if (validate_connection_string(url)) { /* size of url has already been validated. 
*/ strlcpy(pgsql->connectionString, url, MAXCONNINFO); } else { return false; } return true; } /* * pgsql_set_retry_policy sets the retry policy to the given maxT (maximum * total time spent retrying), maxR (maximum number of retries, zero when not * retrying at all, -1 for unbounded number of retries), and maxSleepTime to * cap our exponential backoff with decorrelated jitter computation. */ void pgsql_set_retry_policy(ConnectionRetryPolicy *retryPolicy, int maxT, int maxR, int maxSleepTime, int baseSleepTime) { retryPolicy->maxT = maxT; retryPolicy->maxR = maxR; retryPolicy->maxSleepTime = maxSleepTime; retryPolicy->baseSleepTime = baseSleepTime; /* initialize a seed for our random number generator */ pg_srand48(time(0)); } /* * pgsql_set_default_retry_policy sets the default retry policy: no retry. We * use the other default parameters but with a maxR of zero they don't get * used. * * This is the retry policy that prevails in the main keeper loop. */ void pgsql_set_main_loop_retry_policy(ConnectionRetryPolicy *retryPolicy) { (void) pgsql_set_retry_policy(retryPolicy, POSTGRES_PING_RETRY_TIMEOUT, 0, /* do not retry by default */ POSTGRES_PING_RETRY_CAP_SLEEP_TIME, POSTGRES_PING_RETRY_BASE_SLEEP_TIME); } /* * pgsql_set_init_retry_policy sets the retry policy to 15 mins of total * retrying time, unbounded number of attempts, and up to 2 seconds of sleep * time in between attempts. * * This is the policy that we use in keeper_register_and_init. When using * automated provisioning tools and frameworks, it might be that every node is * provisionned concurrently and we might try to connect to the monitor before * it's ready. In that case we want to retry for a long time. 
*/
void
pgsql_set_init_retry_policy(ConnectionRetryPolicy *retryPolicy)
{
	(void) pgsql_set_retry_policy(retryPolicy,
								  POSTGRES_PING_RETRY_TIMEOUT,
								  -1, /* unbounded number of attempts */
								  POSTGRES_PING_RETRY_CAP_SLEEP_TIME,
								  POSTGRES_PING_RETRY_BASE_SLEEP_TIME);
}


/*
 * pgsql_set_interactive_retry_policy sets the retry policy to 2 seconds of
 * total retrying time (or PGCONNECT_TIMEOUT when that's set), unbounded number
 * of attempts, and up to 2 seconds of sleep time in between attempts.
 */
void
pgsql_set_interactive_retry_policy(ConnectionRetryPolicy *retryPolicy)
{
	(void) pgsql_set_retry_policy(retryPolicy,
								  pgconnect_timeout,
								  -1, /* unbounded number of attempts */
								  POSTGRES_PING_RETRY_CAP_SLEEP_TIME,
								  POSTGRES_PING_RETRY_BASE_SLEEP_TIME);
}


/*
 * pgsql_set_monitor_interactive_retry_policy sets the retry policy to 15 mins
 * of total retrying time, unbounded number of attempts, and up to 5 seconds of
 * sleep time in between attempts, starting at 1 second for the first retry.
 *
 * We use this policy in interactive commands when connecting to the monitor,
 * such as when doing pg_autoctl enable|disable maintenance.
 */
void
pgsql_set_monitor_interactive_retry_policy(ConnectionRetryPolicy *retryPolicy)
{
	int cap = 5 * 1000;         /* sleep up to 5s between attempts */
	int sleepTime = 1 * 1000;   /* first retry happens after 1 second */

	(void) pgsql_set_retry_policy(retryPolicy,
								  POSTGRES_PING_RETRY_TIMEOUT,
								  -1, /* unbounded number of attempts */
								  cap,
								  sleepTime);
}


/* NOTE(review): macro arguments are not parenthesized; only safe with simple
 * expressions as used below -- do not call with side-effecting arguments. */
#define min(a, b) (a < b ? a : b)

/*
 * http://c-faq.com/lib/randrange.html
 *
 * NOTE(review): assumes RAND_MAX matches the range of pg_lrand48() --
 * confirm, otherwise the distribution over [M, N] is skewed.
 */
#define random_between(M, N) \
	((M) + pg_lrand48() / (RAND_MAX / ((N) - (M) + 1) + 1))

/*
 * pgsql_compute_connection_retry_sleep_time returns how much time to sleep
 * this time, in milliseconds.
 *
 * This implements the "Decorrelated Jitter" strategy described in
 * https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/
 *
 *   sleep = min(cap, random_between(base, sleep * 3))
 *
 * Compared to "Full Jitter" it is better in terms of time spent, something we
 * care to optimize for even when it means more work on the monitor side.
 */
int
pgsql_compute_connection_retry_sleep_time(ConnectionRetryPolicy *retryPolicy)
{
	int previousSleepTime = retryPolicy->sleepTime;
	int sleepTime =
		random_between(retryPolicy->baseSleepTime, previousSleepTime * 3);

	retryPolicy->sleepTime = min(retryPolicy->maxSleepTime, sleepTime);

	++(retryPolicy->attempts);

	return retryPolicy->sleepTime;
}


/*
 * pgsql_retry_policy_expired returns true when we should stop retrying, either
 * per the policy (maxR / maxT) or because we received a signal that we have to
 * obey.
 */
bool
pgsql_retry_policy_expired(ConnectionRetryPolicy *retryPolicy)
{
	instr_time duration;

	/* Any signal is reason enough to break out from this retry loop. */
	if (asked_to_quit || asked_to_stop || asked_to_stop_fast || asked_to_reload)
	{
		return true;
	}

	/* set the first retry time when it's not been set previously */
	if (INSTR_TIME_IS_ZERO(retryPolicy->startTime))
	{
		INSTR_TIME_SET_CURRENT(retryPolicy->startTime);
	}

	INSTR_TIME_SET_CURRENT(duration);
	INSTR_TIME_SUBTRACT(duration, retryPolicy->startTime);

	/*
	 * We stop retrying as soon as we have spent all of our time budget or all
	 * of our attempts count budget, whichever comes first.
	 *
	 * maxR = 0 (zero) means no retry at all, checked before the loop
	 * maxR < 0 (zero) means unlimited number of retries
	 */
	if ((INSTR_TIME_GET_MILLISEC(duration) >= (retryPolicy->maxT * 1000)) ||
		(retryPolicy->maxR > 0 &&
		 retryPolicy->attempts >= retryPolicy->maxR))
	{
		return true;
	}

	return false;
}


/*
 * ConnectionTypeToString transforms a connectionType in a string to be used in
 * a user facing message.
 */
static char *
ConnectionTypeToString(ConnectionType connectionType)
{
	switch (connectionType)
	{
		case PGSQL_CONN_LOCAL:
		{
			return "local";
		}

		case PGSQL_CONN_MONITOR:
		{
			return "monitor";
		}

		case PGSQL_CONN_COORDINATOR:
		{
			return "coordinator";
		}

		case PGSQL_CONN_UPSTREAM:
		{
			return "upstream";
		}

		default:
		{
			return "unknown connection type";
		}
	}
}


/*
 * Finish a PGSQL client connection.
*/
void
pgsql_finish(PGSQL *pgsql)
{
	if (pgsql->connection != NULL)
	{
		char scrubbed[MAXCONNINFO] = { 0 };

		/* hide any password before logging the connection string */
		if (!parse_and_scrub_connection_string(pgsql->connectionString,
											   scrubbed))
		{
			log_debug("Failed to scrub password from connection string");

			strlcpy(scrubbed, pgsql->connectionString, sizeof(scrubbed));
		}

		log_debug("Disconnecting from [%s] \"%s\"",
				  ConnectionTypeToString(pgsql->connectionType),
				  scrubbed);

		PQfinish(pgsql->connection);
		pgsql->connection = NULL;

		/*
		 * When we fail to connect, on the way out we call pgsql_finish to
		 * reset the connection to NULL. We still want the callers to be able
		 * to inquire about our connection status, so refrain to reset the
		 * status.
		 */
	}

	pgsql->connectionStatementType = PGSQL_CONNECTION_SINGLE_STATEMENT;
}


/*
 * log_connection_error logs the PQerrorMessage from the given connection,
 * one log line per line of the libpq error message.
 */
static void
log_connection_error(PGconn *connection, int logLevel)
{
	char *message = connection != NULL ? PQerrorMessage(connection) : NULL;
	char *errorLines[BUFSIZE] = { 0 };
	int lineCount = splitLines(message, errorLines, BUFSIZE);

	/* PQerrorMessage is then "connection pointer is NULL", not helpful */
	if (connection == NULL)
	{
		return;
	}

	for (int n = 0; n < lineCount; n++)
	{
		if (n == 0)
		{
			log_level(logLevel,
					  "Connection to database failed: %s", errorLines[n]);
		}
		else
		{
			log_level(logLevel, "%s", errorLines[n]);
		}
	}
}


/*
 * pgsql_open_connection opens a PostgreSQL connection, given a PGSQL client
 * instance. If a connection is already open in the client (it's not NULL),
 * then this errors, unless we are inside a transaction opened by pgsql_begin.
*/
static PGconn *
pgsql_open_connection(PGSQL *pgsql)
{
	/* we might be connected already */
	if (pgsql->connection != NULL)
	{
		/* only multi-statement mode may reuse an open connection */
		if (pgsql->connectionStatementType !=
			PGSQL_CONNECTION_MULTI_STATEMENT)
		{
			log_error("BUG: requested to open an already open connection in "
					  "non PGSQL_CONNECTION_MULTI_STATEMENT mode");
			pgsql_finish(pgsql);
			return NULL;
		}
		return pgsql->connection;
	}

	char scrubbedConnectionString[MAXCONNINFO] = { 0 };

	(void) parse_and_scrub_connection_string(pgsql->connectionString,
											 scrubbedConnectionString);

	log_debug("Connecting to [%s] \"%s\"",
			  ConnectionTypeToString(pgsql->connectionType),
			  scrubbedConnectionString);

	/* we implement our own retry strategy */
	setenv("PGCONNECT_TIMEOUT", POSTGRES_CONNECT_TIMEOUT, 1);

	/* register our starting time */
	INSTR_TIME_SET_CURRENT(pgsql->retryPolicy.startTime);
	INSTR_TIME_SET_ZERO(pgsql->retryPolicy.connectTime);

	/* Make a connection to the database */
	pgsql->connection = PQconnectdb(pgsql->connectionString);

	/* Check to see that the backend connection was successfully made */
	if (PQstatus(pgsql->connection) != CONNECTION_OK)
	{
		/*
		 * Implement the retry policy:
		 *
		 * First observe the maxR property: maximum retries allowed. When set
		 * to zero, we don't retry at all.
		 */
		if (pgsql->retryPolicy.maxR == 0)
		{
			INSTR_TIME_SET_CURRENT(pgsql->retryPolicy.connectTime);

			(void) log_connection_error(pgsql->connection, LOG_ERROR);

			log_error("Failed to connect to %s database at \"%s\", "
					  "see above for details",
					  ConnectionTypeToString(pgsql->connectionType),
					  pgsql->connectionString);

			pgsql->status = PG_CONNECTION_BAD;

			pgsql_finish(pgsql);
			return NULL;
		}

		/*
		 * If we reach this part of the code, the connectionType is not LOCAL
		 * and the retryPolicy has a non-zero maximum retry count. Let's retry!
		 */
		if (!pgsql_retry_open_connection(pgsql))
		{
			/* errors have already been logged */
			return NULL;
		}
	}

	INSTR_TIME_SET_CURRENT(pgsql->retryPolicy.connectTime);
	pgsql->status = PG_CONNECTION_OK;

	/* set the libpq notice receiver to integrate notifications as warnings. */
	PQsetNoticeProcessor(pgsql->connection,
						 &pgAutoCtlDefaultNoticeProcessor,
						 NULL);

	return pgsql->connection;
}


/*
 * Refrain from warning too often. The user certainly wants to know that we are
 * still trying to connect, though warning several times a second is not going
 * to help anyone. A good trade-off seems to be a warning every 30s.
 */
#define SHOULD_WARN_AGAIN(duration) \
	(INSTR_TIME_GET_MILLISEC(duration) > 30000)

/*
 * pgsql_retry_open_connection loops over a PQping call until the remote server
 * is ready to accept connections, and then connects to it and returns true
 * when it could connect, false otherwise.
 */
static bool
pgsql_retry_open_connection(PGSQL *pgsql)
{
	bool connectionOk = false;

	PGPing lastWarningMessage = PQPING_OK;
	instr_time lastWarningTime;

	INSTR_TIME_SET_ZERO(lastWarningTime);

	char scrubbedConnectionString[MAXCONNINFO] = { 0 };

	(void) parse_and_scrub_connection_string(pgsql->connectionString,
											 scrubbedConnectionString);

	log_warn("Failed to connect to \"%s\", retrying until "
			 "the server is ready", scrubbedConnectionString);

	/* should not happen */
	if (pgsql->retryPolicy.maxR == 0)
	{
		return false;
	}

	/* reset our internal counter before entering the retry loop */
	pgsql->retryPolicy.attempts = 1;

	while (!connectionOk)
	{
		/* time budget or attempts budget exhausted, or a signal arrived */
		if (pgsql_retry_policy_expired(&(pgsql->retryPolicy)))
		{
			instr_time duration;

			INSTR_TIME_SET_CURRENT(duration);
			INSTR_TIME_SUBTRACT(duration, pgsql->retryPolicy.startTime);

			(void) log_connection_error(pgsql->connection, LOG_ERROR);
			pgsql->status = PG_CONNECTION_BAD;
			pgsql_finish(pgsql);

			log_error("Failed to connect to \"%s\" "
					  "after %d attempts in %d ms, "
					  "pg_autoctl stops retrying now",
					  scrubbedConnectionString,
					  pgsql->retryPolicy.attempts,
					  (int) INSTR_TIME_GET_MILLISEC(duration));

			return false;
		}

		/*
		 * Now compute how much time to wait for this round, and increment how
		 * many times we tried to connect already.
		 */
		int sleep =
			pgsql_compute_connection_retry_sleep_time(&(pgsql->retryPolicy));

		/* we have milliseconds, pg_usleep() wants microseconds */
		(void) pg_usleep(sleep * 1000);

		log_debug("PQping(%s): slept %d ms on attempt %d",
				  scrubbedConnectionString,
				  pgsql->retryPolicy.sleepTime,
				  pgsql->retryPolicy.attempts);

		switch (PQping(pgsql->connectionString))
		{
			/*
			 * https://www.postgresql.org/docs/current/libpq-connect.html
			 *
			 * The server is running and appears to be accepting connections.
			 */
			case PQPING_OK:
			{
				log_debug("PQping OK after %d attempts",
						  pgsql->retryPolicy.attempts);

				/*
				 * Ping is now ok, and connection is still NULL because the
				 * first attempt to connect failed. Now is a good time to
				 * establish the connection.
				 *
				 * PQping does not check authentication, so we might still fail
				 * to connect to the server.
				 */
				pgsql->connection = PQconnectdb(pgsql->connectionString);

				if (PQstatus(pgsql->connection) == CONNECTION_OK)
				{
					instr_time duration;

					INSTR_TIME_SET_CURRENT(duration);

					connectionOk = true;
					pgsql->status = PG_CONNECTION_OK;
					pgsql->retryPolicy.connectTime = duration;

					INSTR_TIME_SUBTRACT(duration,
										pgsql->retryPolicy.startTime);

					log_info("Successfully connected to \"%s\" "
							 "after %d attempts in %d ms.",
							 scrubbedConnectionString,
							 pgsql->retryPolicy.attempts,
							 (int) INSTR_TIME_GET_MILLISEC(duration));
				}
				else
				{
					instr_time durationSinceLastWarning;

					INSTR_TIME_SET_CURRENT(durationSinceLastWarning);
					INSTR_TIME_SUBTRACT(durationSinceLastWarning,
										lastWarningTime);

					if (lastWarningMessage != PQPING_OK ||
						SHOULD_WARN_AGAIN(durationSinceLastWarning))
					{
						lastWarningMessage = PQPING_OK;
						INSTR_TIME_SET_CURRENT(lastWarningTime);

						/*
						 * Only show details when that's the last attempt,
						 * otherwise accept that this may happen as a transient
						 * state.
						 */
						(void) log_connection_error(pgsql->connection,
													LOG_DEBUG);

						log_debug("Failed to connect after successful ping");
					}
				}
				break;
			}

			/*
			 * https://www.postgresql.org/docs/current/libpq-connect.html
			 *
			 * The server is running but is in a state that disallows
			 * connections (startup, shutdown, or crash recovery).
			 */
			case PQPING_REJECT:
			{
				instr_time durationSinceLastWarning;

				INSTR_TIME_SET_CURRENT(durationSinceLastWarning);
				INSTR_TIME_SUBTRACT(durationSinceLastWarning, lastWarningTime);

				if (lastWarningMessage != PQPING_REJECT ||
					SHOULD_WARN_AGAIN(durationSinceLastWarning))
				{
					lastWarningMessage = PQPING_REJECT;
					INSTR_TIME_SET_CURRENT(lastWarningTime);

					log_warn(
						"The server at \"%s\" is running but is in a state "
						"that disallows connections (startup, shutdown, or "
						"crash recovery).",
						scrubbedConnectionString);
				}
				break;
			}

			/*
			 * https://www.postgresql.org/docs/current/libpq-connect.html
			 *
			 * The server could not be contacted. This might indicate that the
			 * server is not running, or that there is something wrong with the
			 * given connection parameters (for example, wrong port number), or
			 * that there is a network connectivity problem (for example, a
			 * firewall blocking the connection request).
			 */
			case PQPING_NO_RESPONSE:
			{
				instr_time durationSinceStart, durationSinceLastWarning;

				INSTR_TIME_SET_CURRENT(durationSinceStart);
				INSTR_TIME_SUBTRACT(durationSinceStart,
									pgsql->retryPolicy.startTime);

				INSTR_TIME_SET_CURRENT(durationSinceLastWarning);
				INSTR_TIME_SUBTRACT(durationSinceLastWarning, lastWarningTime);

				/* no message at all the first 30s: 30000ms */
				if (SHOULD_WARN_AGAIN(durationSinceStart) &&
					(lastWarningMessage != PQPING_NO_RESPONSE ||
					 SHOULD_WARN_AGAIN(durationSinceLastWarning)))
				{
					lastWarningMessage = PQPING_NO_RESPONSE;
					INSTR_TIME_SET_CURRENT(lastWarningTime);

					/*
					 * NOTE(review): the literal below reads "(milliseconds. "
					 * -- the closing paren appears to have been lost in
					 * extraction; confirm against upstream before touching
					 * the string.
					 */
					log_warn(
						"The server at \"%s\" could not be contacted "
						"after %d attempts in %d ms (milliseconds. "
						"This might indicate that the server is not running, "
						"or that there is something wrong with the given "
						"connection parameters (for example, wrong port "
						"number), or that there is a network connectivity "
						"problem (for example, a firewall blocking the "
						"connection request).",
						scrubbedConnectionString,
						pgsql->retryPolicy.attempts,
						(int) INSTR_TIME_GET_MILLISEC(durationSinceStart));
				}
				break;
			}

			/*
			 * https://www.postgresql.org/docs/current/libpq-connect.html
			 *
			 * No attempt was made to contact the server, because the supplied
			 * parameters were obviously incorrect or there was some
			 * client-side problem (for example, out of memory).
			 */
			case PQPING_NO_ATTEMPT:
			{
				lastWarningMessage = PQPING_NO_ATTEMPT;
				log_debug("Failed to ping server \"%s\" because of "
						  "client-side problems (no attempt were made)",
						  scrubbedConnectionString);
				break;
			}
		}
	}

	if (!connectionOk && pgsql->connection != NULL)
	{
		INSTR_TIME_SET_CURRENT(pgsql->retryPolicy.connectTime);

		(void) log_connection_error(pgsql->connection, LOG_ERROR);
		pgsql->status = PG_CONNECTION_BAD;
		pgsql_finish(pgsql);
		return false;
	}

	return true;
}


/*
 * pgAutoCtlDefaultNoticeProcessor is our default PostgreSQL libpq Notice
 * Processing: NOTICE, WARNING, HINT etc are processed as log_warn messages by
 * default.
 */
static void
pgAutoCtlDefaultNoticeProcessor(void *arg, const char *message)
{
	char *m = strdup(message);
	char *lines[BUFSIZE];
	int lineCount = splitLines(m, lines, BUFSIZE);
	int lineNumber = 0;

	for (lineNumber = 0; lineNumber < lineCount; lineNumber++)
	{
		log_warn("%s", lines[lineNumber]);
	}

	free(m);
}


/*
 * pgAutoCtlDebugNoticeProcessor is our PostgreSQL libpq Notice Processing to
 * use when wanting to send NOTICE, WARNING, HINT as log_debug messages.
*/ static void pgAutoCtlDebugNoticeProcessor(void *arg, const char *message) { char *m = strdup(message); char *lines[BUFSIZE]; int lineCount = splitLines(m, lines, BUFSIZE); int lineNumber = 0; for (lineNumber = 0; lineNumber < lineCount; lineNumber++) { log_debug("%s", lines[lineNumber]); } free(m); } /* * pgsql_begin is responsible for opening a mutli statement connection and * opening a transaction block by issuing a 'BEGIN' query. */ bool pgsql_begin(PGSQL *pgsql) { /* * Indicate that we're running a transaction, so that the connection is not * closed after each query automatically. It also allows us to detect bugs * easily. We need to do this before executing BEGIN, because otherwise the * connection is closed after the BEGIN statement automatically. */ pgsql->connectionStatementType = PGSQL_CONNECTION_MULTI_STATEMENT; if (!pgsql_execute(pgsql, "BEGIN")) { /* * We need to manually call pgsql_finish to clean up here in case of * this failure, because we have set the statement type to MULTI. */ pgsql_finish(pgsql); return false; } return true; } /* * pgsql_rollback is responsible for issuing a 'ROLLBACK' query to an already * opened transaction, usually via a previous pgsql_begin() command. * * It closes the connection but leaves the error contents, if any, for the user * to examine should it is wished for. */ bool pgsql_rollback(PGSQL *pgsql) { bool result; if (pgsql->connectionStatementType != PGSQL_CONNECTION_MULTI_STATEMENT || pgsql->connection == NULL) { log_error("BUG: call to pgsql_rollback without holding an open " "multi statement connection"); return false; } result = pgsql_execute(pgsql, "ROLLBACK"); /* * Connection might be be closed during the pgsql_execute(), notably in case * of error. Be explicit and close it regardless though. */ if (pgsql->connection) { pgsql_finish(pgsql); } return result; } /* * pgsql_commit is responsible for issuing a 'COMMIT' query to an already * opened transaction, usually via a previous pgsql_begin() command. 
* * It closes the connection but leaves the error contents, if any, for the user * to examine should it is wished for. */ bool pgsql_commit(PGSQL *pgsql) { bool result; if (pgsql->connectionStatementType != PGSQL_CONNECTION_MULTI_STATEMENT || pgsql->connection == NULL) { log_error("BUG: call to pgsql_commit() without holding an open " "multi statement connection"); if (pgsql->connection) { pgsql_finish(pgsql); } return false; } result = pgsql_execute(pgsql, "COMMIT"); /* * Connection might be be closed during the pgsql_execute(), notably in case * of error. Be explicit and close it regardless though. */ if (pgsql->connection) { pgsql_finish(pgsql); } return result; } /* * pgsql_execute opens a connection, runs a given SQL command, and closes * the connection again. * * We avoid persisting connection across multiple commands to simplify error * handling. */ bool pgsql_execute(PGSQL *pgsql, const char *sql) { return pgsql_execute_with_params(pgsql, sql, 0, NULL, NULL, NULL, NULL); } /* * pgsql_execute_with_params opens a connection, runs a given SQL command, * and closes the connection again. * * We avoid persisting connection across multiple commands to simplify error * handling. 
*/
bool
pgsql_execute_with_params(PGSQL *pgsql, const char *sql, int paramCount,
						  const Oid *paramTypes, const char **paramValues,
						  void *context, ParsePostgresResultCB *parseFun)
{
	char debugParameters[BUFSIZE] = { 0 };
	PGresult *result = NULL;

	PGconn *connection = pgsql_open_connection(pgsql);

	if (connection == NULL)
	{
		return false;
	}

	log_debug("%s;", sql);

	/* build a human readable rendition of the parameters, for debug logs */
	if (paramCount > 0)
	{
		int paramIndex = 0;
		int remainingBytes = BUFSIZE;
		char *writePointer = (char *) debugParameters;

		for (paramIndex = 0; paramIndex < paramCount; paramIndex++)
		{
			int bytesWritten = 0;
			const char *value = paramValues[paramIndex];

			if (paramIndex > 0)
			{
				bytesWritten = sformat(writePointer, remainingBytes, ", ");
				remainingBytes -= bytesWritten;
				writePointer += bytesWritten;
			}

			if (value == NULL)
			{
				bytesWritten = sformat(writePointer, remainingBytes, "NULL");
			}
			else
			{
				bytesWritten =
					sformat(writePointer, remainingBytes, "'%s'", value);
			}
			remainingBytes -= bytesWritten;
			writePointer += bytesWritten;
		}
		log_debug("%s", debugParameters);
	}

	if (paramCount == 0)
	{
		result = PQexec(connection, sql);
	}
	else
	{
		/* text format (last argument 0) for both parameters and results */
		result = PQexecParams(connection, sql,
							  paramCount, paramTypes, paramValues,
							  NULL, NULL, 0);
	}

	if (!is_response_ok(result))
	{
		char *sqlstate = PQresultErrorField(result, PG_DIAG_SQLSTATE);
		char *message = PQerrorMessage(connection);
		char *errorLines[BUFSIZE];
		int lineCount = splitLines(message, errorLines, BUFSIZE);
		int lineNumber = 0;

		char *prefix =
			pgsql->connectionType == PGSQL_CONN_MONITOR ?
			"Monitor" : "Postgres";

		/*
		 * PostgreSQL Error message might contain several lines. Log each of
		 * them as a separate ERROR line here.
		 */
		for (lineNumber = 0; lineNumber < lineCount; lineNumber++)
		{
			log_error("%s %s", prefix, errorLines[lineNumber]);
		}

		/*
		 * The monitor uses those error codes in situations we know how to
		 * handle, so if we have one of those, it's not a client-side error
		 * with a badly formed SQL query etc.
		 */
		if (pgsql->connectionType == PGSQL_CONN_MONITOR &&
			sqlstate != NULL &&
			!(strcmp(sqlstate, ERRCODE_INVALID_OBJECT_DEFINITION) == 0 ||
			  strcmp(sqlstate,
					 ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE) == 0 ||
			  strcmp(sqlstate, ERRCODE_OBJECT_IN_USE) == 0 ||
			  strcmp(sqlstate, ERRCODE_UNDEFINED_OBJECT) == 0))
		{
			log_error("SQL query: %s", sql);
			log_error("SQL params: %s", debugParameters);
		}
		else
		{
			log_debug("SQL query: %s", sql);
			log_debug("SQL params: %s", debugParameters);
		}

		/* now stash away the SQL STATE if any */
		if (context && sqlstate)
		{
			AbstractResultContext *ctx = (AbstractResultContext *) context;

			strlcpy(ctx->sqlstate, sqlstate, SQLSTATE_LENGTH);
		}

		/* if we get a connection exception, track that */
		if (sqlstate &&
			strncmp(sqlstate,
					STR_ERRCODE_CLASS_CONNECTION_EXCEPTION, 2) == 0)
		{
			pgsql->status = PG_CONNECTION_BAD;
		}

		PQclear(result);
		clear_results(pgsql);

		/*
		 * Multi statements might want to ROLLBACK and hold to the open
		 * connection for a retry step.
		 */
		if (pgsql->connectionStatementType ==
			PGSQL_CONNECTION_SINGLE_STATEMENT)
		{
			PQfinish(pgsql->connection);
			pgsql->connection = NULL;
		}

		return false;
	}

	/* the caller's callback consumes the result before we clear it */
	if (parseFun != NULL)
	{
		(*parseFun)(context, result);
	}

	PQclear(result);
	clear_results(pgsql);

	if (pgsql->connectionStatementType == PGSQL_CONNECTION_SINGLE_STATEMENT)
	{
		PQfinish(pgsql->connection);
		pgsql->connection = NULL;
	}

	return true;
}


/*
 * is_response_ok returns whether the query result is a correct response
 * (not an error or failure).
 */
static bool
is_response_ok(PGresult *result)
{
	ExecStatusType resultStatus = PQresultStatus(result);

	return resultStatus == PGRES_SINGLE_TUPLE ||
		   resultStatus == PGRES_TUPLES_OK ||
		   resultStatus == PGRES_COMMAND_OK;
}


/*
 * clear_results consumes results on a connection until NULL is returned.
 * If an error is returned it returns false.
*/
static bool
clear_results(PGSQL *pgsql)
{
	PGconn *connection = pgsql->connection;

	/*
	 * Per Postgres documentation: You should, however, remember to check
	 * PQnotifies after each PQgetResult or PQexec, to see if any
	 * notifications came in during the processing of the command.
	 *
	 * Before calling clear_results(), we called PQexecParams().
	 */
	(void) pgsql_handle_notifications(pgsql);

	while (true)
	{
		PGresult *result = PQgetResult(connection);

		/*
		 * Per Postgres documentation: You should, however, remember to check
		 * PQnotifies after each PQgetResult or PQexec, to see if any
		 * notifications came in during the processing of the command.
		 *
		 * Here, we just called PQgetResult().
		 */
		(void) pgsql_handle_notifications(pgsql);

		/* NULL means the command is fully processed */
		if (result == NULL)
		{
			break;
		}

		if (!is_response_ok(result))
		{
			log_error("Failure from Postgres: %s", PQerrorMessage(connection));

			PQclear(result);
			pgsql_finish(pgsql);
			return false;
		}

		PQclear(result);
	}

	return true;
}


/*
 * pgsql_handle_notifications check PQnotifies when a PGSQL notificationChannel
 * has been set. Then if the parsed notification is from the
 * notificationGroupId we set notificationReceived and also log the
 * notification.
 *
 * This allow another part of the code to later know that some notifications
 * have been received.
 */
static void
pgsql_handle_notifications(PGSQL *pgsql)
{
	PGconn *connection = pgsql->connection;
	PGnotify *notify;

	/* nothing to do when the caller did not install a processing function */
	if (pgsql->notificationProcessFunction == NULL)
	{
		return;
	}

	PQconsumeInput(connection);
	while ((notify = PQnotifies(connection)) != NULL)
	{
		log_trace("pgsql_handle_notifications: \"%s\"", notify->extra);

		/* the process function decides whether this notification is ours */
		if ((*pgsql->notificationProcessFunction)(pgsql->notificationGroupId,
												  pgsql->notificationNodeId,
												  notify->relname,
												  notify->extra))
		{
			/* mark that we received some notifications */
			pgsql->notificationReceived = true;
		}

		PQfreemem(notify);
		PQconsumeInput(connection);
	}
}


/*
 * pgsql_is_in_recovery connects to PostgreSQL and sets the is_in_recovery
 * boolean to the result of the SELECT pg_is_in_recovery() query.
It returns
 * false when something went wrong doing that.
 */
bool
pgsql_is_in_recovery(PGSQL *pgsql, bool *is_in_recovery)
{
	SingleValueResultContext context = { { 0 }, PGSQL_RESULT_BOOL, false };
	char *sql = "SELECT pg_is_in_recovery()";

	bool executed = pgsql_execute_with_params(pgsql, sql, 0, NULL, NULL,
											  &context,
											  &parseSingleValueResult);

	if (!executed)
	{
		/* errors have been logged already */
		return false;
	}

	if (!context.parsedOk)
	{
		log_error("Failed to get result from pg_is_in_recovery()");
		return false;
	}

	/* hand the parsed boolean back to the caller */
	*is_in_recovery = context.boolVal;

	return true;
}


/*
 * pgsql_check_postgresql_settings connects to our local PostgreSQL instance
 * and verifies that our minimal viable configuration is in place by running a
 * SQL query that looks at the current settings.
 */
bool
pgsql_check_postgresql_settings(PGSQL *pgsql, bool isCitusInstanceKind,
								bool *settings_are_ok)
{
	SingleValueResultContext context = { { 0 }, PGSQL_RESULT_BOOL, false };
	const char *sql;

	/* a Citus node has extra requirements compared to a plain Postgres one */
	if (isCitusInstanceKind)
	{
		sql = CHECK_CITUS_NODE_SETTINGS_SQL;
	}
	else
	{
		sql = CHECK_POSTGRESQL_NODE_SETTINGS_SQL;
	}

	if (!pgsql_execute_with_params(pgsql, sql, 0, NULL, NULL,
								   &context, &parseSingleValueResult))
	{
		/* errors have been logged already */
		return false;
	}

	if (!context.parsedOk)
	{
		/* errors have already been logged */
		return false;
	}

	*settings_are_ok = context.boolVal;

	return true;
}


/*
 * pgsql_check_monitor_settings connects to the given pgsql instance to check
 * that pgautofailover is part of shared_preload_libraries.
*/ bool pgsql_check_monitor_settings(PGSQL *pgsql, bool *settings_are_ok) { SingleValueResultContext context = { { 0 }, PGSQL_RESULT_BOOL, false }; const char *sql = "select exists(select 1 from " "unnest(" "string_to_array(current_setting('shared_preload_libraries'), ','))" " as t(name) " "where trim(name) = 'pgautofailover');"; if (!pgsql_execute_with_params(pgsql, sql, 0, NULL, NULL, &context, &parseSingleValueResult)) { /* errors have been logged already */ return false; } if (!context.parsedOk) { /* errors have already been logged */ return false; } *settings_are_ok = context.boolVal; return true; } /* * postgres_sprintf_replicationSlotName prints the replication Slot Name to use * for given nodeId in the given slotName buffer of given size. */ bool postgres_sprintf_replicationSlotName(int64_t nodeId, char *slotName, int size) { int bytesWritten = sformat(slotName, size, "%s_%" PRId64, REPLICATION_SLOT_NAME_DEFAULT, nodeId); return bytesWritten <= size; } /* * pgsql_set_synchronous_standby_names set synchronous_standby_names on the * local Postgres to the value computed on the pg_auto_failover monitor. */ bool pgsql_set_synchronous_standby_names(PGSQL *pgsql, char *synchronous_standby_names) { char quoted[BUFSIZE] = { 0 }; GUC setting = { "synchronous_standby_names", quoted }; if (sformat(quoted, BUFSIZE, "'%s'", synchronous_standby_names) >= BUFSIZE) { log_error("Failed to apply the synchronous_standby_names value \"%s\": " "pg_autoctl supports values up to %d bytes and this one " "requires %lu bytes", synchronous_standby_names, BUFSIZE, (unsigned long) strlen(synchronous_standby_names)); return false; } return pgsql_alter_system_set(pgsql, setting); } /* * pgsql_replication_slot_maintain advances the current confirmed position of * the given replication slot up to the given LSN position, create the * replication slot if it does not exist yet, and remove the slots that exist * in Postgres but are ommited in the given array of slots. 
*/ typedef struct ReplicationSlotMaintainContext { char sqlstate[SQLSTATE_LENGTH]; char operation[NAMEDATALEN]; char slotName[BUFSIZE]; char lsn[PG_LSN_MAXLENGTH]; bool parsedOK; } ReplicationSlotMaintainContext; /* * pgsql_replication_slot_exists checks that a replication slot with the given * slotName exists on the Postgres server. */ bool pgsql_replication_slot_exists(PGSQL *pgsql, const char *slotName, bool *slotExists) { SingleValueResultContext context = { { 0 }, PGSQL_RESULT_BOOL, false }; char *sql = "SELECT 1 FROM pg_replication_slots WHERE slot_name = $1"; int paramCount = 1; Oid paramTypes[1] = { NAMEOID }; const char *paramValues[1] = { slotName }; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &context, &fetchedRows)) { /* errors have already been logged */ return false; } if (!context.parsedOk) { log_error("Failed to check if the replication slot \"%s\" exists", slotName); return false; } /* we receive 0 rows in the result when the slot does not exist yet */ *slotExists = context.intVal == 1; return true; } /* * pgsql_create_replication_slot tries to create a replication slot on the * database identified by a connection string. It's implemented as CREATE IF * NOT EXISTS so that it's idempotent and can be retried easily. */ bool pgsql_create_replication_slot(PGSQL *pgsql, const char *slotName) { ReplicationSlotMaintainContext context = { 0 }; char *sql = "SELECT 'create', slot_name, lsn " " FROM pg_create_physical_replication_slot($1) " " WHERE NOT EXISTS " " (SELECT 1 FROM pg_replication_slots WHERE slot_name = $1)"; const Oid paramTypes[1] = { TEXTOID }; const char *paramValues[1] = { slotName }; log_trace("pgsql_create_replication_slot"); /* * parseReplicationSlotMaintain will log_info() the replication slot * creation if it happens. When the slot already exists we return 0 row and * remain silent about it. 
*/ return pgsql_execute_with_params(pgsql, sql, 1, paramTypes, paramValues, &context, parseReplicationSlotMaintain); } /* * pgsql_drop_replication_slot drops a given replication slot. */ bool pgsql_drop_replication_slot(PGSQL *pgsql, const char *slotName) { char *sql = "SELECT pg_drop_replication_slot(slot_name) " " FROM pg_replication_slots " " WHERE slot_name = $1"; Oid paramTypes[1] = { TEXTOID }; const char *paramValues[1] = { slotName }; log_info("Drop replication slot \"%s\"", slotName); return pgsql_execute_with_params(pgsql, sql, 1, paramTypes, paramValues, NULL, NULL); } /* * BuildNodesArrayValues build the SQL expression to use in a FROM clause to * represent the list of other standby nodes from the given nodeArray. * * Such a list looks either like: * * VALUES($1, $2::pg_lsn), ($3, $4) * * or for an empty set (e.g. when we're the only standby): * * SELECT id, lsn * FROM (values (null::int, null::pg_lsn)) as t(id, lsn) * WHERE false * * We actually need to provide an empty set (0 rows) with columns of the * expected data types so that we can join against the existing replication * slots and drop them. If the set is empty, we drop all the slots. * * We return how many parameters we filled in paramTypes and paramValues from * the nodeArray. */ #define NODEID_MAX_LENGTH 20 /* should allow for bigint digits */ typedef struct nodesArraysValuesParams { int count; Oid types[NODE_ARRAY_MAX_COUNT * 2]; char *values[NODE_ARRAY_MAX_COUNT * 2]; /* * Pre-allocate arrays for the data separately from the values array, which * needs to be a (const char **) thing rather than a (char [][]) thing, * because of the pgsql_execute_with_params and libpq APIs. 
*/ char nodeIds[NODE_ARRAY_MAX_COUNT][NODEID_MAX_LENGTH]; char lsns[NODE_ARRAY_MAX_COUNT][PG_LSN_MAXLENGTH]; } nodesArraysValuesParams; static bool BuildNodesArrayValues(NodeAddressArray *nodeArray, nodesArraysValuesParams *sqlParams, PQExpBuffer values) { int nodeIndex = 0; int paramIndex = 0; /* when we didn't find any node to process, return our empty set */ if (nodeArray->count == 0) { appendPQExpBufferStr( values, "SELECT id, lsn " "FROM (values (null::int, null::pg_lsn)) as t(id, lsn) " "where false"); return true; } /* we start the VALUES subquery with the values SQL keyword */ appendPQExpBufferStr(values, "values "); /* * Build a SQL VALUES statement for every other node registered in the * system, so that we can maintain their LSN position locally on a standby * server. */ for (nodeIndex = 0; nodeIndex < nodeArray->count; nodeIndex++) { NodeAddress *node = &(nodeArray->nodes[nodeIndex]); char *nodeIdString = intToString(node->nodeId).strValue; int idParamIndex = paramIndex; int lsnParamIndex = paramIndex + 1; sqlParams->types[idParamIndex] = INT8OID; strlcpy(sqlParams->nodeIds[nodeIndex], nodeIdString, NODEID_MAX_LENGTH); /* store the (char *) pointer to the data in values */ sqlParams->values[idParamIndex] = sqlParams->nodeIds[nodeIndex]; sqlParams->types[lsnParamIndex] = LSNOID; strlcpy(sqlParams->lsns[nodeIndex], node->lsn, PG_LSN_MAXLENGTH); /* store the (char *) pointer to the data in values */ sqlParams->values[lsnParamIndex] = sqlParams->lsns[nodeIndex]; appendPQExpBuffer(values, "%s($%d, $%d%s)", paramIndex == 0 ? "" : ",", /* we begin at $1 here: intentional off-by-one */ idParamIndex + 1, lsnParamIndex + 1, /* cast only the first row */ paramIndex == 0 ? 
"::pg_lsn" : ""); /* prepare next round */ paramIndex += 2; } /* count how many parameters where appended to the VALUES() parts */ sqlParams->count = paramIndex; return true; } /* * pgsql_replication_slot_create_and_drop drops replication slots that belong * to nodes that have been removed, and creates replication slots for nodes * that have been newly registered. We call that function on the primary, where * the slots are maintained by the replication protocol. * * On the standby nodes, we advance the slots ourselves and use the other * function pgsql_replication_slot_maintain which is complete (create, drop, * advance). */ bool pgsql_replication_slot_create_and_drop(PGSQL *pgsql, NodeAddressArray *nodeArray) { PQExpBuffer query = createPQExpBuffer(); PQExpBuffer values = createPQExpBuffer(); /* *INDENT-OFF* */ char *sqlTemplate = /* * We could simplify the writing of this query, but we prefer that it * looks as much as possible like the query used in * pgsql_replication_slot_maintain() so that we can maintain both * easily. 
*/ "WITH nodes(slot_name, lsn) as (" " SELECT '" REPLICATION_SLOT_NAME_DEFAULT "_' || id, lsn" " FROM (%s) as sb(id, lsn) " "), \n" "dropped as (" " SELECT slot_name, pg_drop_replication_slot(slot_name) " " FROM pg_replication_slots pgrs LEFT JOIN nodes USING(slot_name) " " WHERE nodes.slot_name IS NULL " " AND ( slot_name ~ '" REPLICATION_SLOT_NAME_PATTERN "' " " OR slot_name ~ '" REPLICATION_SLOT_NAME_DEFAULT "' )" " AND not active" " AND slot_type = 'physical'" "), \n" "created as (" "SELECT c.slot_name, c.lsn " " FROM nodes LEFT JOIN pg_replication_slots pgrs USING(slot_name), " " LATERAL pg_create_physical_replication_slot(slot_name, true) c" " WHERE pgrs.slot_name IS NULL " ") \n" "SELECT 'create', slot_name, lsn FROM created " " union all " "SELECT 'drop', slot_name, NULL::pg_lsn FROM dropped"; /* *INDENT-ON* */ nodesArraysValuesParams sqlParams = { 0 }; ReplicationSlotMaintainContext context = { 0 }; if (!BuildNodesArrayValues(nodeArray, &sqlParams, values)) { /* errors have already been logged */ destroyPQExpBuffer(query); destroyPQExpBuffer(values); return false; } /* add the computed ($1,$2), ... string to the query "template" */ appendPQExpBuffer(query, sqlTemplate, values->data); bool success = pgsql_execute_with_params(pgsql, query->data, sqlParams.count, sqlParams.types, (const char **) sqlParams.values, &context, parseReplicationSlotMaintain); destroyPQExpBuffer(query); destroyPQExpBuffer(values); return success; } /* * pgsql_replication_slot_maintain creates, drops, and advance replication * slots that belong to other standby nodes. We call that function on the * standby nodes, where the slots are maintained manually just in case we need * them at failover. 
*/ bool pgsql_replication_slot_maintain(PGSQL *pgsql, NodeAddressArray *nodeArray) { PQExpBuffer query = createPQExpBuffer(); PQExpBuffer values = createPQExpBuffer(); /* *INDENT-OFF* */ char *sqlTemplate = "WITH nodes(slot_name, lsn) as (" " SELECT '" REPLICATION_SLOT_NAME_DEFAULT "_' || id, lsn" " FROM (%s) as sb(id, lsn) " "), \n" "dropped as (" " SELECT slot_name, pg_drop_replication_slot(slot_name) " " FROM pg_replication_slots pgrs LEFT JOIN nodes USING(slot_name) " " WHERE nodes.slot_name IS NULL " " AND slot_name ~ '" REPLICATION_SLOT_NAME_PATTERN "' " " AND not active" " AND slot_type = 'physical'" "), \n" "advanced as (" "SELECT a.slot_name, a.end_lsn" " FROM pg_replication_slots s JOIN nodes USING(slot_name), " " LATERAL pg_replication_slot_advance(slot_name, lsn) a" " WHERE nodes.lsn <> '0/0' and nodes.lsn >= s.restart_lsn " " and not s.active " "), \n" "created as (" "SELECT c.slot_name, c.lsn " " FROM nodes LEFT JOIN pg_replication_slots pgrs USING(slot_name), " " LATERAL pg_create_physical_replication_slot(slot_name, true) c" " WHERE pgrs.slot_name IS NULL " ") \n" "SELECT 'create', slot_name, lsn FROM created " " union all " "SELECT 'drop', slot_name, NULL::pg_lsn FROM dropped " " union all " "SELECT 'advance', slot_name, end_lsn FROM advanced "; /* *INDENT-ON* */ nodesArraysValuesParams sqlParams = { 0 }; ReplicationSlotMaintainContext context = { 0 }; if (!BuildNodesArrayValues(nodeArray, &sqlParams, values)) { /* errors have already been logged */ destroyPQExpBuffer(query); destroyPQExpBuffer(values); return false; } /* add the computed ($1,$2), ... 
string to the query "template" */ appendPQExpBuffer(query, sqlTemplate, values->data); bool success = pgsql_execute_with_params(pgsql, query->data, sqlParams.count, sqlParams.types, (const char **) sqlParams.values, &context, parseReplicationSlotMaintain); destroyPQExpBuffer(query); destroyPQExpBuffer(values); return success; } /* * parseReplicationSlotMaintain parses the result from a PostgreSQL query * fetching two columns from pg_stat_replication: sync_state and currentLSN. */ static void parseReplicationSlotMaintain(void *ctx, PGresult *result) { int rowNumber = 0; ReplicationSlotMaintainContext *context = (ReplicationSlotMaintainContext *) ctx; if (PQnfields(result) != 3) { log_error("Query returned %d columns, expected 3", PQnfields(result)); context->parsedOK = false; return; } for (rowNumber = 0; rowNumber < PQntuples(result); rowNumber++) { /* operation and slotName can't be NULL given how the SQL is built */ char *operation = PQgetvalue(result, rowNumber, 0); char *slotName = PQgetvalue(result, rowNumber, 1); char *lsn = PQgetisnull(result, rowNumber, 2) ? "" : PQgetvalue(result, rowNumber, 2); /* adding or removing another standby node is worthy of a log line */ if (strcmp(operation, "create") == 0) { log_info("Creating replication slot \"%s\"", slotName); } else if (strcmp(operation, "drop") == 0) { log_info("Dropping replication slot \"%s\"", slotName); } else { log_debug("parseReplicationSlotMaintain: %s %s %s", operation, slotName, lsn); } } context->parsedOK = true; } /* * pgsql_disable_synchronous_replication disables synchronous replication * in Postgres such that writes do not block if there is no replica. 
*/ bool pgsql_disable_synchronous_replication(PGSQL *pgsql) { GUC setting = { "synchronous_standby_names", "''" }; char *cancelBlockedStatementsCommand = "SELECT pg_cancel_backend(pid) " " FROM pg_stat_activity " " WHERE wait_event = 'SyncRep'"; log_info("Disabling synchronous replication"); if (!pgsql_alter_system_set(pgsql, setting)) { return false; } log_debug("Unblocking commands waiting for synchronous replication"); if (!pgsql_execute(pgsql, cancelBlockedStatementsCommand)) { return false; } return true; } /* * pgsql_set_default_transaction_mode_read_only makes it so that the server * won't be a target of a connection string requiring target_session_attrs * read-write by issuing ALTER SYSTEM SET transaction_mode_read_only TO on; * */ bool pgsql_set_default_transaction_mode_read_only(PGSQL *pgsql) { GUC setting = { "default_transaction_read_only", "'on'" }; log_info("Setting default_transaction_read_only to on"); return pgsql_alter_system_set(pgsql, setting); } /* * pgsql_set_default_transaction_mode_read_write makes it so that the server * can be a target of a connection string requiring target_session_attrs * read-write by issuing ALTER SYSTEM SET transaction_mode_read_only TO off; * */ bool pgsql_set_default_transaction_mode_read_write(PGSQL *pgsql) { GUC setting = { "default_transaction_read_only", "'off'" }; log_info("Setting default_transaction_read_only to off"); return pgsql_alter_system_set(pgsql, setting); } /* * pgsql_checkpoint runs a CHECKPOINT command on postgres to trigger a checkpoint. */ bool pgsql_checkpoint(PGSQL *pgsql) { return pgsql_execute(pgsql, "CHECKPOINT"); } /* * pgsql_alter_system_set runs an ALTER SYSTEM SET ... command on Postgres * to globally set a GUC and then runs pg_reload_conf() to make existing * sessions reload it. 
*/ static bool pgsql_alter_system_set(PGSQL *pgsql, GUC setting) { char command[BUFSIZE]; sformat(command, sizeof(command), "ALTER SYSTEM SET %s TO %s", setting.name, setting.value); if (!pgsql_execute(pgsql, command)) { log_error("Failed to set \"%s\" to \"%s\" with ALTER SYSTEM, " "see above for details", setting.name, setting.value); return false; } if (!pgsql_reload_conf(pgsql)) { log_error("Failed to reload Postgres config after ALTER SYSTEM " "to set \"%s\" to \"%s\".", setting.name, setting.value); return false; } return true; } /* * pgsql_reset_primary_conninfo issues the following SQL commands: * * ALTER SYSTEM RESET primary_conninfo; * ALTER SYSTEM RESET primary_slot_name; * * That's necessary to clean-up the replication settings that pg_basebackup * puts in place in postgresql.auto.conf in Postgres 12. We don't reload the * configuration after the RESET in that case, because Postgres 12 requires a * restart to apply the new setting value anyway. */ bool pgsql_reset_primary_conninfo(PGSQL *pgsql) { char *reset_primary_conninfo = "ALTER SYSTEM RESET primary_conninfo"; char *reset_primary_slot_name = "ALTER SYSTEM RESET primary_slot_name"; /* ALTER SYSTEM cannot run inside a transaction block */ if (!pgsql_execute(pgsql, reset_primary_conninfo)) { return false; } if (!pgsql_execute(pgsql, reset_primary_slot_name)) { return false; } return true; } /* * pgsql_reload_conf causes open sessions to reload the PostgreSQL configuration * files. */ bool pgsql_reload_conf(PGSQL *pgsql) { char *sql = "SELECT pg_reload_conf()"; log_info("Reloading Postgres configuration and HBA rules"); return pgsql_execute(pgsql, sql); } /* * pgsql_get_hba_file_path gets the value of the hba_file setting in * Postgres or returns false if a failure occurred. The value is copied to * the hbaFilePath pointer. 
*/
bool
pgsql_get_hba_file_path(PGSQL *pgsql, char *hbaFilePath, int maxPathLength)
{
	char *configValue = NULL;

	if (!pgsql_get_current_setting(pgsql, "hba_file", &configValue))
	{
		/* pgsql_get_current_setting logs a relevant error */
		return false;
	}

	int hbaFilePathLength = strlcpy(hbaFilePath, configValue, maxPathLength);

	/* single exit path: log on overflow, free the setting copy either way */
	bool fits = hbaFilePathLength < maxPathLength;

	if (!fits)
	{
		log_error("The hba_file \"%s\" returned by postgres is %d characters, "
				  "the maximum supported by pg_autoctl is %d characters",
				  configValue, hbaFilePathLength, maxPathLength);
	}

	free(configValue);

	return fits;
}


/*
 * pgsql_get_current_setting gets the value of a GUC in Postgres by running
 * SELECT current_setting($settingName), or returns false if a failure
 * occurred.
 *
 * If getting the value was successful, currentValue will point to a copy of
 * the value which should be freed by the caller.
 */
static bool
pgsql_get_current_setting(PGSQL *pgsql, char *settingName, char **currentValue)
{
	SingleValueResultContext context = { { 0 }, PGSQL_RESULT_STRING, false };
	char *sql = "SELECT current_setting($1)";

	int paramCount = 1;
	Oid paramTypes[1] = { TEXTOID };
	const char *paramValues[1] = { settingName };

	if (!pgsql_execute_with_params(pgsql, sql,
								   paramCount, paramTypes, paramValues,
								   &context, &parseSingleValueResult))
	{
		/* errors have already been logged */
		return false;
	}

	if (!context.parsedOk)
	{
		log_error("Failed to get result from current_setting('%s')",
				  settingName);
		return false;
	}

	/* ownership of the string copy transfers to the caller */
	*currentValue = context.strVal;

	return true;
}


/*
 * pgsql_create_database issues a CREATE DATABASE statement.
*/
bool
pgsql_create_database(PGSQL *pgsql, const char *dbname, const char *owner)
{
	char command[BUFSIZE];
	char *escapedDBName, *escapedOwner;

	/* open a connection upfront since it is needed by PQescape functions */
	PGconn *connection = pgsql_open_connection(pgsql);

	if (connection == NULL)
	{
		/* error message was logged in pgsql_open_connection */
		return false;
	}

	/* escape the dbname */
	escapedDBName = PQescapeIdentifier(connection, dbname, strlen(dbname));
	if (escapedDBName == NULL)
	{
		log_error("Failed to create database \"%s\": %s",
				  dbname, PQerrorMessage(connection));
		pgsql_finish(pgsql);
		return false;
	}

	/* escape the username */
	escapedOwner = PQescapeIdentifier(connection, owner, strlen(owner));
	if (escapedOwner == NULL)
	{
		log_error("Failed to create database \"%s\": %s",
				  dbname, PQerrorMessage(connection));
		PQfreemem(escapedDBName);
		pgsql_finish(pgsql);
		return false;
	}

	/* now build the SQL command */
	sformat(command, BUFSIZE,
			"CREATE DATABASE %s WITH OWNER %s",
			escapedDBName, escapedOwner);

	log_debug("Running command on Postgres: %s;", command);

	PQfreemem(escapedDBName);
	PQfreemem(escapedOwner);

	PGresult *result = PQexec(connection, command);

	if (!is_response_ok(result))
	{
		/*
		 * Check if we have a duplicate_database (42P04) error, in which case
		 * it means the database has already been created, accept that as a
		 * non-error, only inform about the situation.
		 *
		 * PQresultErrorField may return NULL (e.g. when the server closed the
		 * connection), so guard the strcmp and the log format argument
		 * against a NULL SQLSTATE.
		 */
		char *sqlstate = PQresultErrorField(result, PG_DIAG_SQLSTATE);

		if (sqlstate != NULL &&
			strcmp(sqlstate, ERRCODE_DUPLICATE_DATABASE) == 0)
		{
			log_info("The database \"%s\" already exists, skipping.", dbname);
		}
		else
		{
			log_error("Failed to create database \"%s\"[%s]: %s",
					  dbname,
					  sqlstate ? sqlstate : "",
					  PQerrorMessage(connection));

			PQclear(result);
			clear_results(pgsql);
			pgsql_finish(pgsql);
			return false;
		}
	}

	PQclear(result);
	clear_results(pgsql);

	if (pgsql->connectionStatementType == PGSQL_CONNECTION_SINGLE_STATEMENT)
	{
		pgsql_finish(pgsql);
	}

	return true;
}


/*
 * pgsql_create_extension issues a CREATE EXTENSION statement.
*/
bool
pgsql_create_extension(PGSQL *pgsql, const char *name)
{
	char command[BUFSIZE];

	/* open a connection upfront since it is needed by PQescape functions */
	PGconn *connection = pgsql_open_connection(pgsql);

	if (connection == NULL)
	{
		/* error message was logged in pgsql_open_connection */
		return false;
	}

	/* escape the extension name */
	char *escapedIdentifier =
		PQescapeIdentifier(connection, name, strlen(name));

	if (escapedIdentifier == NULL)
	{
		log_error("Failed to create extension \"%s\": %s",
				  name, PQerrorMessage(connection));
		pgsql_finish(pgsql);
		return false;
	}

	/* IF NOT EXISTS makes this command idempotent */
	sformat(command, BUFSIZE,
			"CREATE EXTENSION IF NOT EXISTS %s CASCADE", escapedIdentifier);

	PQfreemem(escapedIdentifier);

	log_debug("Running command on Postgres: %s;", command);

	PGresult *result = PQexec(connection, command);

	if (!is_response_ok(result))
	{
		/*
		 * PQresultErrorField may return NULL (e.g. when the connection was
		 * lost), so guard the log format argument against a NULL SQLSTATE.
		 */
		char *sqlstate = PQresultErrorField(result, PG_DIAG_SQLSTATE);

		log_error("Failed to create extension \"%s\"[%s]: %s",
				  name,
				  sqlstate ? sqlstate : "",
				  PQerrorMessage(connection));

		PQclear(result);
		clear_results(pgsql);
		pgsql_finish(pgsql);
		return false;
	}

	PQclear(result);
	clear_results(pgsql);

	if (pgsql->connectionStatementType == PGSQL_CONNECTION_SINGLE_STATEMENT)
	{
		pgsql_finish(pgsql);
	}

	return true;
}


/*
 * pgsql_create_user creates a user with the given settings.
 *
 * Unlike most functions this function does opens a connection itself
 * because it has some specific requirements around logging, error handling
 * and escaping.
*/
bool
pgsql_create_user(PGSQL *pgsql, const char *userName, const char *password,
				  bool login, bool superuser, bool replication, int connlimit)
{
	/* open a connection upfront since it is needed by PQescape functions */
	PGconn *connection = pgsql_open_connection(pgsql);

	if (connection == NULL)
	{
		/* error message was logged in pgsql_open_connection */
		return false;
	}

	/* escape the username */
	PQExpBuffer query = createPQExpBuffer();
	char *escapedIdentifier =
		PQescapeIdentifier(connection, userName, strlen(userName));

	if (escapedIdentifier == NULL)
	{
		log_error("Failed to create user \"%s\": %s",
				  userName, PQerrorMessage(connection));

		/* fix: the query buffer used to leak on this error path */
		destroyPQExpBuffer(query);
		pgsql_finish(pgsql);
		return false;
	}

	appendPQExpBuffer(query, "CREATE USER %s", escapedIdentifier);
	PQfreemem(escapedIdentifier);

	if (login || superuser || replication || password)
	{
		appendPQExpBufferStr(query, " WITH");
	}

	if (login)
	{
		appendPQExpBufferStr(query, " LOGIN");
	}

	if (superuser)
	{
		appendPQExpBufferStr(query, " SUPERUSER");
	}

	if (replication)
	{
		appendPQExpBufferStr(query, " REPLICATION");
	}

	if (connlimit > -1)
	{
		appendPQExpBuffer(query, " CONNECTION LIMIT %d", connlimit);
	}

	if (password)
	{
		/* show the statement before we append the password */
		log_debug("Running command on Postgres: %s PASSWORD '*****';",
				  query->data);

		escapedIdentifier =
			PQescapeLiteral(connection, password, strlen(password));

		if (escapedIdentifier == NULL)
		{
			/* no PQfreemem here: the escaped literal is NULL on failure */
			log_error("Failed to create user \"%s\": %s",
					  userName, PQerrorMessage(connection));
			pgsql_finish(pgsql);
			destroyPQExpBuffer(query);
			return false;
		}

		appendPQExpBuffer(query, " PASSWORD %s", escapedIdentifier);
		PQfreemem(escapedIdentifier);
	}
	else
	{
		log_debug("Running command on Postgres: %s;", query->data);
	}

	/* memory allocation could have failed while building string */
	if (PQExpBufferBroken(query))
	{
		log_error("Failed to allocate memory");
		destroyPQExpBuffer(query);
		pgsql_finish(pgsql);
		return false;
	}

	/*
	 * Set the libpq notice receiver to integrate notifications as debug
	 * message, because when dealing with the citus extension those messages
	 * are not that interesting to our pg_autoctl users frankly:
	 *
	 * NOTICE: not propagating CREATE ROLE/USER commands to worker nodes
	 * HINT: Connect to worker nodes directly...
	 */
	PQnoticeProcessor previousNoticeProcessor =
		PQsetNoticeProcessor(connection, &pgAutoCtlDebugNoticeProcessor, NULL);

	PGresult *result = PQexec(connection, query->data);
	destroyPQExpBuffer(query);

	if (!is_response_ok(result))
	{
		/*
		 * Check if we have a duplicate_object (42710) error, in which case
		 * it means the user has already been created, accept that as a
		 * non-error, only inform about the situation.
		 *
		 * PQresultErrorField may return NULL (e.g. when the connection was
		 * lost), so guard against a NULL SQLSTATE before dereferencing it.
		 */
		char *sqlstate = PQresultErrorField(result, PG_DIAG_SQLSTATE);

		if (sqlstate != NULL &&
			strcmp(sqlstate, ERRCODE_DUPLICATE_OBJECT) == 0)
		{
			log_info("The user \"%s\" already exists, skipping.", userName);
		}
		else
		{
			log_error("Failed to create user \"%s\"[%s]: %s",
					  userName,
					  sqlstate ? sqlstate : "",
					  PQerrorMessage(connection));

			PQclear(result);
			clear_results(pgsql);
			pgsql_finish(pgsql);
			return false;
		}
	}

	PQclear(result);
	clear_results(pgsql);

	if (pgsql->connectionStatementType == PGSQL_CONNECTION_SINGLE_STATEMENT)
	{
		pgsql_finish(pgsql);
	}
	else
	{
		/* restore the normal notice message processing, if needed. */
		PQsetNoticeProcessor(connection, previousNoticeProcessor, NULL);
	}

	return true;
}


/*
 * pgsql_has_replica returns whether a replica with the given username is
 * active.
 */
bool
pgsql_has_replica(PGSQL *pgsql, char *userName, bool *hasReplica)
{
	SingleValueResultContext context = { { 0 }, PGSQL_RESULT_BOOL, false };

	/*
	 * Check whether there is an entry in pg_stat_replication, which means
	 * there is either a pg_basebackup or streaming replica active. In either
	 * case, it means there is a replica that recently communicated with the
	 * postgres server, which is all we care about for the purpose of this
	 * function.
*/ char *sql = "SELECT EXISTS (SELECT 1 FROM pg_stat_replication WHERE usename = $1)"; const Oid paramTypes[1] = { TEXTOID }; const char *paramValues[1] = { userName }; int paramCount = 1; if (!pgsql_execute_with_params(pgsql, sql, paramCount, paramTypes, paramValues, &context, &parseSingleValueResult)) { /* errors have already been logged */ return false; } if (!context.parsedOk) { log_error("Failed to find pg_stat_replication"); return false; } *hasReplica = context.boolVal; return true; } /* * hostname_from_uri parses a PostgreSQL connection string URI and returns * whether the URL was successfully parsed. */ bool hostname_from_uri(const char *pguri, char *hostname, int maxHostLength, int *port) { int found = 0; char *errmsg; PQconninfoOption *conninfo, *option; conninfo = PQconninfoParse(pguri, &errmsg); if (conninfo == NULL) { log_error("Failed to parse pguri \"%s\": %s", pguri, errmsg); PQfreemem(errmsg); return false; } for (option = conninfo; option->keyword != NULL; option++) { if (strcmp(option->keyword, "host") == 0 || strcmp(option->keyword, "hostaddr") == 0) { if (option->val) { int hostNameLength = strlcpy(hostname, option->val, maxHostLength); if (hostNameLength >= maxHostLength) { log_error( "The URL \"%s\" contains a hostname of %d characters, " "the maximum supported by pg_autoctl is %d characters", option->val, hostNameLength, maxHostLength); PQconninfoFree(conninfo); return false; } ++found; } } if (strcmp(option->keyword, "port") == 0) { if (option->val) { /* we expect a single port number in a monitor's URI */ if (!stringToInt(option->val, port)) { log_error("Failed to parse port number : %s", option->val); PQconninfoFree(conninfo); return false; } ++found; } else { *port = POSTGRES_PORT; } } if (found == 2) { break; } } PQconninfoFree(conninfo); return true; } /* * validate_connection_string takes a connection string and parses it with * libpq, varifying that it's well formed and usable. 
*/ bool validate_connection_string(const char *connectionString) { char *errorMessage = NULL; int length = strlen(connectionString); if (length >= MAXCONNINFO) { log_error("Connection string \"%s\" is %d " "characters, the maximum supported by pg_autoctl is %d", connectionString, length, MAXCONNINFO); return false; } PQconninfoOption *connInfo = PQconninfoParse(connectionString, &errorMessage); if (connInfo == NULL) { log_error("Failed to parse connection string \"%s\": %s ", connectionString, errorMessage); PQfreemem(errorMessage); return false; } PQconninfoFree(connInfo); return true; } /* * pgsql_get_postgres_metadata returns several bits of information that we need * to take decisions in the rest of the code: * * - pg_is_in_recovery (primary or standby, as expected?) * - sync_state from pg_stat_replication when a primary * - current_lsn from the server * - pg_control_version * - catalog_version_no * - system_identifier * * With those metadata we can then check our expectations and take decisions in * some cases. We can obtain all the metadata that we need easily enough in a * single SQL query, so that's what we do. */ typedef struct PgMetadata { char sqlstate[6]; bool parsedOk; bool pg_is_in_recovery; char syncState[PGSR_SYNC_STATE_MAXLENGTH]; char currentLSN[PG_LSN_MAXLENGTH]; PostgresControlData control; } PgMetadata; bool pgsql_get_postgres_metadata(PGSQL *pgsql, bool *pg_is_in_recovery, char *pgsrSyncState, char *currentLSN, PostgresControlData *control) { PgMetadata context = { 0 }; /* *INDENT-OFF* */ char *sql = /* * Make it so that we still have the current WAL LSN even in the case * where there's no replication slot in use by any standby. * * When on the primary, we might have multiple standby nodes connected. * We're good when at least one of them is either 'sync' or 'quorum'. * We don't check individual replication slots, we take the "best" one * and report that. 
*/ "select pg_is_in_recovery()," " coalesce(rep.sync_state, '') as sync_state," " case when pg_is_in_recovery()" " then coalesce(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())" " else pg_current_wal_flush_lsn()" " end as current_lsn," " pg_control_version, catalog_version_no, system_identifier," " case when pg_is_in_recovery()" " then (select received_tli from pg_stat_wal_receiver)" " else (select timeline_id from pg_control_checkpoint()) " " end as timeline_id " " from (values(1)) as dummy" " full outer join" " (select pg_control_version, catalog_version_no, system_identifier " " from pg_control_system()" " )" " as control on true" " full outer join" " (" " select sync_state" " from pg_replication_slots slot" " join pg_stat_replication rep" " on rep.pid = slot.active_pid" " where slot_name ~ '" REPLICATION_SLOT_NAME_PATTERN "' " " or slot_name = '" REPLICATION_SLOT_NAME_DEFAULT "' " "order by case sync_state " " when 'quorum' then 4 " " when 'sync' then 3 " " when 'potential' then 2 " " when 'async' then 1 " " else 0 end " " desc limit 1" " ) " "as rep on true"; /* *INDENT-ON* */ if (!pgsql_execute_with_params(pgsql, sql, 0, NULL, NULL, &context, &parsePgMetadata)) { /* errors have been logged already */ return false; } if (!context.parsedOk) { log_error("Failed to parse the Postgres metadata"); return false; } *pg_is_in_recovery = context.pg_is_in_recovery; /* the last two metadata items are opt-in */ if (pgsrSyncState != NULL) { strlcpy(pgsrSyncState, context.syncState, PGSR_SYNC_STATE_MAXLENGTH); } if (currentLSN != NULL) { strlcpy(currentLSN, context.currentLSN, PG_LSN_MAXLENGTH); } /* overwrite the Control Data fetched from the query */ *control = context.control; pgsql_finish(pgsql); return true; } /* * parsePgMetadata parses the result from a PostgreSQL query fetching * two columns from pg_stat_replication: sync_state and currentLSN. 
*/
static void
parsePgMetadata(void *ctx, PGresult *result)
{
	PgMetadata *context = (PgMetadata *) ctx;
	char *value;

	if (PQnfields(result) != 7)
	{
		log_error("Query returned %d columns, expected 7", PQnfields(result));
		context->parsedOk = false;
		return;
	}

	if (PQntuples(result) != 1)
	{
		log_error("Query returned %d rows, expected 1", PQntuples(result));
		context->parsedOk = false;
		return;
	}

	context->pg_is_in_recovery = strcmp(PQgetvalue(result, 0, 0), "t") == 0;

	/* sync_state is NULL when there's no standby connected */
	if (!PQgetisnull(result, 0, 1))
	{
		value = PQgetvalue(result, 0, 1);

		strlcpy(context->syncState, value, PGSR_SYNC_STATE_MAXLENGTH);
	}
	else
	{
		context->syncState[0] = '\0';
	}

	if (!PQgetisnull(result, 0, 2))
	{
		value = PQgetvalue(result, 0, 2);

		strlcpy(context->currentLSN, value, PG_LSN_MAXLENGTH);
	}
	else
	{
		context->currentLSN[0] = '\0';
	}

	value = PQgetvalue(result, 0, 3);

	if (!stringToUInt(value, &(context->control.pg_control_version)))
	{
		log_error("Failed to parse pg_control_version \"%s\"", value);

		/* BUGFIX: report the parse failure instead of claiming success */
		context->parsedOk = false;
		return;
	}

	value = PQgetvalue(result, 0, 4);

	if (!stringToUInt(value, &(context->control.catalog_version_no)))
	{
		log_error("Failed to parse catalog_version_no \"%s\"", value);

		/* BUGFIX: report the parse failure instead of claiming success */
		context->parsedOk = false;
		return;
	}

	value = PQgetvalue(result, 0, 5);

	if (!stringToUInt64(value, &(context->control.system_identifier)))
	{
		log_error("Failed to parse system_identifier \"%s\"", value);

		/* BUGFIX: report the parse failure instead of claiming success */
		context->parsedOk = false;
		return;
	}

	/*
	 * On a standby node that doesn't have a primary_conninfo then we fail to
	 * retrieve the received_tli from pg_stat_wal_receiver. We encode the NULL
	 * we get in that case with a zero, which is not a value we expect.
	 */
	if (PQgetisnull(result, 0, 6))
	{
		context->control.timeline_id = 0;
	}
	else
	{
		value = PQgetvalue(result, 0, 6);

		if (!stringToUInt(value, &(context->control.timeline_id)))
		{
			log_error("Failed to parse timeline_id \"%s\"", value);

			/* BUGFIX: report the parse failure instead of claiming success */
			context->parsedOk = false;
			return;
		}
	}

	context->parsedOk = true;
}


typedef struct PgReachedTargetLSN
{
	char sqlstate[6];  /* SQL STATE from the server, on error */
	bool parsedOk;     /* set to true when the row could be parsed */
	bool hasReachedLSN;
	char currentLSN[PG_LSN_MAXLENGTH];
	bool noRows;       /* true when the query returned zero rows */
} PgReachedTargetLSN;


/*
 * pgsql_one_slot_has_reached_target_lsn checks that at least one replication
 * slot has reached the given LSN already, using the Postgres system views
 * pg_replication_slots and pg_stat_replication on the primary server.
 */
bool
pgsql_one_slot_has_reached_target_lsn(PGSQL *pgsql,
									  char *targetLSN,
									  char *currentLSN,
									  bool *hasReachedLSN)
{
	PgReachedTargetLSN context = { 0 };

	/*
	 * We pick the most advanced LSN reached by the pgautofailover replication
	 * slots, and only consider those that have made it to "sync" or "quorum"
	 * sync_state already. This function is typically called after sync rep has
	 * been enabled on the primary.
*/ /* *INDENT-OFF* */ char *sql = " select $1::pg_lsn <= flush_lsn, flush_lsn " " from pg_replication_slots slot" " join pg_stat_replication rep" " on rep.pid = slot.active_pid" " where ( slot_name ~ '" REPLICATION_SLOT_NAME_PATTERN "' " " or slot_name = '" REPLICATION_SLOT_NAME_DEFAULT "') " " and sync_state in ('sync', 'quorum') " "order by flush_lsn desc limit 1"; /* *INDENT-ON* */ const Oid paramTypes[1] = { LSNOID }; const char *paramValues[1] = { targetLSN }; if (!pgsql_execute_with_params(pgsql, sql, 1, paramTypes, paramValues, &context, &parsePgReachedTargetLSN)) { /* errors have been logged already */ return false; } if (!context.parsedOk) { if (context.noRows) { log_warn("No standby nodes are connected at the moment"); } else { log_error("Failed to fetch current flush_lsn location for " "connected standby nodes, see above for details"); } return false; } *hasReachedLSN = context.hasReachedLSN; strlcpy(currentLSN, context.currentLSN, PG_LSN_MAXLENGTH); return true; } /* * pgsql_has_reached_target_lsn calls pg_last_wal_replay_lsn() and compares the * current LSN on the system to the given targetLSN. */ bool pgsql_has_reached_target_lsn(PGSQL *pgsql, char *targetLSN, char *currentLSN, bool *hasReachedLSN) { PgReachedTargetLSN context = { 0 }; char *sql = "SELECT $1::pg_lsn <= pg_last_wal_replay_lsn(), " " pg_last_wal_replay_lsn()"; const Oid paramTypes[1] = { LSNOID }; const char *paramValues[1] = { targetLSN }; if (!pgsql_execute_with_params(pgsql, sql, 1, paramTypes, paramValues, &context, &parsePgReachedTargetLSN)) { /* errors have been logged already */ return false; } if (!context.parsedOk) { log_error("Failed to get result from pg_last_wal_replay_lsn()"); return false; } *hasReachedLSN = context.hasReachedLSN; strlcpy(currentLSN, context.currentLSN, PG_LSN_MAXLENGTH); return true; } /* * parsePgMetadata parses the result from a PostgreSQL query fetching * two columns from pg_stat_replication: sync_state and currentLSN. 
*/ static void parsePgReachedTargetLSN(void *ctx, PGresult *result) { PgReachedTargetLSN *context = (PgReachedTargetLSN *) ctx; if (PQnfields(result) != 2) { log_error("Query returned %d columns, expected 2", PQnfields(result)); context->parsedOk = false; return; } if (PQntuples(result) == 0) { log_debug("parsePgReachedTargetLSN: query returned no rows"); context->parsedOk = false; context->noRows = true; return; } if (PQntuples(result) != 1) { log_error("Query returned %d rows, expected 1", PQntuples(result)); context->parsedOk = false; return; } context->hasReachedLSN = strcmp(PQgetvalue(result, 0, 0), "t") == 0; if (!PQgetisnull(result, 0, 1)) { char *value = PQgetvalue(result, 0, 1); strlcpy(context->currentLSN, value, PG_LSN_MAXLENGTH); } else { context->currentLSN[0] = '\0'; } context->parsedOk = true; } typedef struct IdentifySystemResult { char sqlstate[6]; bool parsedOk; IdentifySystem *system; } IdentifySystemResult; typedef struct TimelineHistoryResult { char sqlstate[6]; bool parsedOk; char filename[MAXPGPATH]; char content[BUFSIZE * BUFSIZE]; /* 1MB should get us quite very far */ } TimelineHistoryResult; /* * pgsql_identify_system connects to the given pgsql client and issue the * replication command IDENTIFY_SYSTEM. The pgsql connection string should * contain the 'replication=1' parameter. 
 */
bool
pgsql_identify_system(PGSQL *pgsql, IdentifySystem *system)
{
	PGconn *connection = pgsql_open_connection(pgsql);

	if (connection == NULL)
	{
		/* error message was logged in pgsql_open_connection */
		return false;
	}

	/* extended query protocol not supported in a replication connection */
	PGresult *result = PQexec(connection, "IDENTIFY_SYSTEM");

	if (!is_response_ok(result))
	{
		log_error("Failed to IDENTIFY_SYSTEM: %s", PQerrorMessage(connection));
		PQclear(result);
		clear_results(pgsql);

		PQfinish(connection);

		return false;
	}

	IdentifySystemResult isContext = { { 0 }, false, system };

	(void) parseIdentifySystemResult((void *) &isContext, result);

	PQclear(result);
	clear_results(pgsql);

	/*
	 * NOTE(review): this is logged before checking isContext.parsedOk, so
	 * the values printed here may be unset when the parsing failed.
	 */
	log_debug("IDENTIFY_SYSTEM: timeline %d, xlogpos %s, systemid %" PRIu64,
			  system->timeline,
			  system->xlogpos,
			  system->identifier);

	if (!isContext.parsedOk)
	{
		log_error("Failed to get result from IDENTIFY_SYSTEM");
		PQfinish(connection);
		return false;
	}

	/* while at it, we also run the TIMELINE_HISTORY command */
	if (system->timeline > 1)
	{
		TimelineHistoryResult hContext = { 0 };

		char sql[BUFSIZE] = { 0 };
		sformat(sql, sizeof(sql), "TIMELINE_HISTORY %d", system->timeline);

		result = PQexec(connection, sql);

		if (!is_response_ok(result))
		{
			log_error("Failed to request TIMELINE_HISTORY: %s",
					  PQerrorMessage(connection));
			PQclear(result);
			clear_results(pgsql);

			PQfinish(connection);

			return false;
		}

		(void) parseTimelineHistoryResult((void *) &hContext, result);

		PQclear(result);
		clear_results(pgsql);

		if (!hContext.parsedOk)
		{
			log_error("Failed to get result from TIMELINE_HISTORY");
			PQfinish(connection);
			return false;
		}

		/* fill-in system->timelines from the history file contents */
		if (!parseTimeLineHistory(hContext.filename, hContext.content, system))
		{
			/* errors have already been logged */
			PQfinish(connection);
			return false;
		}

		/* the last history entry is the current timeline */
		TimeLineHistoryEntry *current =
			&(system->timelines.history[system->timelines.count - 1]);

		log_debug("TIMELINE_HISTORY: \"%s\", timeline %d started at %X/%X",
				  hContext.filename,
				  current->tli,
				  (uint32_t) (current->begin >> 32),
				  (uint32_t) current->begin);
	}

	/* now we're done with running SQL queries */
	PQfinish(connection);

	return true;
}


/*
 * parseIdentifySystemResult parses the result of the IDENTIFY_SYSTEM
 * replication command: a single row of four columns, systemid, timeline,
 * xlogpos, and dbname.
 */
static void
parseIdentifySystemResult(void *ctx, PGresult *result)
{
	IdentifySystemResult *context = (IdentifySystemResult *) ctx;

	if (PQnfields(result) != 4)
	{
		log_error("Query returned %d columns, expected 4", PQnfields(result));
		context->parsedOk = false;
		return;
	}

	if (PQntuples(result) == 0)
	{
		log_debug("parseIdentifySystem: query returned no rows");
		context->parsedOk = false;
		return;
	}

	if (PQntuples(result) != 1)
	{
		log_error("Query returned %d rows, expected 1", PQntuples(result));
		context->parsedOk = false;
		return;
	}

	/* systemid (text) */
	char *value = PQgetvalue(result, 0, 0);

	if (!stringToUInt64(value, &(context->system->identifier)))
	{
		log_error("Failed to parse system_identifier \"%s\"", value);
		context->parsedOk = false;
		return;
	}

	/* timeline (int4) */
	value = PQgetvalue(result, 0, 1);

	if (!stringToUInt32(value, &(context->system->timeline)))
	{
		log_error("Failed to parse timeline \"%s\"", value);
		context->parsedOk = false;
		return;
	}

	/* xlogpos (text) */
	value = PQgetvalue(result, 0, 2);
	strlcpy(context->system->xlogpos, value, PG_LSN_MAXLENGTH);

	/* dbname (text) Database connected to or null */
	if (!PQgetisnull(result, 0, 3))
	{
		value = PQgetvalue(result, 0, 3);
		strlcpy(context->system->dbname, value, NAMEDATALEN);
	}

	context->parsedOk = true;
}


/*
 * parseTimelineHistoryResult parses the result of the TIMELINE_HISTORY
 * replication command.
*/ static void parseTimelineHistoryResult(void *ctx, PGresult *result) { TimelineHistoryResult *context = (TimelineHistoryResult *) ctx; if (PQnfields(result) != 2) { log_error("Query returned %d columns, expected 2", PQnfields(result)); context->parsedOk = false; return; } if (PQntuples(result) == 0) { log_debug("parseTimelineHistory: query returned no rows"); context->parsedOk = false; return; } if (PQntuples(result) != 1) { log_error("Query returned %d rows, expected 1", PQntuples(result)); context->parsedOk = false; return; } /* filename (text) */ char *value = PQgetvalue(result, 0, 0); strlcpy(context->filename, value, sizeof(context->filename)); /* content (bytea) */ value = PQgetvalue(result, 0, 1); if (strlen(value) >= sizeof(context->content)) { log_error("Received a timeline history file of %lu bytes, " "pg_autoctl is limited to files of up to %lu bytes.", (unsigned long) strlen(value), (unsigned long) sizeof(context->content)); context->parsedOk = false; } strlcpy(context->content, value, sizeof(context->content)); context->parsedOk = true; } /* * parseTimeLineHistory parses the content of a timeline history file. 
*/
bool
parseTimeLineHistory(const char *filename, const char *content,
					 IdentifySystem *system)
{
	char *historyLines[BUFSIZE] = { 0 };
	int lineCount = splitLines((char *) content, historyLines, BUFSIZE);
	int lineNumber = 0;

	if (lineCount >= PG_AUTOCTL_MAX_TIMELINES)
	{
		log_error("history file \"%s\" contains %d lines, "
				  "pg_autoctl only supports up to %d lines",
				  filename, lineCount, PG_AUTOCTL_MAX_TIMELINES - 1);
		return false;
	}

	uint64_t prevend = InvalidXLogRecPtr;

	system->timelines.count = 0;

	TimeLineHistoryEntry *entry =
		&(system->timelines.history[system->timelines.count]);

	for (lineNumber = 0; lineNumber < lineCount; lineNumber++)
	{
		char *ptr = historyLines[lineNumber];

		/* skip leading whitespace and check for # comment */
		for (; *ptr; ptr++)
		{
			if (!isspace((unsigned char) *ptr))
			{
				break;
			}
		}

		if (*ptr == '\0' || *ptr == '#')
		{
			continue;
		}

		log_trace("parseTimeLineHistory line %d is \"%s\"",
				  lineNumber,
				  historyLines[lineNumber]);

		/*
		 * BUGFIX: search for the tab separator from ptr (after the leading
		 * whitespace has been skipped), not from the raw start of the line,
		 * so lines with leading whitespace parse too.
		 */
		char *tabptr = strchr(ptr, '\t');

		if (tabptr == NULL)
		{
			log_error("Failed to parse history file line %d: \"%s\"",
					  lineNumber, ptr);
			return false;
		}

		*tabptr = '\0';

		if (!stringToUInt(ptr, &(entry->tli)))
		{
			/*
			 * BUGFIX: print the string we failed to parse. The code used to
			 * pass tabptr here, which points at the NUL byte we just wrote
			 * and thus always printed an empty string.
			 */
			log_error("Failed to parse history timeline \"%s\"", ptr);
			return false;
		}

		char *lsn = tabptr + 1;

		/* truncate the LSN at the first non-hex, non-'/' character */
		for (char *lsnend = lsn; *lsnend; lsnend++)
		{
			if (!(isxdigit((unsigned char) *lsnend) || *lsnend == '/'))
			{
				*lsnend = '\0';
				break;
			}
		}

		if (!parseLSN(lsn, &(entry->end)))
		{
			log_error("Failed to parse history timeline %d LSN \"%s\"",
					  entry->tli, lsn);
			return false;
		}

		/* each timeline begins where the previous one ended */
		entry->begin = prevend;
		prevend = entry->end;

		log_trace("parseTimeLineHistory[%d]: tli %d [%X/%X %X/%X]",
				  system->timelines.count,
				  entry->tli,
				  (uint32) (entry->begin >> 32),
				  (uint32) entry->begin,
				  (uint32) (entry->end >> 32),
				  (uint32) entry->end);

		entry = &(system->timelines.history[++system->timelines.count]);
	}

	/*
	 * Create one more entry for the "tip" of the timeline, which has no entry
	 * in the history file.
	 */
	entry->tli = system->timeline;
	entry->begin = prevend;
	entry->end = InvalidXLogRecPtr;

	log_trace("parseTimeLineHistory[%d]: tli %d [%X/%X %X/%X]",
			  system->timelines.count,
			  entry->tli,
			  (uint32) (entry->begin >> 32),
			  (uint32) entry->begin,
			  (uint32) (entry->end >> 32),
			  (uint32) entry->end);

	/* fix the off-by-one so that the count is a count, not an index */
	++system->timelines.count;

	return true;
}


/*
 * LISTEN/NOTIFY support.
 *
 * First, send a LISTEN command.
 */
bool
pgsql_listen(PGSQL *pgsql, char *channels[])
{
	PGresult *result = NULL;
	char sql[BUFSIZE];

	/*
	 * mark the connection as multi statement since it is going to be used by
	 * for processing notifications
	 */
	pgsql->connectionStatementType = PGSQL_CONNECTION_MULTI_STATEMENT;

	/* open a connection upfront since it is needed by PQescape functions */
	PGconn *connection = pgsql_open_connection(pgsql);

	if (connection == NULL)
	{
		/* error message was logged in pgsql_open_connection */
		return false;
	}

	for (int i = 0; channels[i]; i++)
	{
		/* channel names are identifiers and must be escaped as such */
		char *channel =
			PQescapeIdentifier(connection, channels[i], strlen(channels[i]));

		if (channel == NULL)
		{
			log_error("Failed to LISTEN \"%s\": %s",
					  channels[i], PQerrorMessage(connection));
			pgsql_finish(pgsql);
			return false;
		}

		sformat(sql, BUFSIZE, "LISTEN %s", channel);

		PQfreemem(channel);

		result = PQexec(connection, sql);

		if (!is_response_ok(result))
		{
			log_error("Failed to LISTEN \"%s\": %s",
					  channels[i], PQerrorMessage(connection));
			PQclear(result);
			clear_results(pgsql);

			return false;
		}

		PQclear(result);
		clear_results(pgsql);
	}

	return true;
}


/*
 * Prepare a multi statement connection which can later be used in wait for
 * notification functions.
 *
 * Contrary to pgsql_listen, this function only prepares the connection and it
 * is the user's responsibility to define which channels to listen to.
*/ bool pgsql_prepare_to_wait(PGSQL *pgsql) { /* * mark the connection as multi statement since it is going to be used by * for processing notifications */ pgsql->connectionStatementType = PGSQL_CONNECTION_MULTI_STATEMENT; /* open a connection upfront since it is needed by PQescape functions */ PGconn *connection = pgsql_open_connection(pgsql); if (connection == NULL) { /* error message was logged in pgsql_open_connection */ return false; } return true; } /* * pgsql_alter_extension_update_to executes ALTER EXTENSION ... UPDATE TO ... */ bool pgsql_alter_extension_update_to(PGSQL *pgsql, const char *extname, const char *version) { char command[BUFSIZE]; char *escapedIdentifier, *escapedVersion; /* open a connection upfront since it is needed by PQescape functions */ PGconn *connection = pgsql_open_connection(pgsql); if (connection == NULL) { /* error message was logged in pgsql_open_connection */ return false; } /* escape the extname */ escapedIdentifier = PQescapeIdentifier(connection, extname, strlen(extname)); if (escapedIdentifier == NULL) { log_error("Failed to update extension \"%s\": %s", extname, PQerrorMessage(connection)); pgsql_finish(pgsql); return false; } /* escape the version */ escapedVersion = PQescapeIdentifier(connection, version, strlen(version)); if (escapedIdentifier == NULL) { log_error("Failed to update extension \"%s\" to version \"%s\": %s", extname, version, PQerrorMessage(connection)); pgsql_finish(pgsql); return false; } /* now build the SQL command */ int n = sformat(command, BUFSIZE, "ALTER EXTENSION %s UPDATE TO %s", escapedIdentifier, escapedVersion); if (n >= BUFSIZE) { log_error("BUG: pg_autoctl only supports SQL string up to %d bytes, " "a SQL string of %d bytes is needed to " "update the \"%s\" extension.", BUFSIZE, n, extname); } PQfreemem(escapedIdentifier); PQfreemem(escapedVersion); log_debug("Running command on Postgres: %s;", command); PGresult *result = PQexec(connection, command); if (!is_response_ok(result)) { char 
*sqlstate = PQresultErrorField(result, PG_DIAG_SQLSTATE); log_error("Error %s while running Postgres query: %s:", sqlstate, command); char *message = PQerrorMessage(connection); char *errorLines[BUFSIZE]; int lineCount = splitLines(message, errorLines, BUFSIZE); int lineNumber = 0; for (lineNumber = 0; lineNumber < lineCount; lineNumber++) { log_error("%s", errorLines[lineNumber]); } PQclear(result); clear_results(pgsql); pgsql_finish(pgsql); return false; } PQclear(result); clear_results(pgsql); return true; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/pgsql.h000066400000000000000000000301721414244367200220570ustar00rootroot00000000000000/* * src/bin/pg_autoctl/pgsql.h * Functions for interacting with a postgres server * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef PGSQL_H #define PGSQL_H #include #include #include "libpq-fe.h" #include "portability/instr_time.h" #include "defaults.h" #include "pgsetup.h" #include "state.h" /* * OID values from PostgreSQL src/include/catalog/pg_type.h */ #define BOOLOID 16 #define NAMEOID 19 #define INT4OID 23 #define INT8OID 20 #define TEXTOID 25 #define LSNOID 3220 /* * Maximum connection info length as used in walreceiver.h */ #define MAXCONNINFO 1024 /* * pg_stat_replication.sync_state is one if: * sync, async, quorum, potential */ #define PGSR_SYNC_STATE_MAXLENGTH 10 /* * We receive a list of "other nodes" from the monitor, and we store that list * in local memory. We pre-allocate the memory storage, and limit how many node * addresses we can handle because of the pre-allocation strategy. */ #define NODE_ARRAY_MAX_COUNT 12 /* abstract representation of a Postgres server that we can connect to */ typedef enum { PGSQL_CONN_LOCAL = 0, PGSQL_CONN_MONITOR, PGSQL_CONN_COORDINATOR, PGSQL_CONN_UPSTREAM, PGSQL_CONN_APP } ConnectionType; /* * Retry policy to follow when we fail to connect to a Postgres URI. 
* * In almost all the code base the retry mechanism is implemented in the main * loop so we want to fail fast and let the main loop handle the connection * retry and the different network timeouts that we have, including the network * partition detection timeout. * * In the initialisation code path though, pg_autoctl might be launched from * provisioning script on a set of nodes in parallel, and in that case we need * to secure a connection and implement a retry policy at the point in the code * where we open a connection, so that it's transparent to the caller. * * When we do retry connecting, we implement an Exponential Backoff with * Decorrelated Jitter algorithm as proven useful in the following article: * * https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ */ typedef struct ConnectionRetryPolicy { int maxT; /* maximum time spent retrying (seconds) */ int maxR; /* maximum number of retries, might be zero */ int maxSleepTime; /* in millisecond, used to cap sleepTime */ int baseSleepTime; /* in millisecond, base time to sleep for */ int sleepTime; /* in millisecond, time waited for last round */ instr_time startTime; /* time of the first attempt */ instr_time connectTime; /* time of successful connection */ int attempts; /* how many attempts have been made so far */ } ConnectionRetryPolicy; /* * Denote if the connetion is going to be used for one, or multiple statements. * This is used by psql_* functions to know if a connection is to be closed * after successful completion, or if the the connection is to be maintained * open for further queries. * * A common use case for maintaining a connection open, is while wishing to open * and maintain a transaction block. Another, is while listening for events. 
*/ typedef enum { PGSQL_CONNECTION_SINGLE_STATEMENT = 0, PGSQL_CONNECTION_MULTI_STATEMENT } ConnectionStatementType; /* * Allow higher level code to distinguish between failure to connect to the * target Postgres service and failure to run a query or obtain the expected * result. To that end we expose PQstatus() of the connection. * * We don't use the same enum values as in libpq because we want to have the * unknown value when we didn't try to connect yet. */ typedef enum { PG_CONNECTION_UNKNOWN = 0, PG_CONNECTION_OK, PG_CONNECTION_BAD } PGConnStatus; /* notification processing */ typedef bool (*ProcessNotificationFunction)(int notificationGroupId, int64_t notificationNodeId, char *channel, char *payload); typedef struct PGSQL { ConnectionType connectionType; ConnectionStatementType connectionStatementType; char connectionString[MAXCONNINFO]; PGconn *connection; ConnectionRetryPolicy retryPolicy; PGConnStatus status; ProcessNotificationFunction notificationProcessFunction; int notificationGroupId; int64_t notificationNodeId; bool notificationReceived; } PGSQL; /* PostgreSQL ("Grand Unified Configuration") setting */ typedef struct GUC { char *name; char *value; } GUC; /* network address of a node in an HA group */ typedef struct NodeAddress { int64_t nodeId; char name[_POSIX_HOST_NAME_MAX]; char host[_POSIX_HOST_NAME_MAX]; int port; int tli; char lsn[PG_LSN_MAXLENGTH]; bool isPrimary; } NodeAddress; typedef struct NodeAddressArray { int count; NodeAddress nodes[NODE_ARRAY_MAX_COUNT]; } NodeAddressArray; /* * TimeLineHistoryEntry is taken from Postgres definitions and adapted to * client-size code where we don't have all the necessary infrastruture. In * particular we don't define a XLogRecPtr data type nor do we define a * TimeLineID data type. * * Zero is used indicate an invalid pointer. Bootstrap skips the first possible * WAL segment, initializing the first WAL page at WAL segment size, so no XLOG * record can begin at zero. 
*/ #define InvalidXLogRecPtr 0 #define XLogRecPtrIsInvalid(r) ((r) == InvalidXLogRecPtr) #define PG_AUTOCTL_MAX_TIMELINES 1024 typedef struct TimeLineHistoryEntry { uint32_t tli; uint64_t begin; /* inclusive */ uint64_t end; /* exclusive, InvalidXLogRecPtr means infinity */ } TimeLineHistoryEntry; typedef struct TimeLineHistory { int count; TimeLineHistoryEntry history[PG_AUTOCTL_MAX_TIMELINES]; } TimeLineHistory; /* * The IdentifySystem contains information that is parsed from the * IDENTIFY_SYSTEM replication command, and then the TIMELINE_HISTORY result. */ typedef struct IdentifySystem { uint64_t identifier; uint32_t timeline; char xlogpos[PG_LSN_MAXLENGTH]; char dbname[NAMEDATALEN]; TimeLineHistory timelines; } IdentifySystem; /* * The replicationSource structure is used to pass the bits of a connection * string to the primary node around in several function calls. All the * information stored in there must fit in a connection string, so MAXCONNINFO * is a good proxy for their maximum size. */ typedef struct ReplicationSource { NodeAddress primaryNode; char userName[NAMEDATALEN]; char slotName[MAXCONNINFO]; char password[MAXCONNINFO]; char maximumBackupRate[MAXIMUM_BACKUP_RATE_LEN]; char backupDir[MAXCONNINFO]; char applicationName[MAXCONNINFO]; char targetLSN[PG_LSN_MAXLENGTH]; char targetAction[NAMEDATALEN]; char targetTimeline[NAMEDATALEN]; SSLOptions sslOptions; IdentifySystem system; } ReplicationSource; /* * Arrange a generic way to parse PostgreSQL result from a query. Most of the * queries we need here return a single row of a single column, so that's what * the default context and parsing allows for. 
*/ /* callback for parsing query results */ typedef void (ParsePostgresResultCB)(void *context, PGresult *result); typedef enum { PGSQL_RESULT_BOOL = 1, PGSQL_RESULT_INT, PGSQL_RESULT_BIGINT, PGSQL_RESULT_STRING } QueryResultType; /* * As a way to communicate the SQL STATE when an error occurs, every * pgsql_execute_with_params context structure must have the same first field, * an array of 5 characters (plus '\0' at the end). */ #define SQLSTATE_LENGTH 6 #define STR_ERRCODE_CLASS_CONNECTION_EXCEPTION "08" typedef struct AbstractResultContext { char sqlstate[SQLSTATE_LENGTH]; } AbstractResultContext; /* data structure for keeping a single-value query result */ typedef struct SingleValueResultContext { char sqlstate[SQLSTATE_LENGTH]; QueryResultType resultType; bool parsedOk; int ntuples; bool boolVal; int intVal; uint64_t bigint; char *strVal; } SingleValueResultContext; #define CHECK__SETTINGS_SQL \ "select bool_and(ok) " \ "from (" \ "select current_setting('max_wal_senders')::int >= 12" \ " union all " \ "select current_setting('max_replication_slots')::int >= 12" \ " union all " \ "select current_setting('wal_level') in ('replica', 'logical')" \ " union all " \ "select current_setting('wal_log_hints') = 'on'" #define CHECK_POSTGRESQL_NODE_SETTINGS_SQL \ CHECK__SETTINGS_SQL \ ") as t(ok) " #define CHECK_CITUS_NODE_SETTINGS_SQL \ CHECK__SETTINGS_SQL \ " union all " \ "select lib = 'citus' " \ "from unnest(string_to_array(" \ "current_setting('shared_preload_libraries'), ',') " \ " || array['not citus']) " \ "with ordinality ast(lib, n) where n = 1" \ ") as t(ok) " bool pgsql_init(PGSQL *pgsql, char *url, ConnectionType connectionType); void pgsql_set_retry_policy(ConnectionRetryPolicy *retryPolicy, int maxT, int maxR, int maxSleepTime, int baseSleepTime); void pgsql_set_main_loop_retry_policy(ConnectionRetryPolicy *retryPolicy); void pgsql_set_init_retry_policy(ConnectionRetryPolicy *retryPolicy); void pgsql_set_interactive_retry_policy(ConnectionRetryPolicy 
*retryPolicy); void pgsql_set_monitor_interactive_retry_policy(ConnectionRetryPolicy *retryPolicy); int pgsql_compute_connection_retry_sleep_time(ConnectionRetryPolicy *retryPolicy); bool pgsql_retry_policy_expired(ConnectionRetryPolicy *retryPolicy); void pgsql_finish(PGSQL *pgsql); void parseSingleValueResult(void *ctx, PGresult *result); void fetchedRows(void *ctx, PGresult *result); bool pgsql_begin(PGSQL *pgsql); bool pgsql_commit(PGSQL *pgsql); bool pgsql_rollback(PGSQL *pgsql); bool pgsql_execute(PGSQL *pgsql, const char *sql); bool pgsql_execute_with_params(PGSQL *pgsql, const char *sql, int paramCount, const Oid *paramTypes, const char **paramValues, void *parseContext, ParsePostgresResultCB *parseFun); bool pgsql_check_postgresql_settings(PGSQL *pgsql, bool isCitusInstanceKind, bool *settings_are_ok); bool pgsql_check_monitor_settings(PGSQL *pgsql, bool *settings_are_ok); bool pgsql_is_in_recovery(PGSQL *pgsql, bool *is_in_recovery); bool pgsql_reload_conf(PGSQL *pgsql); bool pgsql_replication_slot_exists(PGSQL *pgsql, const char *slotName, bool *slotExists); bool pgsql_create_replication_slot(PGSQL *pgsql, const char *slotName); bool pgsql_drop_replication_slot(PGSQL *pgsql, const char *slotName); bool postgres_sprintf_replicationSlotName(int64_t nodeId, char *slotName, int size); bool pgsql_set_synchronous_standby_names(PGSQL *pgsql, char *synchronous_standby_names); bool pgsql_replication_slot_create_and_drop(PGSQL *pgsql, NodeAddressArray *nodeArray); bool pgsql_replication_slot_maintain(PGSQL *pgsql, NodeAddressArray *nodeArray); bool pgsql_disable_synchronous_replication(PGSQL *pgsql); bool pgsql_set_default_transaction_mode_read_only(PGSQL *pgsql); bool pgsql_set_default_transaction_mode_read_write(PGSQL *pgsql); bool pgsql_checkpoint(PGSQL *pgsql); bool pgsql_get_hba_file_path(PGSQL *pgsql, char *hbaFilePath, int maxPathLength); bool pgsql_create_database(PGSQL *pgsql, const char *dbname, const char *owner); bool pgsql_create_extension(PGSQL 
*pgsql, const char *name); bool pgsql_create_user(PGSQL *pgsql, const char *userName, const char *password, bool login, bool superuser, bool replication, int connlimit); bool pgsql_has_replica(PGSQL *pgsql, char *userName, bool *hasReplica); bool hostname_from_uri(const char *pguri, char *hostname, int maxHostLength, int *port); bool validate_connection_string(const char *connectionString); bool pgsql_reset_primary_conninfo(PGSQL *pgsql); bool pgsql_get_postgres_metadata(PGSQL *pgsql, bool *pg_is_in_recovery, char *pgsrSyncState, char *currentLSN, PostgresControlData *control); bool pgsql_one_slot_has_reached_target_lsn(PGSQL *pgsql, char *targetLSN, char *currentLSN, bool *hasReachedLSN); bool pgsql_has_reached_target_lsn(PGSQL *pgsql, char *targetLSN, char *currentLSN, bool *hasReachedLSN); bool pgsql_identify_system(PGSQL *pgsql, IdentifySystem *system); bool pgsql_listen(PGSQL *pgsql, char *channels[]); bool pgsql_prepare_to_wait(PGSQL *pgsql); bool pgsql_alter_extension_update_to(PGSQL *pgsql, const char *extname, const char *version); bool parseTimeLineHistory(const char *filename, const char *content, IdentifySystem *system); #endif /* PGSQL_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/pgtuning.c000066400000000000000000000234361414244367200225640ustar00rootroot00000000000000/* * src/bin/pg_autoctl/pgtuning.c * Adjust some very basic Postgres tuning to the system properties. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include "postgres_fe.h" #include "pqexpbuffer.h" #include "config.h" #include "env_utils.h" #include "file_utils.h" #include "log.h" #include "pgctl.h" #include "pgtuning.h" #include "system_utils.h" /* * In most cases we are going to initdb a Postgres instance for our users, we * might as well introduce some naive Postgres tuning. In the static array are * selected Postgres default values and static values we always set. 
* * Dynamic code is then used on the target systems to compute better values * dynamically for some parameters: work_mem, maintenance_work_mem, * effective_cache_size, autovacuum_max_workers. */ GUC postgres_tuning[] = { { "track_functions", "pl" }, { "shared_buffers", "'128 MB'" }, { "work_mem", "'4 MB'" }, { "maintenance_work_mem", "'64MB'" }, { "effective_cache_size", "'4 GB'" }, { "autovacuum_max_workers", "3" }, { "autovacuum_vacuum_scale_factor", "0.08" }, { "autovacuum_analyze_scale_factor", "0.02" }, { NULL, NULL } }; typedef struct DynamicTuning { int autovacuum_max_workers; uint64_t shared_buffers; uint64_t work_mem; uint64_t maintenance_work_mem; uint64_t effective_cache_size; } DynamicTuning; static bool pgtuning_compute_mem_settings(SystemInfo *sysInfo, DynamicTuning *tuning); void pgtuning_log_settings(DynamicTuning *tuning, int logLevel); static int pgtuning_compute_max_workers(SystemInfo *sysInfo); static bool pgtuning_edit_guc_settings(GUC *settings, DynamicTuning *tuning, char *config, size_t size); /* * pgtuning_prepare_guc_settings probes the system information (nCPU and total * RAM) and computes some better defaults for Postgres. */ bool pgtuning_prepare_guc_settings(GUC *settings, char *config, size_t size) { SystemInfo sysInfo = { 0 }; DynamicTuning tuning = { 0 }; char totalram[BUFSIZE] = { 0 }; if (!get_system_info(&sysInfo)) { /* errors have already been logged */ return false; } (void) pretty_print_bytes(totalram, sizeof(totalram), sysInfo.totalram); log_debug("Detected %d CPUs and %s total RAM on this server", sysInfo.ncpu, totalram); /* * Disable Postgres tuning when running the unit test suite: we install our * default set of values rather than computing better values for the * current environment. 
*/ if (!(env_exists(PG_AUTOCTL_DEBUG) && env_exists("PG_REGRESS_SOCK_DIR"))) { tuning.autovacuum_max_workers = pgtuning_compute_max_workers(&sysInfo); if (!pgtuning_compute_mem_settings(&sysInfo, &tuning)) { log_error("Failed to compute memory settings, using defaults"); return false; } (void) pgtuning_log_settings(&tuning, LOG_DEBUG); } return pgtuning_edit_guc_settings(settings, &tuning, config, size); } /* * pgtuning_compute_max_workers returns how many autovacuum max workers we can * setup on the local system, depending on its number of CPUs. * * We could certainly cook a simple enough maths expression to compute the * numbers assigned in this range based "grid" here, but that would be much * harder to maintain and change our mind about, and not as easy to grasp on a * quick reading. */ static int pgtuning_compute_max_workers(SystemInfo *sysInfo) { /* use the default up to 16 cores (HT included) */ if (sysInfo->ncpu < 16) { return 3; } else if (sysInfo->ncpu < 24) { return 4; } else if (sysInfo->ncpu < 32) { return 6; } else if (sysInfo->ncpu < 48) { return 8; } else if (sysInfo->ncpu < 64) { return 12; } else { return 16; } } /* * pgtuning_compute_work_mem computes how much work mem to use on this system. * * Inspiration has been taken from http://pgconfigurator.cybertec.at * * Rather than trying to devise a good maths expression to compute values, we * implement our decision making with a range based approach. Some values are * still computed with an expression (shared_buffers is set to 25% of the total * RAM up to 256 GB of RAM, for instance). 
*/ static bool pgtuning_compute_mem_settings(SystemInfo *sysInfo, DynamicTuning *tuning) { uint64_t oneGB = ((uint64_t) 1) << 30; /* * <= 8 GB of RAM */ if (sysInfo->totalram <= (8 * oneGB)) { tuning->shared_buffers = sysInfo->totalram / 4; tuning->work_mem = 16 * 1 << 20; /* 16 MB */ tuning->maintenance_work_mem = 256 * 1 << 20; /* 256 MB */ } /* * > 8 GB up to 64 GB of RAM */ else if (sysInfo->totalram <= (64 * oneGB)) { tuning->shared_buffers = sysInfo->totalram / 4; tuning->work_mem = 24 * 1 << 20; /* 24 MB */ tuning->maintenance_work_mem = 512 * 1 << 20; /* 512 MB */ } /* * > 64 GB up to 256 GB of RAM */ else if (sysInfo->totalram <= (256 * oneGB)) { tuning->shared_buffers = 16 * oneGB; /* 16 GB */ tuning->work_mem = 32 * 1 << 20; /* 32 MB */ tuning->maintenance_work_mem = oneGB; /* 1 GB */ } /* * > 256 GB of RAM */ else { tuning->shared_buffers = 32 * oneGB; /* 32 GB */ tuning->work_mem = 64 * 1 << 20; /* 64 MB */ tuning->maintenance_work_mem = 2 * oneGB; /* 2 GB */ } /* * What's not in shared buffers is expected to be mostly file system cache, * and then again effective_cache_size is a hint and does not need to be * the exact value as shown by the free(1) command. */ tuning->effective_cache_size = sysInfo->totalram - tuning->shared_buffers; return true; } /* * pgtuning_log_mem_settings logs the memory settings we computed. 
*/ void pgtuning_log_settings(DynamicTuning *tuning, int logLevel) { char buf[BUFSIZE] = { 0 }; log_level(logLevel, "Setting autovacuum_max_workers to %d", tuning->autovacuum_max_workers); (void) pretty_print_bytes(buf, sizeof(buf), tuning->shared_buffers); log_level(logLevel, "Setting shared_buffers to %s", buf); (void) pretty_print_bytes(buf, sizeof(buf), tuning->work_mem); log_level(logLevel, "Setting work_mem to %s", buf); (void) pretty_print_bytes(buf, sizeof(buf), tuning->maintenance_work_mem); log_level(logLevel, "Setting maintenance_work_mem to %s", buf); (void) pretty_print_bytes(buf, sizeof(buf), tuning->effective_cache_size); log_level(logLevel, "Setting effective_cache_size to %s", buf); } /* * pgtuning_edit_guc_settings prepares a Postgres configuration file snippet * from the given GUC settings and the dynamic tuning adjusted to the system * and place the resulting snippet in the pre-allocated string buffer config of * given size. */ #define streq(x, y) ((x != NULL) && (y != NULL) && (strcmp(x, y) == 0)) static bool pgtuning_edit_guc_settings(GUC *settings, DynamicTuning *tuning, char *config, size_t size) { PQExpBuffer contents = createPQExpBuffer(); int settingIndex = 0; if (contents == NULL) { log_error("Failed to allocate memory"); return false; } appendPQExpBuffer(contents, "# basic tuning computed by pg_auto_failover\n"); /* replace placeholder values with dynamic tuned values */ for (settingIndex = 0; settings[settingIndex].name != NULL; settingIndex++) { GUC *setting = &settings[settingIndex]; if (streq(setting->name, "autovacuum_max_workers")) { if (tuning->autovacuum_max_workers > 0) { appendPQExpBuffer(contents, "%s = %d\n", setting->name, tuning->autovacuum_max_workers); } else { appendPQExpBuffer(contents, "%s = %s\n", setting->name, setting->value); } } else if (streq(setting->name, "shared_buffers")) { if (tuning->shared_buffers > 0) { char pretty[BUFSIZE] = { 0 }; (void) pretty_print_bytes(pretty, sizeof(pretty), 
tuning->shared_buffers); appendPQExpBuffer(contents, "%s = '%s'\n", setting->name, pretty); } else { appendPQExpBuffer(contents, "%s = %s\n", setting->name, setting->value); } } else if (streq(setting->name, "work_mem")) { if (tuning->work_mem > 0) { char pretty[BUFSIZE] = { 0 }; (void) pretty_print_bytes(pretty, sizeof(pretty), tuning->work_mem); appendPQExpBuffer(contents, "%s = '%s'\n", setting->name, pretty); } else { appendPQExpBuffer(contents, "%s = %s\n", setting->name, setting->value); } } else if (streq(setting->name, "maintenance_work_mem")) { if (tuning->maintenance_work_mem > 0) { char pretty[BUFSIZE] = { 0 }; (void) pretty_print_bytes(pretty, sizeof(pretty), tuning->maintenance_work_mem); appendPQExpBuffer(contents, "%s = '%s'\n", setting->name, pretty); } else { appendPQExpBuffer(contents, "%s = %s\n", setting->name, setting->value); } } else if (streq(setting->name, "effective_cache_size")) { if (tuning->effective_cache_size > 0) { char pretty[BUFSIZE] = { 0 }; (void) pretty_print_bytes(pretty, sizeof(pretty), tuning->effective_cache_size); appendPQExpBuffer(contents, "%s = '%s'\n", setting->name, pretty); } else { appendPQExpBuffer(contents, "%s = %s\n", setting->name, setting->value); } } else { appendPQExpBuffer(contents, "%s = %s\n", setting->name, setting->value); } } /* memory allocation could have failed while building string */ if (PQExpBufferBroken(contents)) { log_error("Failed to allocate memory"); destroyPQExpBuffer(contents); return false; } if (size < contents->len) { log_error("Failed to prepare Postgres tuning for the local system, " "the setup needs %lu bytes and pg_autoctl only support " "up to %lu bytes", (unsigned long) contents->len, (unsigned long) size); destroyPQExpBuffer(contents); return false; } strlcpy(config, contents->data, size); destroyPQExpBuffer(contents); return true; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/pgtuning.h000066400000000000000000000006411414244367200225620ustar00rootroot00000000000000/* * 
src/bin/pg_autoctl/pgtuning.h * Adjust some very basic Postgres tuning to the system properties. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef PGTUNING_H #define PGTUNING_H #include extern GUC postgres_tuning[]; bool pgtuning_prepare_guc_settings(GUC *settings, char *config, size_t size); #endif /* PGTUNING_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/pidfile.c000066400000000000000000000304711414244367200223420ustar00rootroot00000000000000/* * src/bin/pg_autoctl/pidfile.c * Utilities to manage the pg_autoctl pidfile. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include #include #include "postgres_fe.h" #include "pqexpbuffer.h" #include "cli_common.h" #include "cli_root.h" #include "defaults.h" #include "env_utils.h" #include "fsm.h" #include "keeper.h" #include "keeper_config.h" #include "keeper_pg_init.h" #include "lock_utils.h" #include "log.h" #include "monitor.h" #include "pgctl.h" #include "pidfile.h" #include "state.h" #include "signals.h" #include "string_utils.h" /* pidfile for this process */ char service_pidfile[MAXPGPATH] = { 0 }; static void remove_service_pidfile_atexit(void); /* * create_pidfile writes our pid in a file. * * When running in a background loop, we need a pidFile to add a command line * tool that send signals to the process. The pidfile has a single line * containing our PID. 
*/ bool create_pidfile(const char *pidfile, pid_t pid) { PQExpBuffer content = createPQExpBuffer(); log_trace("create_pidfile(%d): \"%s\"", pid, pidfile); if (content == NULL) { log_fatal("Failed to allocate memory to update our PID file"); return false; } if (!prepare_pidfile_buffer(content, pid)) { /* errors have already been logged */ destroyPQExpBuffer(content); return false; } /* memory allocation could have failed while building string */ if (PQExpBufferBroken(content)) { log_error("Failed to create pidfile \"%s\": out of memory", pidfile); destroyPQExpBuffer(content); return false; } bool success = write_file(content->data, content->len, pidfile); destroyPQExpBuffer(content); return success; } /* * prepare_pidfile_buffer prepares a PQExpBuffer content with the information * expected to be found in a pidfile. */ bool prepare_pidfile_buffer(PQExpBuffer content, pid_t pid) { char pgdata[MAXPGPATH] = { 0 }; /* we get PGDATA from the environment */ if (!get_env_pgdata(pgdata)) { log_fatal("Failed to get PGDATA to create the PID file"); return false; } /* * line # * 1 supervisor PID * 2 data directory path * 3 version number (PG_AUTOCTL_VERSION) * 4 extension version number (PG_AUTOCTL_EXTENSION_VERSION) * 5 shared semaphore id (used to serialize log writes) */ appendPQExpBuffer(content, "%d\n", pid); appendPQExpBuffer(content, "%s\n", pgdata); appendPQExpBuffer(content, "%s\n", PG_AUTOCTL_VERSION); appendPQExpBuffer(content, "%s\n", PG_AUTOCTL_EXTENSION_VERSION); appendPQExpBuffer(content, "%d\n", log_semaphore.semId); return true; } /* * create_pidfile writes the given serviceName pidfile, using getpid(). 
*/ bool create_service_pidfile(const char *pidfile, const char *serviceName) { pid_t pid = getpid(); /* compute the service pidfile and store it in our global variable */ (void) get_service_pidfile(pidfile, serviceName, service_pidfile); /* register our service pidfile clean-up atexit */ atexit(remove_service_pidfile_atexit); return create_pidfile(service_pidfile, pid); } /* * get_service_pidfile computes the pidfile names for the given service. */ void get_service_pidfile(const char *pidfile, const char *serviceName, char *servicePidFilename) { char filename[MAXPGPATH] = { 0 }; sformat(filename, sizeof(filename), "pg_autoctl_%s.pid", serviceName); path_in_same_directory(pidfile, filename, servicePidFilename); } /* * remove_service_pidfile_atexit is called atexit() to remove the service * pidfile. */ static void remove_service_pidfile_atexit() { (void) remove_pidfile(service_pidfile); } /* * read_pidfile read pg_autoctl pid from a file, and returns true when we could * read a PID that belongs to a currently running process. */ bool read_pidfile(const char *pidfile, pid_t *pid) { long fileSize = 0L; char *fileContents = NULL; char *fileLines[1]; int pidnum = 0; if (!file_exists(pidfile)) { return false; } if (!read_file(pidfile, &fileContents, &fileSize)) { log_debug("Failed to read the PID file \"%s\", removing it", pidfile); (void) remove_pidfile(pidfile); return false; } splitLines(fileContents, fileLines, 1); stringToInt(fileLines[0], &pidnum); *pid = pidnum; free(fileContents); if (pid <= 0) { log_debug("Read negative pid %d in file \"%s\", removing it", *pid, pidfile); (void) remove_pidfile(pidfile); return false; } /* is it a stale file? 
*/ if (kill(*pid, 0) == 0) { return true; } else { log_debug("Failed to signal pid %d: %m", *pid); *pid = 0; log_info("Found a stale pidfile at \"%s\"", pidfile); log_warn("Removing the stale pid file \"%s\"", pidfile); /* * We must return false here, after having determined that the * pidfile belongs to a process that doesn't exist anymore. So we * remove the pidfile and don't take the return value into account * at this point. */ (void) remove_pidfile(pidfile); /* we might have to cleanup a stale SysV semaphore, too */ (void) semaphore_cleanup(pidfile); return false; } } /* * remove_pidfile removes pg_autoctl pidfile. */ bool remove_pidfile(const char *pidfile) { if (remove(pidfile) != 0) { log_error("Failed to remove pid file \"%s\": %m", pidfile); return false; } return true; } /* * check_pidfile checks that the given PID file still contains the known pid of * the service. If the file is owned by another process, just quit immediately. */ void check_pidfile(const char *pidfile, pid_t start_pid) { pid_t checkpid = 0; /* * It might happen that the PID file got removed from disk, then * allowing another process to run. * * We should then quit in an emergency if our PID file either doesn't * exist anymore, or has been overwritten with another PID. * */ if (read_pidfile(pidfile, &checkpid)) { if (checkpid != start_pid) { log_fatal("Our PID file \"%s\" now contains PID %d, " "instead of expected pid %d. Quitting.", pidfile, checkpid, start_pid); exit(EXIT_CODE_QUIT); } } else { /* * Surrendering seems the less risky option for us now. * * Any other strategy would need to be careful about race conditions * happening when several processes (keeper or others) are trying to * create or remove the pidfile at the same time, possibly in different * orders. Yeah, let's quit. 
*/ log_fatal("PID file not found at \"%s\", quitting.", pidfile); exit(EXIT_CODE_QUIT); } } /* * read_service_pidfile_version_string reads a service pidfile and copies the * version string found on line PIDFILE_LINE_VERSION_STRING into the * pre-allocated buffer versionString. */ bool read_service_pidfile_version_strings(const char *pidfile, char *versionString, char *extensionVersionString) { long fileSize = 0L; char *fileContents = NULL; char *fileLines[BUFSIZE] = { 0 }; int lineNumber; if (!read_file_if_exists(pidfile, &fileContents, &fileSize)) { return false; } int lineCount = splitLines(fileContents, fileLines, BUFSIZE); for (lineNumber = 0; lineNumber < lineCount; lineNumber++) { int pidLine = lineNumber + 1; /* zero-based, one-based */ /* version string */ if (pidLine == PIDFILE_LINE_VERSION_STRING) { strlcpy(versionString, fileLines[lineNumber], BUFSIZE); } /* extension version string, comes later in the file */ if (pidLine == PIDFILE_LINE_EXTENSION_VERSION) { strlcpy(extensionVersionString, fileLines[lineNumber], BUFSIZE); free(fileContents); return true; } } free(fileContents); return false; } /* * fprint_pidfile_as_json prints the content of the pidfile as JSON. * * When includeStatus is true, add a "status" entry for each PID (main service * and sub-processes) with either "running" or "stale" as a value, depending on * what a kill -0 reports. 
*/ void pidfile_as_json(JSON_Value *js, const char *pidfile, bool includeStatus) { JSON_Value *jsServices = json_value_init_array(); JSON_Array *jsServicesArray = json_value_get_array(jsServices); JSON_Object *jsobj = json_value_get_object(js); long fileSize = 0L; char *fileContents = NULL; char *fileLines[BUFSIZE] = { 0 }; int lineNumber; if (!read_file_if_exists(pidfile, &fileContents, &fileSize)) { exit(EXIT_CODE_INTERNAL_ERROR); } int lineCount = splitLines(fileContents, fileLines, BUFSIZE); for (lineNumber = 0; lineNumber < lineCount; lineNumber++) { int pidLine = lineNumber + 1; /* zero-based, one-based */ char *separator = NULL; /* main pid */ if (pidLine == PIDFILE_LINE_PID) { int pidnum = 0; stringToInt(fileLines[lineNumber], &pidnum); json_object_set_number(jsobj, "pid", (double) pidnum); if (includeStatus) { if (kill(pidnum, 0) == 0) { json_object_set_string(jsobj, "status", "running"); } else { json_object_set_string(jsobj, "status", "stale"); } } continue; } /* data directory */ if (pidLine == PIDFILE_LINE_DATA_DIR) { json_object_set_string(jsobj, "pgdata", fileLines[lineNumber]); } /* version string */ if (pidLine == PIDFILE_LINE_VERSION_STRING) { json_object_set_string(jsobj, "version", fileLines[lineNumber]); } /* extension version string */ if (pidLine == PIDFILE_LINE_EXTENSION_VERSION) { /* skip it, the supervisor does not connect to the monitor */ (void) 0; } /* semId */ if (pidLine == PIDFILE_LINE_SEM_ID) { int semId = 0; if (stringToInt(fileLines[lineNumber], &semId)) { json_object_set_number(jsobj, "semId", (double) semId); } else { log_error("Failed to parse semId \"%s\"", fileLines[lineNumber]); } continue; } if (pidLine >= PIDFILE_LINE_FIRST_SERVICE) { JSON_Value *jsService = json_value_init_object(); JSON_Object *jsServiceObj = json_value_get_object(jsService); if ((separator = strchr(fileLines[lineNumber], ' ')) == NULL) { log_debug("Failed to find a space separator in line: \"%s\"", fileLines[lineNumber]); continue; } else { int pidnum = 
0; char *serviceName = separator + 1; char servicePidFile[BUFSIZE] = { 0 }; char versionString[BUFSIZE] = { 0 }; char extensionVersionString[BUFSIZE] = { 0 }; *separator = '\0'; stringToInt(fileLines[lineNumber], &pidnum); json_object_set_string(jsServiceObj, "name", serviceName); json_object_set_number(jsServiceObj, "pid", pidnum); if (includeStatus) { if (kill(pidnum, 0) == 0) { json_object_set_string(jsServiceObj, "status", "running"); } else { json_object_set_string(jsServiceObj, "status", "stale"); } } /* grab version number of the service by parsing its pidfile */ get_service_pidfile(pidfile, serviceName, servicePidFile); if (!read_service_pidfile_version_strings( servicePidFile, versionString, extensionVersionString)) { /* warn about it and continue */ log_warn("Failed to read version string for " "service \"%s\" in pidfile \"%s\"", serviceName, servicePidFile); } else { json_object_set_string(jsServiceObj, "version", versionString); json_object_set_string(jsServiceObj, "pgautofailover", extensionVersionString); } } json_array_append_value(jsServicesArray, jsService); } } json_object_set_value(jsobj, "services", jsServices); free(fileContents); } bool is_process_stopped(const char *pidfile, bool *stopped, pid_t *pid) { if (!file_exists(pidfile)) { *stopped = true; return true; } if (!read_pidfile(pidfile, pid)) { log_error("Failed to read PID file \"%s\"", pidfile); return false; } *stopped = false; return true; } /* * wait_for_process_to_stop waits until the PID found in the pidfile is not running * anymore. 
*/ bool wait_for_process_to_stop(const char *pidfile, int timeout, bool *stopped, pid_t *pid) { if (!is_process_stopped(pidfile, stopped, pid)) { /* errors have already been logged */ return false; } log_info("An instance of pg_autoctl is running with PID %d, " "waiting for it to stop.", *pid); int timeout_counter = timeout; while (timeout_counter > 0) { if (kill(*pid, 0) == -1 && errno == ESRCH) { log_info("The pg_autoctl instance with pid %d " "has now terminated.", *pid); *stopped = true; return true; } sleep(1); --timeout_counter; } *stopped = false; return true; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/pidfile.h000066400000000000000000000043701414244367200223460ustar00rootroot00000000000000/* * src/bin/pg_autoctl/pidfile.h * Utilities to manage the pg_autoctl pidfile. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef PIDFILE_H #define PIDFILE_H #include #include #include "postgres_fe.h" #include "pqexpbuffer.h" #include "keeper.h" #include "keeper_config.h" #include "monitor.h" #include "monitor_config.h" /* * As of pg_autoctl 1.4, the contents of the pidfile is: * * line # * 1 supervisor PID * 2 data directory path * 3 version number (PG_AUTOCTL_VERSION) * 4 extension version number (PG_AUTOCTL_EXTENSION_VERSION) * 5 shared semaphore id (used to serialize log writes) * 6 first supervised service pid line * 7 second supervised service pid line * ... * * The supervised service lines are added later, not the first time we create * the pidfile. Each service line contains 2 bits of information, separated * with spaces: * * pid service-name * * Each service creates its own pidfile with its own version number. At * pg_autoctl upgrade time, we might have a supervisor process that's running * with a different version than one of the restarted pg_autoctl services. 
*/ #define PIDFILE_LINE_PID 1 #define PIDFILE_LINE_DATA_DIR 2 #define PIDFILE_LINE_VERSION_STRING 3 #define PIDFILE_LINE_EXTENSION_VERSION 4 #define PIDFILE_LINE_SEM_ID 5 #define PIDFILE_LINE_FIRST_SERVICE 6 bool create_pidfile(const char *pidfile, pid_t pid); bool prepare_pidfile_buffer(PQExpBuffer content, pid_t pid); bool create_service_pidfile(const char *pidfile, const char *serviceName); void get_service_pidfile(const char *pidfile, const char *serviceName, char *filename); bool read_service_pidfile_version_strings(const char *pidfile, char *versionString, char *extensionVersionString); bool read_pidfile(const char *pidfile, pid_t *pid); bool remove_pidfile(const char *pidfile); void check_pidfile(const char *pidfile, pid_t start_pid); void pidfile_as_json(JSON_Value *js, const char *pidfile, bool includeStatus); bool is_process_stopped(const char *pidfile, bool *stopped, pid_t *pid); bool wait_for_process_to_stop(const char *pidfile, int timeout, bool *stopped, pid_t *pid); #endif /* PIDFILE_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/primary_standby.c000066400000000000000000001407711414244367200241420ustar00rootroot00000000000000/* * src/bin/pg_autoctl/primary_standby.c * API to manage a local postgres database cluster * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include "postgres_fe.h" #include "config.h" #include "file_utils.h" #include "keeper.h" #include "log.h" #include "parsing.h" #include "pgctl.h" #include "pghba.h" #include "pgsql.h" #include "primary_standby.h" #include "signals.h" #include "state.h" static bool local_postgres_wait_until_ready(LocalPostgresServer *postgres); static void local_postgres_update_pg_failures_tracking(LocalPostgresServer *postgres, bool pgIsRunning); /* * Default settings for postgres databases managed by pg_auto_failover. 
* These settings primarily ensure that streaming replication is * possible and synchronous replication is the default. * * listen_addresses and port are placeholder values in this array and are * replaced with dynamic values from the setup when used. */ #define DEFAULT_GUC_SETTINGS_FOR_PG_AUTO_FAILOVER \ { "shared_preload_libraries", "pg_stat_statements" }, \ { "listen_addresses", "'*'" }, \ { "port", "5432" }, \ { "max_wal_senders", "12" }, \ { "max_replication_slots", "12" }, \ { "wal_level", "'replica'" }, \ { "wal_log_hints", "on" }, \ { "wal_sender_timeout", "'30s'" }, \ { "hot_standby_feedback", "on" }, \ { "hot_standby", "on" }, \ { "synchronous_commit", "on" }, \ { "logging_collector", "on" }, \ { "log_destination", "stderr" }, \ { "log_directory", "log" }, \ { "log_min_messages", "info" }, \ { "log_connections", "off" }, \ { "log_disconnections", "off" }, \ { "log_lock_waits", "on" }, \ { "password_encryption", "md5" }, \ { "ssl", "off" }, \ { "ssl_ca_file", "" }, \ { "ssl_crl_file", "" }, \ { "ssl_cert_file", "" }, \ { "ssl_key_file", "" }, \ { "ssl_ciphers", "'" DEFAULT_SSL_CIPHERS "'" } #define DEFAULT_GUC_SETTINGS_FOR_PG_AUTO_FAILOVER_PRE_13 \ DEFAULT_GUC_SETTINGS_FOR_PG_AUTO_FAILOVER, \ { "wal_keep_segments", "512" } #define DEFAULT_GUC_SETTINGS_FOR_PG_AUTO_FAILOVER_13 \ DEFAULT_GUC_SETTINGS_FOR_PG_AUTO_FAILOVER, \ { "wal_keep_size", "'8 GB'" } GUC postgres_default_settings_pre_13[] = { DEFAULT_GUC_SETTINGS_FOR_PG_AUTO_FAILOVER_PRE_13, { NULL, NULL } }; GUC postgres_default_settings_13[] = { DEFAULT_GUC_SETTINGS_FOR_PG_AUTO_FAILOVER_13, { NULL, NULL } }; GUC citus_default_settings_pre_13[] = { DEFAULT_GUC_SETTINGS_FOR_PG_AUTO_FAILOVER_PRE_13, { "shared_preload_libraries", "'citus,pg_stat_statements'" }, { "citus.node_conninfo", "'sslmode=prefer'" }, { "citus.cluster_name", "'default'" }, { "citus.use_secondary_nodes", "'never'" }, { "citus.local_hostname", "'localhost'" }, { NULL, NULL } }; GUC citus_default_settings_13[] = { 
DEFAULT_GUC_SETTINGS_FOR_PG_AUTO_FAILOVER_13, { "shared_preload_libraries", "'citus,pg_stat_statements'" }, { "citus.node_conninfo", "'sslmode=prefer'" }, { "citus.cluster_name", "'default'" }, { "citus.use_secondary_nodes", "'never'" }, { "citus.local_hostname", "'localhost'" }, { NULL, NULL } }; /* * local_postgres_init initializes an interface for managing a local * postgres server with the given setup. */ void local_postgres_init(LocalPostgresServer *postgres, PostgresSetup *pgSetup) { char connInfo[MAXCONNINFO]; pg_setup_get_local_connection_string(pgSetup, connInfo); pgsql_init(&postgres->sqlClient, connInfo, PGSQL_CONN_LOCAL); postgres->postgresSetup = *pgSetup; /* reset PostgreSQL restart failures tracking */ postgres->pgFirstStartFailureTs = 0; postgres->pgStartRetries = 0; /* set the local instance kind from the configuration. */ postgres->pgKind = pgSetup->pgKind; if (!local_postgres_set_status_path(postgres, true)) { /* errors have already been logged */ exit(EXIT_CODE_BAD_STATE); } } /* * local_postgres_set_status_path sets the file pathname to the pg_autoctl.pg * file that we use to signal the Postgres controller if Postgres is expected * to be running or not. * * When the file does not exist, the controller do nothing, so it's safe to * always remove the file at startup. 
*/ bool local_postgres_set_status_path(LocalPostgresServer *postgres, bool unlink) { PostgresSetup *pgSetup = &(postgres->postgresSetup); LocalExpectedPostgresStatus *pgStatus = &(postgres->expectedPgStatus); /* normalize our PGDATA path when it exists on-disk already */ if (directory_exists(pgSetup->pgdata)) { /* normalize the existing path to PGDATA */ if (!normalize_filename(pgSetup->pgdata, pgSetup->pgdata, MAXPGPATH)) { /* errors have already been logged */ return false; } } log_trace("local_postgres_set_status_path: %s", pgSetup->pgdata); /* initialize our Postgres state file path */ if (!build_xdg_path(pgStatus->pgStatusPath, XDG_RUNTIME, pgSetup->pgdata, KEEPER_POSTGRES_STATE_FILENAME)) { /* highly unexpected */ log_error("Failed to build pg_autoctl postgres state file pathname, " "see above for details."); exit(EXIT_CODE_INTERNAL_ERROR); } log_trace("local_postgres_set_status_path: %s", pgStatus->pgStatusPath); /* local_postgres_init removes any stale pg_autoctl.pg file */ if (unlink && !local_postgres_unlink_status_file(postgres)) { /* errors have already been logged */ return false; } return true; } /* * local_postgres_unlink_status_file unlinks the file we use to communicate * with the Postgres controller, so that this process won't interfere with * whatever the user is doing durning maintenance (such as stop Postgres). */ bool local_postgres_unlink_status_file(LocalPostgresServer *postgres) { LocalExpectedPostgresStatus *pgStatus = &(postgres->expectedPgStatus); log_trace("local_postgres_unlink_status_file: %s", pgStatus->pgStatusPath); return unlink_file(pgStatus->pgStatusPath); } /* * local_postgres_UpdatePgFailuresTracking updates our tracking of PostgreSQL * restart failures. 
*/ static void local_postgres_update_pg_failures_tracking(LocalPostgresServer *postgres, bool pgIsRunning) { if (pgIsRunning) { /* reset PostgreSQL restart failures tracking */ postgres->pgFirstStartFailureTs = 0; postgres->pgStartRetries = 0; postgres->pgIsRunning = true; } else { uint64_t now = time(NULL); /* update PostgreSQL restart failure tracking */ if (postgres->pgFirstStartFailureTs == 0) { postgres->pgFirstStartFailureTs = now; } ++postgres->pgStartRetries; } } /* * local_postgres_finish closes our connection to the local PostgreSQL * server, if needs be. */ void local_postgres_finish(LocalPostgresServer *postgres) { pgsql_finish(&postgres->sqlClient); } /* * local_postgres_update updates the LocalPostgresServer pgSetup information * with what we discover from the newly created Postgres instance. Typically * used just after a pg_basebackup. */ bool local_postgres_update(LocalPostgresServer *postgres, bool postgresNotRunningIsOk) { PostgresSetup *pgSetup = &(postgres->postgresSetup); PostgresSetup newPgSetup = { 0 }; bool missingPgdataIsOk = true; /* in case a connection is still established, now is time to close */ (void) local_postgres_finish(postgres); if (!pg_setup_init(&newPgSetup, pgSetup, missingPgdataIsOk, postgresNotRunningIsOk)) { /* errors have already been logged */ return false; } (void) local_postgres_init(postgres, &newPgSetup); return true; } /* * local_postgres_wait_until_ready waits until Postgres is running and updates * our failure tracking counters for the Postgres service accordingly. */ static bool local_postgres_wait_until_ready(LocalPostgresServer *postgres) { PostgresSetup *pgSetup = &(postgres->postgresSetup); int timeout = 10; /* wait for Postgres for 10s */ bool pgIsRunning = pg_is_running(pgSetup->pg_ctl, pgSetup->pgdata); log_trace("local_postgres_wait_until_ready: Postgres %s in \"%s\"", pgIsRunning ? 
"is running" : "is not running", pgSetup->pgdata); if (!pgIsRunning) { /* main logging is done in the Postgres controller sub-process */ pgIsRunning = pg_setup_wait_until_is_ready(pgSetup, timeout, LOG_DEBUG); /* update connection string for connection to postgres */ (void) local_postgres_update_pg_failures_tracking(postgres, pgIsRunning); if (pgIsRunning) { /* update pgSetup cache with new Postgres pid and all */ local_postgres_init(postgres, pgSetup); log_debug("local_postgres_wait_until_ready: Postgres is running " "with pid %d", pgSetup->pidFile.pid); } else { log_error("Failed to ensure that Postgres is running in \"%s\"", pgSetup->pgdata); } } return pgIsRunning; } /* * ensure_postgres_service_is_running signals the Postgres controller service * that Postgres is expected to be running, by updating the expectedPgStatus * file to the proper values, and then wait until Postgres is running before * returning true in case of success. */ bool ensure_postgres_service_is_running(LocalPostgresServer *postgres) { LocalExpectedPostgresStatus *pgStatus = &(postgres->expectedPgStatus); /* update our data structure in-memory, then on-disk */ if (!keeper_set_postgres_state_running(&(pgStatus->state), pgStatus->pgStatusPath)) { /* errors have already been logged */ return false; } return local_postgres_wait_until_ready(postgres); } /* * ensure_postgres_service_is_running_as_subprocess signals the Postgres * controller service that Postgres is expected to be running as a subprocess * of pg_autoctl, by updating the expectedPgStatus file to the proper values, * and then wait until Postgres is running before returning true in case of * success. 
*/ bool ensure_postgres_service_is_running_as_subprocess(LocalPostgresServer *postgres) { PostgresSetup *pgSetup = &(postgres->postgresSetup); LocalExpectedPostgresStatus *pgStatus = &(postgres->expectedPgStatus); bool pgIsRunning = pg_is_running(pgSetup->pg_ctl, pgSetup->pgdata); /* update our data structure in-memory, then on-disk */ if (!keeper_set_postgres_state_running_as_subprocess(&(pgStatus->state), pgStatus->pgStatusPath)) { /* errors have already been logged */ return false; } /* * If Postgres was already running before we wrote a new expected status * file, then the Postgres controller might be up to stop and then restart * Postgres. This happens when the already running Postgres is not a * subprocess of this pg_autoctl process, and only the controller has the * right information to check that (child process pid for "postgres"). * * Because we are lacking information, we just wait for some time before * checking if Postgres is running (again) */ if (pgIsRunning) { sleep(PG_AUTOCTL_KEEPER_SLEEP_TIME); } return local_postgres_wait_until_ready(postgres); } /* * ensure_postgres_service_is_running signals the Postgres controller service * that Postgres is expected to not be running, by updating the * expectedPgStatus file to the proper values, and then wait until Postgres is * stopped before returning true in case of success. 
*/ bool ensure_postgres_service_is_stopped(LocalPostgresServer *postgres) { PostgresSetup *pgSetup = &(postgres->postgresSetup); LocalExpectedPostgresStatus *pgStatus = &(postgres->expectedPgStatus); int timeout = 10; /* wait for Postgres for 10s */ log_trace("keeper_ensure_postgres_is_stopped"); /* update our data structure in-memory, then on-disk */ if (!keeper_set_postgres_state_stopped(&(pgStatus->state), pgStatus->pgStatusPath)) { /* errors have already been logged */ return false; } return pg_setup_wait_until_is_stopped(pgSetup, timeout, LOG_DEBUG); } /* * primary_has_replica returns whether the local postgres server has a * replica that is connecting using the given user name. */ bool primary_has_replica(LocalPostgresServer *postgres, char *userName, bool *hasStandby) { PGSQL *pgsql = &(postgres->sqlClient); log_trace("primary_has_replica"); bool result = pgsql_has_replica(pgsql, userName, hasStandby); pgsql_finish(pgsql); return result; } /* * upstream_has_replication_slot checks whether the upstream server already has * created our replication slot. */ bool upstream_has_replication_slot(ReplicationSource *upstream, PostgresSetup *pgSetup, bool *hasReplicationSlot) { NodeAddress *primaryNode = &(upstream->primaryNode); PostgresSetup upstreamSetup = { 0 }; PGSQL upstreamClient = { 0 }; char connectionString[MAXCONNINFO] = { 0 }; /* prepare a PostgresSetup that allows preparing a connection string */ strlcpy(upstreamSetup.username, PG_AUTOCTL_REPLICA_USERNAME, NAMEDATALEN); strlcpy(upstreamSetup.dbname, pgSetup->dbname, NAMEDATALEN); strlcpy(upstreamSetup.pghost, primaryNode->host, _POSIX_HOST_NAME_MAX); upstreamSetup.pgport = primaryNode->port; upstreamSetup.ssl = pgSetup->ssl; /* * Build the connection string as if to a local node, but we tweaked the * pgsetup to target the primary node by changing its pghost and pgport. 
*/ pg_setup_get_local_connection_string(&upstreamSetup, connectionString); if (!pgsql_init(&upstreamClient, connectionString, PGSQL_CONN_UPSTREAM)) { /* errors have already been logged */ return false; } if (!pgsql_replication_slot_exists(&upstreamClient, upstream->slotName, hasReplicationSlot)) { /* errors have already been logged */ PQfinish(upstreamClient.connection); return false; } PQfinish(upstreamClient.connection); return true; } /* * primary_create_replication_slot (re)creates a replication slot. The * replication slot will not have its LSN initialized until first use. The * return value indicates whether the operation was successful. */ bool primary_create_replication_slot(LocalPostgresServer *postgres, char *replicationSlotName) { PGSQL *pgsql = &(postgres->sqlClient); log_trace("primary_create_replication_slot(%s)", replicationSlotName); bool result = pgsql_create_replication_slot(pgsql, replicationSlotName); pgsql_finish(pgsql); return result; } /* * primary_drop_replication_slot drops a replication slot if it exists. The * return value indicates whether the operation was successful. */ bool primary_drop_replication_slot(LocalPostgresServer *postgres, char *replicationSlotName) { PGSQL *pgsql = &(postgres->sqlClient); log_trace("primary_drop_replication_slot"); bool result = pgsql_drop_replication_slot(pgsql, replicationSlotName); pgsql_finish(pgsql); return result; } /* * primary_drop_all_replication_slots drops all the replication slots found on * a node. * * When a node has been demoted, the replication slots that used to be * maintained by the streaming replication protocol are now going to be * maintained "manually" by pg_autoctl using pg_replication_slot_advance(). * * There is a problem in pg_replication_slot_advance() in that it only * maintains the restart_lsn property of a replication slot, it does not * maintain the xmin of it. 
When re-using the pre-existing replication slots, * we want to have a NULL xmin, so we drop the slots, and then create them * again. */ bool primary_drop_all_replication_slots(LocalPostgresServer *postgres) { NodeAddressArray otherNodesArray = { 0 }; log_info("Dropping replication slots (to reset their xmin)"); if (!postgres_replication_slot_create_and_drop(postgres, &otherNodesArray)) { log_error("Failed to drop replication slots on the local Postgres " "instance, see above for details"); return false; } return true; } /* * postgres_replication_slot_create_and_drop drops the replication slots that * belong to dropped nodes on a primary server, and creates replication slots * for newly created nodes on the monitor. */ bool postgres_replication_slot_create_and_drop(LocalPostgresServer *postgres, NodeAddressArray *nodeArray) { PGSQL *pgsql = &(postgres->sqlClient); log_trace("postgres_replication_slot_drop_removed"); bool result = pgsql_replication_slot_create_and_drop(pgsql, nodeArray); pgsql_finish(pgsql); return result; } /* * postgres_replication_slot_advance advances the current confirmed position of * the given replication slot up to the given LSN position. */ bool postgres_replication_slot_maintain(LocalPostgresServer *postgres, NodeAddressArray *nodeArray) { PGSQL *pgsql = &(postgres->sqlClient); log_trace("postgres_replication_slot_maintain"); bool result = pgsql_replication_slot_maintain(pgsql, nodeArray); pgsql_finish(pgsql); return result; } /* * primary_enable_synchronous_replication enables synchronous replication * on a primary postgres node. 
*/ bool primary_set_synchronous_standby_names(LocalPostgresServer *postgres) { PGSQL *pgsql = &(postgres->sqlClient); log_info("Setting synchronous_standby_names to '%s'", postgres->synchronousStandbyNames); bool result = pgsql_set_synchronous_standby_names(pgsql, postgres->synchronousStandbyNames); pgsql_finish(pgsql); return result; } /* * primary_disable_synchronous_replication disables synchronous replication * on a primary postgres node. */ bool primary_disable_synchronous_replication(LocalPostgresServer *postgres) { PGSQL *pgsql = &(postgres->sqlClient); log_trace("primary_disable_synchronous_replication"); bool result = pgsql_disable_synchronous_replication(pgsql); pgsql_finish(pgsql); return result; } /* * postgres_add_default_settings ensures that postgresql.conf includes a * postgresql-auto-failover.conf file that sets a number of good defaults for * settings related to streaming replication and running pg_auto_failover. */ bool postgres_add_default_settings(LocalPostgresServer *postgres, const char *hostname) { PGSQL *pgsql = &(postgres->sqlClient); PostgresSetup *pgSetup = &(postgres->postgresSetup); char configFilePath[MAXPGPATH]; GUC *default_settings = NULL; log_trace("postgres_add_default_settings (%s) [%d]", nodeKindToString(postgres->pgKind), pgSetup->control.pg_control_version); /* configFilePath = $PGDATA/postgresql.conf */ join_path_components(configFilePath, pgSetup->pgdata, "postgresql.conf"); /* in case of errors, pgsql_ functions finish the connection */ pgsql_finish(pgsql); /* * default settings are different depending on Postgres version and Citus * usage, so fetch the curent pg_control_version and make a decision * depending on that. * * Note that many calls to postgres_add_default_settings happen before we * have had the opportunity to call pg_controldata, so now is a good time * to do that. 
*/ if (pgSetup->control.pg_control_version == 0) { bool missingPgdataIsOk = false; if (!pg_controldata(pgSetup, missingPgdataIsOk)) { /* errors have already been logged */ return false; } } if (pgSetup->control.pg_control_version < 1300) { if (IS_CITUS_INSTANCE_KIND(postgres->pgKind)) { default_settings = citus_default_settings_pre_13; } else { default_settings = postgres_default_settings_pre_13; } } else { if (IS_CITUS_INSTANCE_KIND(postgres->pgKind)) { default_settings = citus_default_settings_13; } else { default_settings = postgres_default_settings_13; } } if (!pg_add_auto_failover_default_settings(pgSetup, hostname, configFilePath, default_settings)) { log_error("Failed to add default settings to postgresql.conf: couldn't " "write the new postgresql.conf, see above for details"); return false; } return true; } /* * primary_create_user_with_hba creates a user and updates pg_hba.conf * to allow the user to connect from the given hostname. */ bool primary_create_user_with_hba(LocalPostgresServer *postgres, char *userName, char *password, char *hostname, char *authMethod, HBAEditLevel hbaLevel, int connlimit) { PGSQL *pgsql = &(postgres->sqlClient); bool login = true; bool superuser = false; bool replication = false; char hbaFilePath[MAXPGPATH]; log_trace("primary_create_user_with_hba"); if (!pgsql_create_user(pgsql, userName, password, login, superuser, replication, connlimit)) { log_error("Failed to create user \"%s\" on local postgres server", userName); return false; } if (!pgsql_get_hba_file_path(pgsql, hbaFilePath, MAXPGPATH)) { log_error("Failed to set the pg_hba rule for user \"%s\": couldn't get " "hba_file from local postgres server", userName); return false; } if (!pghba_ensure_host_rule_exists(hbaFilePath, postgres->postgresSetup.ssl.active, HBA_DATABASE_ALL, NULL, userName, hostname, authMethod, hbaLevel)) { log_error("Failed to set the pg_hba rule for user \"%s\"", userName); return false; } if (!pgsql_reload_conf(pgsql)) { log_error("Failed to 
reload pg_hba settings after updating pg_hba.conf"); return false; } pgsql_finish(pgsql); return true; } /* * primary_create_replication_user creates a user that allows the secondary * to connect for replication. */ bool primary_create_replication_user(LocalPostgresServer *postgres, char *replicationUsername, char *replicationPassword) { PGSQL *pgsql = &(postgres->sqlClient); bool login = true; bool superuser = true; bool replication = true; int connlimit = -1; log_trace("primary_create_replication_user"); bool result = pgsql_create_user(pgsql, replicationUsername, replicationPassword, login, superuser, replication, connlimit); pgsql_finish(pgsql); return result; } /* * standby_init_replication_source initializes a replication source structure * with given arguments. If the upstreamNode is NULL, then the * replicationSource.primary structure slot is not updated. * * Note that we just store the pointers to all those const char *arguments * here, expect for the upstreamNode there's no copying involved. 
*/ bool standby_init_replication_source(LocalPostgresServer *postgres, NodeAddress *upstreamNode, const char *username, const char *password, const char *slotName, const char *maximumBackupRate, const char *backupDirectory, const char *targetLSN, SSLOptions sslOptions, int currentNodeId) { ReplicationSource *upstream = &(postgres->replicationSource); if (upstreamNode != NULL) { upstream->primaryNode.nodeId = upstreamNode->nodeId; strlcpy(upstream->primaryNode.name, upstreamNode->name, _POSIX_HOST_NAME_MAX); strlcpy(upstream->primaryNode.host, upstreamNode->host, _POSIX_HOST_NAME_MAX); upstream->primaryNode.port = upstreamNode->port; } strlcpy(upstream->userName, username, NAMEDATALEN); if (password != NULL) { strlcpy(upstream->password, password, MAXCONNINFO); } strlcpy(upstream->slotName, slotName, MAXCONNINFO); strlcpy(upstream->maximumBackupRate, maximumBackupRate, MAXIMUM_BACKUP_RATE_LEN); strlcpy(upstream->backupDir, backupDirectory, MAXCONNINFO); if (targetLSN != NULL) { strlcpy(upstream->targetLSN, targetLSN, PG_LSN_MAXLENGTH); } upstream->sslOptions = sslOptions; /* prepare our application_name */ sformat(upstream->applicationName, MAXCONNINFO, "%s%d", REPLICATION_APPLICATION_NAME_PREFIX, currentNodeId); return true; } /* * standby_init_database tries to initialize PostgreSQL as a hot standby. It uses * pg_basebackup to do so. Returns false on failure. 
*/ bool standby_init_database(LocalPostgresServer *postgres, const char *hostname, bool skipBaseBackup) { PostgresSetup *pgSetup = &(postgres->postgresSetup); ReplicationSource *upstream = &(postgres->replicationSource); log_trace("standby_init_database"); log_info("Initialising PostgreSQL as a hot standby"); if (pg_setup_pgdata_exists(pgSetup) && pg_setup_is_running(pgSetup)) { log_info("Target directory exists: \"%s\", stopping PostgreSQL", pgSetup->pgdata); /* try to stop PostgreSQL, stop here if that fails */ if (!ensure_postgres_service_is_stopped(postgres)) { log_error("Failed to initialize a standby: " "the database directory exists " "and postgres could not be stopped"); return false; } } /* * Now, we know that pgdata either doesn't exists or belongs to a stopped * PostgreSQL instance. We can safely proceed with pg_basebackup. * * We might be asked to skip pg_basebackup when the PGDATA directory has * already been prepared externally: typically we are creating a standby * node and it was faster to install PGDATA from a file system snapshot or * a backup/recovery tooling. */ if (skipBaseBackup) { log_info("Skipping base backup to use pre-existing PGDATA at \"%s\"", pgSetup->pgdata); } else { /* * pg_basebackup has this bug where it will copy over the whole PGDATA * contents even if the WAL receiver subprocess fails early, typically * when the replication slot does not exist on the target connection. * * We want to protect against this case here, so we manually check that * the replication exists before calling pg_basebackup. */ bool hasReplicationSlot = false; /* * When initialising from another standby (in REPORT_LSN, if there is * currently no primary node and no candidate node either), we don't * require a replication slot on the upstream node. 
*/ bool needsReplicationSlot = !IS_EMPTY_STRING_BUFFER(upstream->slotName); if (needsReplicationSlot && !upstream_has_replication_slot(upstream, pgSetup, &hasReplicationSlot)) { /* errors have already been logged */ return false; } if (!needsReplicationSlot || hasReplicationSlot) { /* first, make sure we can connect with "replication" */ if (!pgctl_identify_system(upstream)) { log_error("Failed to connect to the primary with a replication " "connection string. See above for details"); return false; } /* now pg_basebackup from our upstream node */ if (!pg_basebackup(pgSetup->pgdata, pgSetup->pg_ctl, upstream)) { return false; } } else { log_error("The replication slot \"%s\" has not been created yet " "on the primary node " NODE_FORMAT, upstream->slotName, upstream->primaryNode.nodeId, upstream->primaryNode.name, upstream->primaryNode.host, upstream->primaryNode.port); return false; } } /* we have a new PGDATA, update our pgSetup information */ if (!local_postgres_update(postgres, true)) { log_error("Failed to update our internal Postgres representation " "after pg_basebackup, see above for details"); return false; } /* now setup the replication configuration (primary_conninfo etc) */ if (!pg_setup_standby_mode(pgSetup->control.pg_control_version, pgSetup->pgdata, pgSetup->pg_ctl, upstream)) { log_error("Failed to setup Postgres as a standby after pg_basebackup"); return false; } /* * When --ssl-self-signed has been used, now is the time to build a * self-signed certificate for the server. We place the certificate and * private key in $PGDATA/server.key and $PGDATA/server.crt * * In particular we override the certificates that we might have fetched * from the primary as part of pg_basebackup: we're not a backup, we're a * standby node, we need our own certificate (even if self-signed). 
*/ if (pgSetup->ssl.createSelfSignedCert) { if (!pg_create_self_signed_cert(pgSetup, hostname)) { log_error("Failed to create SSL self-signed certificate, " "see above for details"); return false; } } /* * We might have local edits to implement to the PostgreSQL * configuration, such as a specific listen_addresses or different TLS * key and cert locations. By changing this before starting postgres these * new settings will automatically be applied. */ if (!postgres_add_default_settings(postgres, hostname)) { log_error("Failed to add default settings to the secondary, " "see above for details."); return false; } if (!ensure_postgres_service_is_running(postgres)) { return false; } log_info("PostgreSQL started on port %d", pgSetup->pgport); return true; } /* * primary_rewind_to_standby brings a database directory of a failed primary back * into a state where it can become the standby of the new primary. */ bool primary_rewind_to_standby(LocalPostgresServer *postgres) { PostgresSetup *pgSetup = &(postgres->postgresSetup); ReplicationSource *replicationSource = &(postgres->replicationSource); NodeAddress *primaryNode = &(replicationSource->primaryNode); log_trace("primary_rewind_to_standby"); log_info("Rewinding PostgreSQL to follow new primary node " NODE_FORMAT, primaryNode->nodeId, primaryNode->name, primaryNode->host, primaryNode->port); if (!ensure_postgres_service_is_stopped(postgres)) { log_error("Failed to stop postgres to do rewind"); return false; } if (!postgres_maybe_do_crash_recovery(postgres)) { log_error("Failed to implement Postgres crash recovery " "before calling pg_rewind"); return false; } /* before pg_rewind, make sure we can connect with "replication" */ if (!pgctl_identify_system(replicationSource)) { log_error("Failed to connect to the primary node " NODE_FORMAT "with a replication connection string. 
" "See above for details", primaryNode->nodeId, primaryNode->name, primaryNode->host, primaryNode->port); } if (!pg_rewind(pgSetup->pgdata, pgSetup->pg_ctl, replicationSource)) { log_error("Failed to rewind old data directory"); return false; } if (!pg_setup_standby_mode(pgSetup->control.pg_control_version, pgSetup->pgdata, pgSetup->pg_ctl, replicationSource)) { log_error("Failed to setup Postgres as a standby, after rewind"); return false; } if (!ensure_postgres_service_is_running(postgres)) { log_error("Failed to start postgres after rewind"); return false; } return true; } /* * postgres_maybe_do_crash_recovery implements a round of Postgres crash * recovery for the local instance of Postgres when pg_rewind would otherwise * fail because of its internal checks. */ bool postgres_maybe_do_crash_recovery(LocalPostgresServer *postgres) { PostgresSetup *pgSetup = &(postgres->postgresSetup); ReplicationSource *replicationSource = &(postgres->replicationSource); LocalExpectedPostgresStatus *pgStatus = &(postgres->expectedPgStatus); /* update our service controller for Postgres to release control */ if (!keeper_set_postgres_state_unknown(&(pgStatus->state), pgStatus->pgStatusPath)) { /* errors have already been logged */ return false; } /* we don't log the output for pg_ctl_status here */ int status = pg_ctl_status(pgSetup->pg_ctl, pgSetup->pgdata, false); if (status != PG_CTL_STATUS_NOT_RUNNING) { log_error("Failed to prepare for crash recovery: " "Postgres is not stopped"); return false; } /* * pg_rewind fails when the target cluster (meaning the local Postgres * instance) is either running or has not been shutdown correctly. Time to * use pg_controldata and see if the DBState there is to pg_rewind liking. */ const bool missingPgdataIsOk = false; if (!pg_controldata(pgSetup, missingPgdataIsOk)) { /* errors have already been logged */ return false; } /* * We know that Postgres is not running thanks to pg_ctl_status, and we * just grabbed the output from pg_controldata. 
We can now implement the * same pre-condition checks as in Postgres pg_rewind.c. */ if (pgSetup->control.state != DB_SHUTDOWNED && pgSetup->control.state != DB_SHUTDOWNED_IN_RECOVERY) { /* * Before calling pg_rewind, attempt crash recovery on the Postgres * instance and then shutdown. */ ReplicationSource crashRecoveryReplicationSource = { 0 }; log_info("Postgres needs to enter crash recovery before pg_rewind."); crashRecoveryReplicationSource = *replicationSource; /* we target the earlier consistent state possible, or 'immediate' */ strlcpy(crashRecoveryReplicationSource.targetLSN, "immediate", sizeof(crashRecoveryReplicationSource.targetLSN)); /* pause when reaching target to avoid creating a new local timeline */ strlcpy(crashRecoveryReplicationSource.targetAction, "pause", sizeof(crashRecoveryReplicationSource.targetAction)); strlcpy(crashRecoveryReplicationSource.targetTimeline, "current", sizeof(crashRecoveryReplicationSource.targetTimeline)); if (!pg_setup_standby_mode(pgSetup->control.pg_control_version, pgSetup->pgdata, pgSetup->pg_ctl, &crashRecoveryReplicationSource)) { log_error("Failed to setup for crash recovery " "in preparation for pg_rewind"); return false; } /* * Now that the configuration file is ready and asks for Postgres * shutdown when reaching crash recovery time, we start postgres as a * sub-process here and wait for it to terminate. 
*/ fflush(stdout); fflush(stderr); /* time to create the node_active sub-process */ pid_t fpid = fork(); switch (fpid) { case -1: { log_error("Failed to fork the postgres supervisor process"); return false; } case 0: { /* execv() the postgres binary directly, as a sub-process */ (void) pg_ctl_postgres(pgSetup->pg_ctl, pgSetup->pgdata, pgSetup->pgport, pgSetup->listen_addresses, /* do not open the service just yet */ false); /* unexpected */ log_fatal("BUG: returned from service_keeper_runprogram()"); exit(EXIT_CODE_INTERNAL_ERROR); } default: { /* wait until postgres crash recovery is done */ for (int attempts = 0;; attempts++) { int timeout = 30; if (pg_setup_wait_until_is_ready(pgSetup, timeout, LOG_INFO)) { break; } } /* get Postgres current LSN after recovery, might be useful */ PGSQL *pgsql = &(postgres->sqlClient); if (pgsql_get_postgres_metadata(pgsql, &pgSetup->is_in_recovery, postgres->pgsrSyncState, postgres->currentLSN, &(pgSetup->control))) { log_info("Postgres has finished crash recovery at LSN %s", postgres->currentLSN); } else { log_error("Failed to get Postgres metadata, continuing"); } /* * Now stop Postgres by just killing our child process, and * wait until the child process has finished with waitpid(). 
*/ int wpid, status; do { if (kill(fpid, SIGTERM) != 0) { log_error("Failed to send SIGTERM to " "Postgres pid %d: %m", fpid); return false; } wpid = waitpid(fpid, &status, WNOHANG); if (wpid == -1) { log_warn("Failed to wait until Postgres is done: %m"); } /* waitpid could be WIFSTOPPED, then try again */ } while (!(WIFEXITED(status) || !WIFSIGNALED(status))); if (WIFEXITED(status) && WEXITSTATUS(status) == EXIT_CODE_QUIT) { return true; } else if (WIFEXITED(status)) { int returnCode = WEXITSTATUS(status); log_warn("Postgres has finished crash recovery with " "exit code %d", returnCode); (void) pg_log_startup(pgSetup->pgdata, LOG_INFO); } else { log_error("BUG: can't make sense of waitpid() exit code"); return false; } } } } return true; } /* * standby_promote promotes a standby postgres server to primary. */ bool standby_promote(LocalPostgresServer *postgres) { PGSQL *pgsql = &(postgres->sqlClient); PostgresSetup *pgSetup = &(postgres->postgresSetup); bool inRecovery = false; log_trace("standby_promote"); if (!pgsql_is_in_recovery(pgsql, &inRecovery)) { log_error("Failed to promote standby: couldn't determine whether postgres " "is in recovery mode"); return false; } if (!inRecovery) { log_info("Skipping promotion: postgres is not in recovery mode"); /* * Ensure idempotency: if in the last run we managed to promote, but * failed to checkpoint, we still need to checkpoint. 
*/ if (!pgsql_checkpoint(pgsql)) { log_error("Failed to checkpoint after promotion"); return false; } return true; } /* disconnect from PostgreSQL now */ pgsql_finish(pgsql); log_info("Promoting postgres"); if (!pg_ctl_promote(pgSetup->pg_ctl, pgSetup->pgdata)) { log_error("Failed to promote standby: see pg_ctl promote errors above"); return false; } do { log_info("Waiting for postgres to promote"); pg_usleep(AWAIT_PROMOTION_SLEEP_TIME_MS * 1000); if (asked_to_stop || asked_to_stop_fast) { log_trace("standby_promote: signaled"); pgsql_finish(pgsql); return false; } if (!pgsql_is_in_recovery(pgsql, &inRecovery)) { log_error("Failed to determine whether postgres is in " "recovery mode after promotion"); return false; } } while (inRecovery); /* * It's necessary to do a checkpoint before allowing the old primary to * rewind, since there can be a race condition in which pg_rewind detects * no change in timeline in the pg_control file, but a checkpoint is * already in progress causing the timelines to diverge before replication * starts. */ if (!pgsql_checkpoint(pgsql)) { log_error("Failed to checkpoint after promotion"); return false; } /* cleanup our standby setup */ if (!pg_cleanup_standby_mode(pgSetup->control.pg_control_version, pgSetup->pg_ctl, pgSetup->pgdata, pgsql)) { log_error("Failed to clean-up Postgres replication settings, " "see above for details"); return false; } /* disconnect from PostgreSQL now */ pgsql_finish(pgsql); return true; } /* * check_postgresql_settings returns true when our minimal set of PostgreSQL * settings are correctly setup on the target server. 
*/ bool check_postgresql_settings(LocalPostgresServer *postgres, bool *settings_are_ok) { PGSQL *pgsql = &(postgres->sqlClient); bool isCitusInstanceKind = IS_CITUS_INSTANCE_KIND(postgres->pgKind); bool result = pgsql_check_postgresql_settings(pgsql, isCitusInstanceKind, settings_are_ok); pgsql_finish(pgsql); return result; } /* * primary_standby_has_caught_up loops over a SQL query on the primary that * checks the current reported LSN from the standby's replication slot. */ bool primary_standby_has_caught_up(LocalPostgresServer *postgres) { PGSQL *pgsql = &(postgres->sqlClient); char standbyCurrentLSN[PG_LSN_MAXLENGTH] = { 0 }; bool hasReachedLSN = false; /* ensure some WAL level traffic to move things forward */ if (!pgsql_checkpoint(pgsql)) { log_error("Failed to checkpoint before checking " "if a standby has caught-up to LSN %s", postgres->standbyTargetLSN); return false; } if (!pgsql_one_slot_has_reached_target_lsn(pgsql, postgres->standbyTargetLSN, standbyCurrentLSN, &hasReachedLSN)) { /* errors have already been logged */ return false; } if (hasReachedLSN) { log_info("Standby reached LSN %s, thus advanced past LSN %s", standbyCurrentLSN, postgres->standbyTargetLSN); /* cache invalidation */ bzero((void *) postgres->standbyTargetLSN, PG_LSN_MAXLENGTH); return true; } else { log_info("Standby reached LSN %s, waiting for LSN %s", standbyCurrentLSN, postgres->standbyTargetLSN); return false; } } /* * standby_follow_new_primary rewrites the replication setup to follow the new * primary after a failover. 
*/ bool standby_follow_new_primary(LocalPostgresServer *postgres) { PGSQL *pgsql = &(postgres->sqlClient); PostgresSetup *pgSetup = &(postgres->postgresSetup); ReplicationSource *replicationSource = &(postgres->replicationSource); NodeAddress *primaryNode = &(replicationSource->primaryNode); log_info("Follow new primary node " NODE_FORMAT, primaryNode->nodeId, primaryNode->name, primaryNode->host, primaryNode->port); /* when we have a primary, only proceed if we can reach it */ if (!IS_EMPTY_STRING_BUFFER(replicationSource->primaryNode.host)) { if (!pgctl_identify_system(replicationSource)) { log_error("Failed to establish a replication connection " "to the new primary, see above for details"); return false; } } /* cleanup our existing standby setup, including postgresql.auto.conf */ if (!pg_cleanup_standby_mode(pgSetup->control.pg_control_version, pgSetup->pg_ctl, pgSetup->pgdata, pgsql)) { log_error("Failed to clean-up Postgres replication settings, " "see above for details"); return false; } /* we might be back from maintenance and find Postgres is not running */ if (pg_is_running(pgSetup->pg_ctl, pgSetup->pgdata)) { log_info("Stopping Postgres at \"%s\"", pgSetup->pgdata); if (!ensure_postgres_service_is_stopped(postgres)) { log_error("Failed to stop Postgres at \"%s\"", pgSetup->pgdata); return false; } } if (!pg_setup_standby_mode(pgSetup->control.pg_control_version, pgSetup->pgdata, pgSetup->pg_ctl, replicationSource)) { log_error("Failed to setup Postgres as a standby"); return false; } log_info("Restarting Postgres at \"%s\"", pgSetup->pgdata); if (!ensure_postgres_service_is_running(postgres)) { log_error("Failed to restart Postgres after changing its " "primary conninfo, see above for details"); return false; } return true; } /* * standby_fetch_missing_wal sets up replication to fetch up to given * recovery_target_lsn (inclusive) with a recovery_target_action set to * 'promote' so that as soon as we get our WAL bytes we are promoted to being a * primary. 
*/ bool standby_fetch_missing_wal(LocalPostgresServer *postgres) { PGSQL *pgsql = &(postgres->sqlClient); ReplicationSource *replicationSource = &(postgres->replicationSource); NodeAddress *upstreamNode = &(replicationSource->primaryNode); char currentLSN[PG_LSN_MAXLENGTH] = { 0 }; bool hasReachedLSN = false; log_info("Fetching WAL from upstream node " NODE_FORMAT "up to LSN %s", upstreamNode->nodeId, upstreamNode->name, upstreamNode->host, upstreamNode->port, replicationSource->targetLSN); /* apply new replication source to fetch missing WAL bits */ if (!standby_restart_with_current_replication_source(postgres)) { log_error("Failed to setup replication " "from upstream node " NODE_FORMAT ", see above for details", upstreamNode->nodeId, upstreamNode->name, upstreamNode->host, upstreamNode->port); } /* * Now loop until replay has reached our targetLSN. */ while (!hasReachedLSN) { if (asked_to_stop || asked_to_stop_fast) { log_trace("standby_fetch_missing_wal_and_promote: signaled"); break; } if (!pgsql_has_reached_target_lsn(pgsql, replicationSource->targetLSN, currentLSN, &hasReachedLSN)) { /* errors have already been logged */ return false; } if (!hasReachedLSN) { log_info("Postgres recovery is at LSN %s, waiting for LSN %s", currentLSN, replicationSource->targetLSN); pg_usleep(AWAIT_PROMOTION_SLEEP_TIME_MS * 1000); } } /* done with fast-forwarding, keep the value for node_active() call */ strlcpy(postgres->currentLSN, currentLSN, PG_LSN_MAXLENGTH); /* we might have been interrupted before the end */ if (!hasReachedLSN) { log_error("Fast-forward reached LSN %s, target LSN is %s", postgres->currentLSN, replicationSource->targetLSN); pgsql_finish(pgsql); return false; } log_info("Fast-forward is done, now at LSN %s", postgres->currentLSN); /* * It's necessary to do a checkpoint before allowing the old primary to * rewind, since there can be a race condition in which pg_rewind detects * no change in timeline in the pg_control file, but a checkpoint is * already in 
progress causing the timelines to diverge before replication * starts. */ if (!pgsql_checkpoint(pgsql)) { log_error("Failed to checkpoint after fast-forward to LSN %s", postgres->currentLSN); return false; } /* disconnect from PostgreSQL now */ pgsql_finish(pgsql); return true; } /* * standby_restart_with_no_primary sets up recovery parameters without a * primary_conninfo, so as to force disconnect from the primary and still * remain a standby that can report its current LSN position, for instance. */ bool standby_restart_with_current_replication_source(LocalPostgresServer *postgres) { PGSQL *pgsql = &(postgres->sqlClient); PostgresSetup *pgSetup = &(postgres->postgresSetup); ReplicationSource *replicationSource = &(postgres->replicationSource); /* when we have a primary, only proceed if we can reach it */ if (!IS_EMPTY_STRING_BUFFER(replicationSource->primaryNode.host)) { if (!pgctl_identify_system(replicationSource)) { log_error("Failed to establish a replication connection " "to the primary node, see above for details"); return false; } } /* cleanup our existing standby setup, including postgresql.auto.conf */ if (!pg_cleanup_standby_mode(pgSetup->control.pg_control_version, pgSetup->pg_ctl, pgSetup->pgdata, pgsql)) { log_error("Failed to clean-up Postgres replication settings, " "see above for details"); return false; } log_info("Stopping Postgres at \"%s\"", pgSetup->pgdata); if (!ensure_postgres_service_is_stopped(postgres)) { log_error("Failed to stop Postgres at \"%s\"", pgSetup->pgdata); return false; } if (!pg_setup_standby_mode(pgSetup->control.pg_control_version, pgSetup->pgdata, pgSetup->pg_ctl, replicationSource)) { log_error("Failed to setup Postgres as a standby, after rewind"); return false; } log_info("Restarting Postgres at \"%s\"", pgSetup->pgdata); if (!ensure_postgres_service_is_running(postgres)) { log_error("Failed to restart Postgres after changing its " "primary conninfo, see above for details"); return false; } return true; } /* * 
standby_cleanup_as_primary removes the setup for a standby server and * restarts as a primary. It's typically called after standby_fetch_missing_wal * so we expect Postgres to be running as a standby and be "paused". */ bool standby_cleanup_as_primary(LocalPostgresServer *postgres) { PGSQL *pgsql = &(postgres->sqlClient); PostgresSetup *pgSetup = &(postgres->postgresSetup); log_info("Cleaning-up Postgres replication settings"); if (!pg_cleanup_standby_mode(pgSetup->control.pg_control_version, pgSetup->pg_ctl, pgSetup->pgdata, pgsql)) { log_error("Failed to clean-up Postgres replication settings, " "see above for details"); return false; } return true; } /* * standby_check_timeline_with_upstream returns true when the current timeline * on the local node (a standby) is the same as the timeline fetched on the * upstream node setup in its replicationSource. */ bool standby_check_timeline_with_upstream(LocalPostgresServer *postgres) { ReplicationSource *replicationSource = &(postgres->replicationSource); NodeAddress *primaryNode = &(replicationSource->primaryNode); /* fetch timeline information from the upstream node */ if (!pgctl_identify_system(replicationSource)) { log_error("Failed to establish a replication connection " "to the new primary, see above for details"); return false; } /* fetch most recent local metadata, including the timeline id. 
*/ if (!pgsql_get_postgres_metadata(&(postgres->sqlClient), &(postgres->postgresSetup.is_in_recovery), postgres->pgsrSyncState, postgres->currentLSN, &(postgres->postgresSetup.control))) { log_error("Failed to update the local Postgres metadata"); return false; } uint32_t upstreamTimeline = replicationSource->system.timeline; uint32_t localTimeline = postgres->postgresSetup.control.timeline_id; /* we might not be connected to the primary yet */ if (localTimeline == 0) { log_warn("Current received timeline is unknown, pg_autoctl will " "retry this transition."); return false; } /* * We only allow this transition when the standby node as caught-up with * the upstream timeline. As streaming replication is supposed to be a * clean history replay (no PITR shenanigans), it is never expected that * the local timeline would be greater than the timeline found on the * upstream node. */ if (upstreamTimeline < localTimeline) { log_error("Current timeline on upstream node " NODE_FORMAT " is %d, and current timeline on this standby node is %d", primaryNode->nodeId, primaryNode->name, primaryNode->host, primaryNode->port, upstreamTimeline, localTimeline); return false; } else if (upstreamTimeline > localTimeline) { log_warn("Current timeline on upstream node " NODE_FORMAT " is %d, and current timeline on this standby node is still %d", primaryNode->nodeId, primaryNode->name, primaryNode->host, primaryNode->port, upstreamTimeline, localTimeline); return false; } else if (upstreamTimeline == localTimeline) { log_info("Reached timeline %d, same as upstream node " NODE_FORMAT, localTimeline, primaryNode->nodeId, primaryNode->name, primaryNode->host, primaryNode->port); } return upstreamTimeline == localTimeline; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/primary_standby.h000066400000000000000000000110441414244367200241350ustar00rootroot00000000000000/* * src/bin/pg_autoctl/primary.h * Management functions that implement the keeper state machine transitions. 
* * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef LOCAL_POSTGRES_H #define LOCAL_POSTGRES_H #include #include "postgres_fe.h" #include "pgsql.h" #include "pgsetup.h" /* Communication device between node-active and postgres processes */ typedef struct LocalExpectedPostgresStatus { char pgStatusPath[MAXPGPATH]; KeeperStatePostgres state; } LocalExpectedPostgresStatus; /* * LocalPostgresServer represents a local postgres database cluster that * we can manage via a SQL connection and operations on the database * directory contained in the PostgresSetup. * * currentLSN value is kept as text for better portability. We do not * perform any operation on the value after it was read from database. */ typedef struct LocalPostgresServer { PGSQL sqlClient; PostgresSetup postgresSetup; ReplicationSource replicationSource; bool pgIsRunning; char pgsrSyncState[PGSR_SYNC_STATE_MAXLENGTH]; char currentLSN[PG_LSN_MAXLENGTH]; uint64_t pgFirstStartFailureTs; int pgStartRetries; PgInstanceKind pgKind; LocalExpectedPostgresStatus expectedPgStatus; char standbyTargetLSN[PG_LSN_MAXLENGTH]; char synchronousStandbyNames[BUFSIZE]; } LocalPostgresServer; void local_postgres_init(LocalPostgresServer *postgres, PostgresSetup *postgresSetup); bool local_postgres_set_status_path(LocalPostgresServer *postgres, bool unlink); bool local_postgres_unlink_status_file(LocalPostgresServer *postgres); void local_postgres_finish(LocalPostgresServer *postgres); bool local_postgres_update(LocalPostgresServer *postgres, bool postgresNotRunningIsOk); bool ensure_postgres_service_is_running(LocalPostgresServer *postgres); bool ensure_postgres_service_is_running_as_subprocess(LocalPostgresServer *postgres); bool ensure_postgres_service_is_stopped(LocalPostgresServer *postgres); bool primary_has_replica(LocalPostgresServer *postgres, char *userName, bool *hasStandby); bool upstream_has_replication_slot(ReplicationSource *upstream, PostgresSetup 
*pgSetup, bool *hasReplicationSlot); bool primary_create_replication_slot(LocalPostgresServer *postgres, char *replicationSlotName); bool primary_drop_replication_slot(LocalPostgresServer *postgres, char *replicationSlotName); bool primary_drop_replication_slots(LocalPostgresServer *postgres); bool primary_drop_all_replication_slots(LocalPostgresServer *postgres); bool primary_set_synchronous_standby_names(LocalPostgresServer *postgres); bool postgres_replication_slot_create_and_drop(LocalPostgresServer *postgres, NodeAddressArray *nodeArray); bool postgres_replication_slot_maintain(LocalPostgresServer *postgres, NodeAddressArray *nodeArray); bool primary_disable_synchronous_replication(LocalPostgresServer *postgres); bool postgres_add_default_settings(LocalPostgresServer *postgres, const char *hostname); bool primary_create_user_with_hba(LocalPostgresServer *postgres, char *userName, char *password, char *hostname, char *authMethod, HBAEditLevel hbaLevel, int connlimit); bool primary_create_replication_user(LocalPostgresServer *postgres, char *replicationUser, char *replicationPassword); bool standby_init_replication_source(LocalPostgresServer *postgres, NodeAddress *upstreamNode, const char *username, const char *password, const char *slotName, const char *maximumBackupRate, const char *backupDirectory, const char *targetLSN, SSLOptions sslOptions, int currentNodeId); bool standby_init_database(LocalPostgresServer *postgres, const char *hostname, bool skipBaseBackup); bool primary_rewind_to_standby(LocalPostgresServer *postgres); bool postgres_maybe_do_crash_recovery(LocalPostgresServer *postgres); bool standby_promote(LocalPostgresServer *postgres); bool check_postgresql_settings(LocalPostgresServer *postgres, bool *settings_are_ok); bool primary_standby_has_caught_up(LocalPostgresServer *postgres); bool standby_follow_new_primary(LocalPostgresServer *postgres); bool standby_fetch_missing_wal(LocalPostgresServer *postgres); bool 
standby_restart_with_current_replication_source(LocalPostgresServer *postgres); bool standby_cleanup_as_primary(LocalPostgresServer *postgres); bool standby_check_timeline_with_upstream(LocalPostgresServer *postgres); #endif /* LOCAL_POSTGRES_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/service_keeper.c000066400000000000000000000600141414244367200237150ustar00rootroot00000000000000/* * src/bin/pg_autoctl/service_keeper.c * The main loop of the pg_autoctl keeper * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include #include #include "cli_common.h" #include "cli_root.h" #include "defaults.h" #include "fsm.h" #include "keeper.h" #include "keeper_config.h" #include "keeper_pg_init.h" #include "log.h" #include "monitor.h" #include "pgctl.h" #include "pidfile.h" #include "service_keeper.h" #include "service_postgres_ctl.h" #include "signals.h" #include "state.h" #include "string_utils.h" #include "supervisor.h" #include "runprogram.h" static bool keepRunning = true; /* list of hooks to run at reload time */ KeeperReloadFunction KeeperReloadHooksArray[] = { &keeper_reload_configuration, NULL }; KeeperReloadFunction *KeeperReloadHooks = KeeperReloadHooksArray; /* list of hooks to run to update a list of nodes, at node active time */ KeeperNodesArrayRefreshFunction KeeperNodesArrayRefreshArray[] = { &keeper_refresh_hba, NULL }; KeeperNodesArrayRefreshFunction *KeeperRefreshHooks = KeeperNodesArrayRefreshArray; static bool service_keeper_node_active(Keeper *keeper, bool doInit); static void check_for_network_partitions(Keeper *keeper); static bool is_network_healthy(Keeper *keeper); static bool in_network_partition(KeeperStateData *keeperState, uint64_t now, int networkPartitionTimeout); /* * keeper_service_start starts the keeper processes: the node_active main loop * and depending on the current state the Postgres instance. 
*/ bool start_keeper(Keeper *keeper) { const char *pidfile = keeper->config.pathnames.pid; Service subprocesses[] = { { SERVICE_NAME_POSTGRES, RP_PERMANENT, -1, &service_postgres_ctl_start }, { SERVICE_NAME_KEEPER, RP_PERMANENT, -1, &service_keeper_start, (void *) keeper } }; int subprocessesCount = sizeof(subprocesses) / sizeof(subprocesses[0]); return supervisor_start(subprocesses, subprocessesCount, pidfile); } /* * keeper_start_node_active_process starts a sub-process that communicates with * the monitor to implement the node_active protocol. */ bool service_keeper_start(void *context, pid_t *pid) { Keeper *keeper = (Keeper *) context; /* Flush stdio channels just before fork, to avoid double-output problems */ fflush(stdout); fflush(stderr); /* time to create the node_active sub-process */ pid_t fpid = fork(); switch (fpid) { case -1: { log_error("Failed to fork the node-active process"); return false; } case 0: { /* here we call execv() so we never get back */ (void) service_keeper_runprogram(keeper); /* unexpected */ log_fatal("BUG: returned from service_keeper_runprogram()"); exit(EXIT_CODE_INTERNAL_ERROR); } default: { /* fork succeeded, in parent */ log_debug("pg_autoctl node-active process started in subprocess %d", fpid); *pid = fpid; return true; } } } /* * service_keeper_runprogram runs the node_active protocol service: * * $ pg_autoctl do service node-active --pgdata ... * * This function is intended to be called from the child process after a fork() * has been successfully done at the parent process level: it's calling * execve() and will never return. */ void service_keeper_runprogram(Keeper *keeper) { char *args[12]; int argsIndex = 0; char command[BUFSIZE]; /* * use --pgdata option rather than the config. * * On macOS when using /tmp, the file path is then redirected to being * /private/tmp when using realpath(2) as we do in normalize_filename(). 
So * for that case to be supported, we explicitely re-use whatever PGDATA or * --pgdata was parsed from the main command line to start our sub-process. */ char *pgdata = keeperOptions.pgSetup.pgdata; setenv(PG_AUTOCTL_DEBUG, "1", 1); args[argsIndex++] = (char *) pg_autoctl_program; args[argsIndex++] = "do"; args[argsIndex++] = "service"; args[argsIndex++] = "node-active"; args[argsIndex++] = "--pgdata"; args[argsIndex++] = pgdata; args[argsIndex++] = logLevelToString(log_get_level()); args[argsIndex] = NULL; /* we do not want to call setsid() when running this program. */ Program program = { 0 }; (void) initialize_program(&program, args, false); program.capture = false; /* redirect output, don't capture */ program.stdOutFd = STDOUT_FILENO; program.stdErrFd = STDERR_FILENO; /* log the exact command line we're using */ (void) snprintf_program_command_line(&program, command, BUFSIZE); log_info("%s", command); (void) execute_program(&program); } /* * service_keeper_node_active_init initializes the pg_autoctl service for the * node_active protocol. */ bool service_keeper_node_active_init(Keeper *keeper) { KeeperConfig *config = &(keeper->config); bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; bool monitorDisabledIsOk = true; if (!keeper_config_read_file(config, missingPgdataIsOk, pgIsNotRunningIsOk, monitorDisabledIsOk)) { /* errors have already been logged. */ exit(EXIT_CODE_BAD_CONFIG); } /* * Check that the init is finished. This function is called from * cli_service_run when used in the CLI `pg_autoctl run`, and the * function cli_service_run calls into keeper_init(): we know that we could * read a keeper state file. */ if (file_exists(config->pathnames.init)) { log_warn("The `pg_autoctl create` did not complete, completing now."); if (!keeper_pg_init_continue(keeper)) { /* errors have already been logged. 
*/ return false; } } if (!keeper_init(keeper, config)) { log_fatal("Failed to initialize keeper, see above for details"); exit(EXIT_CODE_PGCTL); } return true; } /* * keeper_node_active_loop implements the main loop of the keeper, which * periodically gets the goal state from the monitor and makes the state * transitions. */ bool keeper_node_active_loop(Keeper *keeper, pid_t start_pid) { Monitor *monitor = &(keeper->monitor); KeeperConfig *config = &(keeper->config); KeeperStateData *keeperState = &(keeper->state); LocalPostgresServer *postgres = &(keeper->postgres); bool doSleep = false; bool couldContactMonitor = false; bool firstLoop = true; bool doInit = true; bool warnedOnCurrentIteration = false; bool warnedOnPreviousIteration = false; bool nodeHasBeenDroppedFromTheMonitor = false; log_debug("pg_autoctl service is starting"); /* setup our monitor client connection with our notification handler */ (void) monitor_setup_notifications(monitor, keeperState->current_group, keeperState->current_node_id); /* * When pg_autoctl drop node is used from a distance, then this node * transitions to the DROPPED_STATE and shutdown cleanly. Now, if a dropped * node is restarted (by systemd, an interactive user, or another way) we * must realise the situation and refrain from entering our main loop. 
*/ if (!config->monitorDisabled) { bool dropped = false; if (!keeper_ensure_node_has_been_dropped(keeper, &dropped)) { /* errors have already been logged */ return false; } if (dropped) { /* signal that it's time to shutdown everything */ log_fatal("This node with id %lld in formation \"%s\" and group %d " "has been dropped from the monitor", (long long) keeperState->current_node_id, config->formation, config->groupId); log_info("To get rid of the configuration file and PGDATA directory, " "run pg_autoctl drop node --pgdata \"%s\" --destroy", config->pgSetup.pgdata); exit(EXIT_CODE_FATAL); } } while (keepRunning) { bool couldContactMonitorThisRound = false; bool needStateChange = false; bool transitionFailed = false; /* * If we're in a stable state (current state and goal state are the * same, and this didn't change in the previous loop), then we can * sleep for a while. As the monitor notifies every state change, we * can also interrupt our sleep as soon as we get the hint. */ if (doSleep && !config->monitorDisabled) { int timeoutMs = PG_AUTOCTL_KEEPER_SLEEP_TIME * 1000; bool groupStateHasChanged = false; /* establish a connection for notifications if none present */ (void) pgsql_prepare_to_wait(&(monitor->notificationClient)); (void) monitor_wait_for_state_change(monitor, config->formation, keeperState->current_group, keeperState->current_node_id, timeoutMs, &groupStateHasChanged); /* when no state change has been notified, close the connection */ if (!groupStateHasChanged && monitor->notificationClient.connectionStatementType == PGSQL_CONNECTION_MULTI_STATEMENT) { pgsql_finish(&(monitor->notificationClient)); } } else if (doSleep && config->monitorDisabled) { int timeoutUs = PG_AUTOCTL_KEEPER_SLEEP_TIME * 1000 * 1000; pg_usleep(timeoutUs); } doSleep = true; /* * Handle signals. * * When asked to STOP, we always finish the current transaction before * doing so, which means we only check if asked_to_stop at the * beginning of the loop. 
* * We have several places where it's safe to check if SIGQUIT has been * signaled to us and from where we can immediately exit whatever we're * doing. It's important to avoid e.g. leaving state.new files behind. */ if (asked_to_reload || firstLoop) { (void) keeper_call_reload_hooks(keeper, firstLoop, doInit); } if (asked_to_stop || asked_to_stop_fast || asked_to_quit) { break; } /* Check that we still own our PID file, or quit now */ (void) check_pidfile(config->pathnames.pid, start_pid); CHECK_FOR_FAST_SHUTDOWN; /* * Read the current state. While we could preserve the state in memory, * re-reading the file simplifies recovery from failures. For example, * if we fail to write the state file after making a transition, then * we should not tell the monitor that the transition succeeded, because * a subsequent crash of the keeper would cause the states to become * inconsistent. By re-reading the file, we make sure the state on disk * on the keeper is consistent with the state on the monitor * * Also, when --disable-monitor is used, then we get our assigned state * by reading the state file, which is edited by an external process. */ if (!keeper_load_state(keeper)) { log_error("Failed to read keeper state file, retrying..."); CHECK_FOR_FAST_SHUTDOWN; continue; } if (firstLoop) { log_info("pg_autoctl service is running, " "current state is \"%s\"", NodeStateToString(keeperState->current_role)); } /* * Check for any changes in the local PostgreSQL instance, and update * our in-memory values for the replication WAL lag and sync_state. */ if (!keeper_update_pg_state(keeper, LOG_WARN)) { warnedOnCurrentIteration = true; log_warn("Failed to update the keeper's state from the local " "PostgreSQL instance."); } else if (warnedOnPreviousIteration) { log_info("Updated the keeper's state from the local " "PostgreSQL instance, which is %s", postgres->pgIsRunning ? 
"running" : "not running"); } CHECK_FOR_FAST_SHUTDOWN; /* * If the monitor is disabled, read the list of other nodes from our * file on-disk at config->pathnames.nodes. The following command can * be used to fill-in that file: * * $ pg_autoctl do fsm nodes set nodes.json */ if (config->monitorDisabled) { /* force cache invalidation when reaching WAIT_STANDBY */ bool forceCacheInvalidation = keeperState->current_role == WAIT_STANDBY_STATE; /* maybe update our cached list of other nodes */ if (!keeper_refresh_other_nodes(keeper, forceCacheInvalidation)) { /* we will try again... */ log_warn("Failed to update our list of other nodes"); continue; } } /* * If the monitor is not disabled, call the node_active function on the * monitor and update the keeper data structure accordingy, refreshing * our cache of other nodes if needed. */ else { couldContactMonitorThisRound = service_keeper_node_active(keeper, doInit); if (!couldContactMonitor && couldContactMonitorThisRound && !firstLoop) { /* * Last message the user saw in the output is the following, * and so we should say that we're back to the expected * situation: * * Failed to get the goal state from the monitor */ log_info("Successfully got the goal state from the monitor"); } couldContactMonitor = couldContactMonitorThisRound; } if (keeperState->assigned_role != keeperState->current_role) { needStateChange = true; if (couldContactMonitor) { log_info("Monitor assigned new state \"%s\"", NodeStateToString(keeperState->assigned_role)); } else { /* if network is not healthy we might self-assign a state */ log_info("Reaching new state \"%s\"", NodeStateToString(keeperState->assigned_role)); } } CHECK_FOR_FAST_SHUTDOWN; /* * If we see that PostgreSQL is not running when we know it should be, * the least we can do is start PostgreSQL again. Same if PostgreSQL is * running and we are DEMOTED, or in another one of those states where * the monitor asked us to stop serving queries, in order to ensure * consistency. 
* * Only enfore current state when we have a recent enough version of * it, meaning that we could contact the monitor. * * We need to prevent the keeper from restarting PostgreSQL at boot * time when meanwhile the Monitor did set our goal_state to DEMOTED * because the other node has been promoted, which could happen if this * node was rebooting for a long enough time. */ if (needStateChange) { /* * First, ensure the current state (make sure Postgres is running * if it should, or Postgres is stopped if it should not run). * * The transition function we call next might depend on our * assumption that Postgres is running in the current state. */ if (keeper_should_ensure_current_state_before_transition(keeper)) { if (!keeper_ensure_current_state(keeper)) { /* * We don't take care of the warnedOnCurrentIteration here * because the real thing that should happen is the * transition to the next state. That's what we keep track * of with "transitionFailed". */ log_warn( "pg_autoctl failed to ensure current state \"%s\": " "PostgreSQL %s running", NodeStateToString(keeperState->current_role), postgres->pgIsRunning ? "is" : "is not"); } } if (!keeper_fsm_reach_assigned_state(keeper)) { log_error("Failed to transition to state \"%s\", retrying... ", NodeStateToString(keeperState->assigned_role)); transitionFailed = true; } } else if (couldContactMonitor || config->monitorDisabled) { if (!keeper_ensure_current_state(keeper)) { warnedOnCurrentIteration = true; log_warn("pg_autoctl failed to ensure current state \"%s\": " "PostgreSQL %s running", NodeStateToString(keeperState->current_role), postgres->pgIsRunning ? "is" : "is not"); } else if (warnedOnPreviousIteration) { log_info("pg_autoctl managed to ensure current state \"%s\": " "PostgreSQL %s running", NodeStateToString(keeperState->current_role), postgres->pgIsRunning ? 
"is" : "is not"); } } /* now is a good time to make sure we're closing our connections */ pgsql_finish(&(postgres->sqlClient)); CHECK_FOR_FAST_SHUTDOWN; /* * Write the current (changed) state to disk. * * When using a monitor, even if a transition failed, we still write * the state file to update timestamps used for the network partition * checks. * * When the monitor is disabled, only write the state to disk when we * just successfully implemented a state change. */ if (!config->monitorDisabled || (needStateChange && !transitionFailed)) { if (!keeper_store_state(keeper)) { transitionFailed = true; } } /* * If the node has been dropped, we exit the process... after having * done at least another round where we could contact the monitor to * report that we reached the assigned state. */ if ((couldContactMonitor || config->monitorDisabled) && keeperState->current_role == DROPPED_STATE && keeperState->current_role == keeperState->assigned_role) { if (nodeHasBeenDroppedFromTheMonitor) { keepRunning = false; } else { nodeHasBeenDroppedFromTheMonitor = true; } } if ((needStateChange || (!config->monitorDisabled && monitor_has_received_notifications(monitor))) && !transitionFailed) { /* cycle faster if we made a state transition */ doSleep = false; } if (asked_to_stop || asked_to_stop_fast) { keepRunning = false; } if (firstLoop) { firstLoop = false; } /* if we failed to contact the monitor, we must re-try the init steps */ if (doInit && couldContactMonitorThisRound) { doInit = false; } /* * On the first loop, we might have reload-time actions to implement * before and after having contacted the monitor. For instance, * contacting the monitor might show that we're not a primary anymore * after having been DEMOTED during a failover, while this node was * rebooting or something. 
* * So in some cases, we want to do two rounds of start-up reload: * * reload-hook(firstLoop => true, doInit => true) * reload-hook(firstLoop => true, doInit => false) * * Later SIGHUP signal processing will trigger a call to our reload * hooks with both firstLoop and doInit false, and that's handled * earlier in this loop. */ if (firstLoop) { (void) keeper_call_reload_hooks(keeper, firstLoop, doInit); } /* advance the warnings "counters" */ if (warnedOnPreviousIteration) { warnedOnPreviousIteration = false; } if (warnedOnCurrentIteration) { warnedOnPreviousIteration = true; warnedOnCurrentIteration = false; } } /* One last check that we do not have any connections open */ pgsql_finish(&(keeper->monitor.pgsql)); pgsql_finish(&(monitor->notificationClient)); if (nodeHasBeenDroppedFromTheMonitor) { /* signal that it's time to shutdown everything */ exit(EXIT_CODE_DROPPED); } return true; } /* * keeper_node_active calls the node_active function on the monitor, and when * it could contact the monitor it also updates our copy of the list of other * nodes currenty in the group (keeper->otherNodes). * * keeper_node_active returns true if it could successfully connect to the * monitor, and false otherwise. When it returns false, it also checks for * network partitions and set the goal state to DEMOTE_TIMEOUT_STATE when * needed. */ static bool service_keeper_node_active(Keeper *keeper, bool doInit) { KeeperConfig *config = &(keeper->config); KeeperStateData *keeperState = &(keeper->state); MonitorAssignedState assignedState = { 0 }; uint64_t now = time(NULL); /* * Report the current state to the monitor and get the assigned state. */ if (!keeper_node_active(keeper, doInit, &assignedState)) { log_error("Failed to get the goal state from the monitor"); /* * Check whether we're likely to be in a network partition. * That will cause the assigned_role to become demoted. 
*/ (void) check_for_network_partitions(keeper); return false; } /* * We could contact the monitor, update our internal state. */ keeperState->last_monitor_contact = now; keeperState->assigned_role = assignedState.state; if (keeperState->assigned_role != keeperState->current_role) { log_debug("keeper_node_active: %s ➜ %s", NodeStateToString(keeperState->current_role), NodeStateToString(keeperState->assigned_role)); } /* maybe update our cached list of other nodes */ if (keeperState->current_role == DROPPED_STATE && keeperState->current_role == keeperState->assigned_role) { return true; } bool forceCacheInvalidation = false; if (!keeper_refresh_other_nodes(keeper, forceCacheInvalidation)) { /* * We have a new MD5 but failed to update our list, try again next * round, the monitor might be restarting or something. */ log_error("Failed to update our list of other nodes"); return false; } /* * Also update the groupId and replication slot name in the * configuration file. */ char expectedSlotName[BUFSIZE] = { 0 }; (void) postgres_sprintf_replicationSlotName(assignedState.nodeId, expectedSlotName, sizeof(expectedSlotName)); if (assignedState.groupId != config->groupId || strneq(config->replication_slot_name, expectedSlotName)) { bool postgresNotRunningIsOk = false; if (!keeper_config_update(config, assignedState.nodeId, assignedState.groupId)) { log_error("Failed to update the configuration file " "with groupId %d and replication.slot \"%s\"", assignedState.groupId, expectedSlotName); return false; } if (!keeper_ensure_configuration(keeper, postgresNotRunningIsOk)) { log_error("Failed to update our Postgres configuration " "after a change of groupId or " "replication slot name, see above for details"); return false; } } return true; } /* * check_for_network_partitions checks whether we're likely to be in a network * partition. That will cause the assigned_role to become demoted. 
*/ static void check_for_network_partitions(Keeper *keeper) { KeeperStateData *keeperState = &(keeper->state); if (keeperState->current_role == PRIMARY_STATE) { log_warn("Checking for network partitions..."); if (!is_network_healthy(keeper)) { keeperState->assigned_role = DEMOTE_TIMEOUT_STATE; log_info("Network in not healthy, switching to state %s", NodeStateToString(keeperState->assigned_role)); } else { log_info("Network is healthy"); } } } /* * is_network_healthy returns false if the keeper appears to be in a * network partition, which it assumes to be the case if it cannot * communicate with neither the monitor, nor the secondary for at least * network_partition_timeout seconds. * * On the other side of the network partition, the monitor and the secondary * may proceed with a failover once the network partition timeout has passed, * since they are sure the primary is down at that point. */ static bool is_network_healthy(Keeper *keeper) { KeeperConfig *config = &(keeper->config); KeeperStateData *keeperState = &(keeper->state); LocalPostgresServer *postgres = &(keeper->postgres); int networkPartitionTimeout = config->network_partition_timeout; uint64_t now = time(NULL); bool hasReplica = false; if (keeperState->current_role != PRIMARY_STATE) { /* * Fail-over may only occur if we're currently the primary, so * we don't need to check for network partitions in other states. 
*/ return true; } if (primary_has_replica(postgres, PG_AUTOCTL_REPLICA_USERNAME, &hasReplica) && hasReplica) { keeperState->last_secondary_contact = now; log_warn("We lost the monitor, but still have a standby: " "we're not in a network partition, continuing."); return true; } if (!in_network_partition(keeperState, now, networkPartitionTimeout)) { /* still had recent contact with monitor and/or secondary */ return true; } log_info("Failed to contact the monitor or standby in %d seconds, " "at %d seconds we shut down PostgreSQL to prevent split brain issues", (int) (now - keeperState->last_monitor_contact), networkPartitionTimeout); return false; } /* * in_network_partition determines if we're in a network partition by applying * the configured network_partition_timeout to current known values. Updating * the state before calling this function is advised. */ static bool in_network_partition(KeeperStateData *keeperState, uint64_t now, int networkPartitionTimeout) { uint64_t monitor_contact_lag = (now - keeperState->last_monitor_contact); uint64_t secondary_contact_lag = (now - keeperState->last_secondary_contact); return keeperState->last_monitor_contact > 0 && keeperState->last_secondary_contact > 0 && networkPartitionTimeout < monitor_contact_lag && networkPartitionTimeout < secondary_contact_lag; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/service_keeper.h000066400000000000000000000011461414244367200237230ustar00rootroot00000000000000 /* * src/bin/pg_autoctl/keeper_service.h * Utilities to start the keeper services. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #ifndef KEEPER_SERVICE_H #define KEEPER_SERVICE_H #include #include "keeper.h" #include "keeper_config.h" bool start_keeper(Keeper *keeper); bool service_keeper_start(void *context, pid_t *pid); void service_keeper_runprogram(Keeper *keeper); bool service_keeper_node_active_init(Keeper *keeper); bool keeper_node_active_loop(Keeper *keeper, pid_t start_pid); #endif /* KEEPER_SERVICE_INIT_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/service_keeper_init.c000066400000000000000000000073141414244367200247440ustar00rootroot00000000000000/* * src/bin/pg_autoctl/service_keeper_init.c * Keeper initialisation service. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include #include #include "cli_common.h" #include "cli_root.h" #include "defaults.h" #include "fsm.h" #include "keeper.h" #include "keeper_config.h" #include "keeper_pg_init.h" #include "log.h" #include "monitor.h" #include "pgctl.h" #include "pidfile.h" #include "state.h" #include "service_keeper.h" #include "service_keeper_init.h" #include "service_postgres_ctl.h" #include "signals.h" #include "string_utils.h" #include "supervisor.h" /* * service_keeper_init defines and start services needed during the * keeper initialisation when doing `pg_autoctl create postgres`. We already * need to have our Postgres service supervisor sub-process started and ready * to start postgres when reaching initialization stage 2. */ bool service_keeper_init(Keeper *keeper) { const char *pidfile = keeper->config.pathnames.pid; Service subprocesses[] = { { SERVICE_NAME_POSTGRES, RP_PERMANENT, -1, &service_postgres_ctl_start, }, { SERVICE_NAME_KEEPER_INIT, createAndRun ? 
RP_PERMANENT : RP_TRANSIENT, -1, &service_keeper_init_start, (void *) keeper } }; int subprocessesCount = sizeof(subprocesses) / sizeof(subprocesses[0]); /* when using pg_autoctl create monitor --run, use "node-active" */ if (createAndRun) { strlcpy(subprocesses[1].name, SERVICE_NAME_KEEPER, NAMEDATALEN); } return supervisor_start(subprocesses, subprocessesCount, pidfile); } /* * service_keeper_init_start is a subprocess that runs the installation of the * pg_autoctl keeper and its Postgres service, including initdb or * pg_basebackup. */ bool service_keeper_init_start(void *context, pid_t *pid) { Keeper *keeper = (Keeper *) context; KeeperConfig *config = &(keeper->config); /* Flush stdio channels just before fork, to avoid double-output problems */ fflush(stdout); fflush(stderr); /* time to create the node_active sub-process */ pid_t fpid = fork(); switch (fpid) { case -1: { log_error("Failed to fork the keeper init process"); return false; } case 0: { /* * We are in a sub-process and didn't call exec() on our pg_autoctl * do service listener program yet we do not want to clean-up the * semaphore just yet. Publish that we are a sub-process and only * then quit, avoiding to call the atexit() semaphore clean-up * function. */ IntString semIdString = intToString(log_semaphore.semId); const char *serviceName = createAndRun ? 
"pg_autoctl: node active" : "pg_autoctl: node installer"; (void) set_ps_title(serviceName); setenv(PG_AUTOCTL_LOG_SEMAPHORE, semIdString.strValue, 1); if (!keeper_pg_init_and_register(keeper)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } if (keeperInitWarnings) { log_info("Keeper has been successfully initialized, " "please fix above warnings to complete installation."); exit(EXIT_CODE_QUIT); } log_info("%s has been successfully initialized.", config->role); if (createAndRun) { /* here we call execv() so we never get back */ (void) service_keeper_runprogram(keeper); /* unexpected */ log_fatal("BUG: returned from service_keeper_runprogram()"); exit(EXIT_CODE_INTERNAL_ERROR); } else { exit(EXIT_CODE_QUIT); } } default: { /* fork succeeded, in parent */ log_debug("pg_autoctl node installer process started in subprocess %d", fpid); *pid = fpid; return true; } } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/service_keeper_init.h000066400000000000000000000007401414244367200247450ustar00rootroot00000000000000 /* * src/bin/pg_autoctl/keeper_service_init.h * Utilities to start the keeper init services. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef KEEPER_SERVICE_INIT_H #define KEEPER_SERVICE_INIT_H #include #include "pgsql.h" #include "keeper_config.h" bool service_keeper_init(Keeper *keeper); bool service_keeper_init_start(void *context, pid_t *pid); #endif /* KEEPER_SERVICE_INIT_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/service_monitor.c000066400000000000000000000245631414244367200241420ustar00rootroot00000000000000/* * src/bin/pg_autoctl/monitor_service.c * Utilities to start/stop the pg_autoctl service on a monitor node. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include "cli_common.h" #include "cli_root.h" #include "defaults.h" #include "log.h" #include "monitor.h" #include "monitor_config.h" #include "monitor_pg_init.h" #include "pidfile.h" #include "service_monitor.h" #include "service_postgres_ctl.h" #include "signals.h" #include "string_utils.h" #include "supervisor.h" #include "runprogram.h" static void reload_configuration(Monitor *monitor); static bool monitor_ensure_configuration(Monitor *monitor); /* * monitor_service_start starts the monitor processes: the Postgres instance * and the user-facing LISTEN client that displays notifications. */ bool start_monitor(Monitor *monitor) { MonitorConfig *config = &(monitor->config); PostgresSetup *pgSetup = &(config->pgSetup); LocalPostgresServer postgres = { 0 }; Service subprocesses[] = { { SERVICE_NAME_POSTGRES, RP_PERMANENT, -1, &service_postgres_ctl_start }, { SERVICE_NAME_MONITOR, RP_PERMANENT, -1, &service_monitor_start, (void *) monitor } }; int subprocessesCount = sizeof(subprocesses) / sizeof(subprocesses[0]); /* initialize our local Postgres instance representation */ (void) local_postgres_init(&postgres, pgSetup); return supervisor_start(subprocesses, subprocessesCount, config->pathnames.pid); } /* * service_monitor_start starts a sub-process that listens to the monitor * notifications and outputs them for the user. 
*/ bool service_monitor_start(void *context, pid_t *pid) { Monitor *monitor = (Monitor *) context; /* Flush stdio channels just before fork, to avoid double-output problems */ fflush(stdout); fflush(stderr); /* time to create the node_active sub-process */ pid_t fpid = fork(); switch (fpid) { case -1: { log_error("Failed to fork the monitor listener process"); return false; } case 0: { /* here we call execv() so we never get back */ (void) service_monitor_runprogram(monitor); /* unexpected */ log_fatal("BUG: returned from service_monitor_runprogram()"); exit(EXIT_CODE_INTERNAL_ERROR); } default: { /* fork succeeded, in parent */ log_debug("pg_autoctl listen process started in subprocess %d", fpid); *pid = fpid; return true; } } } /* * service_monitor_runprogram runs the node_active protocol service: * * $ pg_autoctl do service monitor --pgdata ... * * This function is intended to be called from the child process after a fork() * has been successfully done at the parent process level: it's calling * execve() and will never return. */ void service_monitor_runprogram(Monitor *monitor) { char *args[12]; int argsIndex = 0; char command[BUFSIZE]; /* * use --pgdata option rather than the config. * * On macOS when using /tmp, the file path is then redirected to being * /private/tmp when using realpath(2) as we do in normalize_filename(). So * for that case to be supported, we explicitely re-use whatever PGDATA or * --pgdata was parsed from the main command line to start our sub-process. * * The pg_autoctl monitor listener can get started from one of the * following top-level commands: * * - pg_autoctl create monitor --run * - pg_autoctl run * * The monitor specific commands set monitorOptions, the generic command * set keeperOptions. */ char *pgdata = IS_EMPTY_STRING_BUFFER(monitorOptions.pgSetup.pgdata) ? 
keeperOptions.pgSetup.pgdata : monitorOptions.pgSetup.pgdata; setenv(PG_AUTOCTL_DEBUG, "1", 1); args[argsIndex++] = (char *) pg_autoctl_program; args[argsIndex++] = "do"; args[argsIndex++] = "service"; args[argsIndex++] = "listener"; args[argsIndex++] = "--pgdata"; args[argsIndex++] = pgdata; args[argsIndex++] = logLevelToString(log_get_level()); args[argsIndex] = NULL; /* we do not want to call setsid() when running this program. */ Program program = { 0 }; (void) initialize_program(&program, args, false); program.capture = false; /* redirect output, don't capture */ program.stdOutFd = STDOUT_FILENO; program.stdErrFd = STDERR_FILENO; /* log the exact command line we're using */ (void) snprintf_program_command_line(&program, command, BUFSIZE); log_info("%s", command); (void) execute_program(&program); } /* * monitor_service_run watches over monitor process, restarts if it is * necessary, also loops over a LISTEN command that is notified at every change * of state on the monitor, and prints the change on stdout. */ bool monitor_service_run(Monitor *monitor) { MonitorConfig *mconfig = &(monitor->config); char postgresUri[MAXCONNINFO] = { 0 }; bool loggedAboutListening = false; bool firstLoop = true; LocalPostgresServer postgres = { 0 }; /* Initialize our local connection to the monitor */ if (!monitor_local_init(monitor)) { /* errors have already been logged */ exit(EXIT_CODE_MONITOR); } /* Now get the Monitor URI to display it to the user, and move along */ if (monitor_config_get_postgres_uri(mconfig, postgresUri, MAXCONNINFO)) { log_info("Managing the monitor at %s", postgresUri); } (void) local_postgres_init(&postgres, &(monitor->config.pgSetup)); /* * Main loop for notifications. 
*/ for (;; firstLoop = false) { bool pgIsNotRunningIsOk = true; PostgresSetup *pgSetup = &(postgres.postgresSetup); if (asked_to_reload || firstLoop) { (void) reload_configuration(monitor); } if (asked_to_stop || asked_to_stop_fast) { log_info("Listener service received signal %s, terminating", signal_to_string(get_current_signal(SIGTERM))); break; } /* * On the first loop we don't expect Postgres to be running, and on * following loops it should be all fine. That said, at any point in * time, if Postgres is not running now is a good time to make sure * it's running. * * Also, whenever Postgres has been restarted, we should check about * the version in the shared object library and maybe upgrade the * extension SQL definitions to match. */ if (firstLoop || !pg_setup_is_ready(pgSetup, pgIsNotRunningIsOk)) { MonitorExtensionVersion version = { 0 }; if (!ensure_postgres_service_is_running_as_subprocess(&postgres)) { log_error("Failed to ensure Postgres is running " "as a pg_autoctl subprocess, " "see above for details."); return false; } /* Check version compatibility. */ if (!monitor_ensure_extension_version(monitor, &postgres, &version)) { /* maybe we failed to connect to the monitor */ if (monitor->pgsql.status != PG_CONNECTION_OK) { /* leave some time to the monitor before we try again */ sleep(PG_AUTOCTL_MONITOR_RETRY_TIME); continue; } /* or maybe we failed to update the extension altogether */ return false; } } if (!loggedAboutListening) { log_info("Contacting the monitor to LISTEN to its events."); loggedAboutListening = true; } if (!monitor_get_notifications(monitor, /* we want the time in milliseconds */ PG_AUTOCTL_MONITOR_SLEEP_TIME * 1000)) { log_warn("Re-establishing connection. 
We might miss notifications."); pgsql_finish(&(monitor->pgsql)); pgsql_finish(&(monitor->notificationClient)); continue; } } pgsql_finish(&(monitor->pgsql)); pgsql_finish(&(monitor->notificationClient)); return true; } /* * reload_configuration reads the supposedly new configuration file and * integrates accepted new values into the current setup. */ static void reload_configuration(Monitor *monitor) { MonitorConfig *config = &(monitor->config); if (file_exists(config->pathnames.config)) { MonitorConfig newConfig = { 0 }; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; /* * Set the same configuration and state file as the current config. */ strlcpy(newConfig.pathnames.config, config->pathnames.config, MAXPGPATH); if (monitor_config_read_file(&newConfig, missingPgdataIsOk, pgIsNotRunningIsOk) && monitor_config_accept_new(config, &newConfig)) { log_info("Reloaded the new configuration from \"%s\"", config->pathnames.config); /* * The new configuration might impact the Postgres setup, such as * when changing the SSL file paths. */ if (!monitor_ensure_configuration(monitor)) { log_warn("Failed to reload pg_autoctl configuration, " "see above for details"); } } else { log_warn("Failed to read configuration file \"%s\", " "continuing with the same configuration.", config->pathnames.config); } } else { log_warn("Configuration file \"%s\" does not exist, " "continuing with the same configuration.", config->pathnames.config); } /* we're done reloading now. */ asked_to_reload = 0; } /* * monitor_ensure_configuration updates the Postgres settings to match the * pg_autoctl configuration file, if necessary. 
*/ static bool monitor_ensure_configuration(Monitor *monitor) { MonitorConfig *config = &(monitor->config); PostgresSetup *pgSetup = &(config->pgSetup); LocalPostgresServer postgres = { 0 }; PostgresSetup *pgSetupReload = &(postgres.postgresSetup); bool missingPgdataIsOk = false; bool pgIsNotRunningIsOk = true; if (!monitor_add_postgres_default_settings(monitor)) { log_error("Failed to initialize our Postgres settings, " "see above for details"); return false; } if (!pg_setup_init(pgSetupReload, pgSetup, missingPgdataIsOk, pgIsNotRunningIsOk)) { log_fatal("Failed to initialize a monitor node, see above for details"); return false; } /* * To reload Postgres config, we need to connect as the local system user, * otherwise using the autoctl_node user does not provide us with enough * privileges. */ strlcpy(pgSetupReload->username, "", NAMEDATALEN); strlcpy(pgSetupReload->dbname, "template1", NAMEDATALEN); local_postgres_init(&postgres, pgSetupReload); if (pg_setup_is_ready(&(postgres.postgresSetup), pgIsNotRunningIsOk)) { if (!pgsql_reload_conf(&(postgres.sqlClient))) { log_warn("Failed to reload Postgres configuration after " "reloading pg_autoctl configuration, " "see above for details"); return false; } pgsql_finish(&(postgres.sqlClient)); } return true; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/service_monitor.h000066400000000000000000000012211414244367200241310ustar00rootroot00000000000000/* * src/bin/pg_autoctl/monitor_service.h * Utilities to start/stop the pg_autoctl service on a monitor node. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #ifndef MONITOR_SERVICE_H #define MONITOR_SERVICE_H #include #include "pgsql.h" #include "monitor_config.h" bool start_monitor(Monitor *monitor); bool service_monitor_start(void *context, pid_t *pid); bool service_monitor_stop(void *context); bool monitor_service_run(Monitor *monitor); void service_monitor_runprogram(Monitor *monitor); void service_monitor_reload(void *context); #endif /* MONITOR_SERVICE_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/service_monitor_init.c000066400000000000000000000076601414244367200251640ustar00rootroot00000000000000/* * src/bin/pg_autoctl/service_monitor_init.c * Monitor initialisation service. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include #include #include "cli_common.h" #include "cli_root.h" #include "defaults.h" #include "fsm.h" #include "log.h" #include "monitor.h" #include "monitor_pg_init.h" #include "pgctl.h" #include "pidfile.h" #include "state.h" #include "service_monitor.h" #include "service_monitor_init.h" #include "service_postgres_ctl.h" #include "signals.h" #include "string_utils.h" #include "supervisor.h" static bool service_monitor_init_start(void *context, pid_t *pid); /* * monitor_pg_finish_init starts the Postgres instance that we need running to * finish our installation, and finished the installation of the pgautofailover * monitor extension in the Postgres instance. */ bool service_monitor_init(Monitor *monitor) { MonitorConfig *config = &monitor->config; PostgresSetup *pgSetup = &config->pgSetup; LocalPostgresServer postgres = { 0 }; Service subprocesses[] = { { SERVICE_NAME_POSTGRES, RP_PERMANENT, -1, &service_postgres_ctl_start }, { SERVICE_NAME_MONITOR_INIT, createAndRun ? 
RP_PERMANENT : RP_TRANSIENT, -1, &service_monitor_init_start, (void *) monitor } }; int subprocessesCount = sizeof(subprocesses) / sizeof(subprocesses[0]); /* when using pg_autoctl create monitor --run, use "listener" */ if (createAndRun) { strlcpy(subprocesses[1].name, SERVICE_NAME_MONITOR, NAMEDATALEN); } /* We didn't create our target username/dbname yet */ strlcpy(pgSetup->username, "", NAMEDATALEN); strlcpy(pgSetup->dbname, "", NAMEDATALEN); /* initialize our local Postgres instance representation */ (void) local_postgres_init(&postgres, pgSetup); if (!supervisor_start(subprocesses, subprocessesCount, config->pathnames.pid)) { /* errors have already been logged */ return false; } /* we only get there when the supervisor exited successfully (SIGTERM) */ return true; } /* * service_monitor_init_start is a subprocess that finishes the installation of * the monitor extension for pgautofailover. */ static bool service_monitor_init_start(void *context, pid_t *pid) { Monitor *monitor = (Monitor *) context; MonitorConfig *config = &monitor->config; PostgresSetup *pgSetup = &config->pgSetup; /* Flush stdio channels just before fork, to avoid double-output problems */ fflush(stdout); fflush(stderr); /* time to create the node_active sub-process */ pid_t fpid = fork(); switch (fpid) { case -1: { log_error("Failed to fork the monitor install process"); return false; } case 0: { /* * We are in a sub-process and didn't call exec() on our pg_autoctl * do service listener program yet we do not want to clean-up the * semaphore just yet. Publish that we are a sub-process and only * then quit, avoiding to call the atexit() semaphore clean-up * function. */ const char *serviceName = createAndRun ? 
"pg_autoctl: monitor listener" : "pg_autoctl: monitor installer"; (void) set_ps_title(serviceName); /* finish the install if necessary */ if (!monitor_install(config->hostname, *pgSetup, false)) { /* errors have already been logged */ exit(EXIT_CODE_INTERNAL_ERROR); } log_info("Monitor has been successfully initialized."); if (createAndRun) { /* here we call execv() so we never get back */ (void) service_monitor_runprogram(monitor); /* unexpected */ log_fatal("BUG: returned from service_monitor_runprogram()"); exit(EXIT_CODE_INTERNAL_ERROR); } else { exit(EXIT_CODE_QUIT); } } default: { /* fork succeeded, in parent */ log_debug("pg_autoctl installer process started in subprocess %d", fpid); *pid = fpid; return true; } } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/service_monitor_init.h000066400000000000000000000006541414244367200251650ustar00rootroot00000000000000/* * src/bin/pg_autoctl/monitor_service_init.h * Utilities to start the monitor init services. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef MONITOR_SERVICE_INIT_H #define MONITOR_SERVICE_INIT_H #include #include "pgsql.h" #include "monitor_config.h" bool service_monitor_init(Monitor *monitor); #endif /* MONITOR_SERVICE_INIT_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/service_postgres.c000066400000000000000000000067631414244367200243230ustar00rootroot00000000000000/* * src/bin/pg_autoctl/postgres_service.c * Utilities to start/stop the pg_autoctl service. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include "cli_common.h" #include "cli_root.h" #include "defaults.h" #include "log.h" #include "monitor.h" #include "monitor_config.h" #include "pgsetup.h" #include "pidfile.h" #include "primary_standby.h" #include "service_postgres.h" #include "signals.h" #include "supervisor.h" #include "state.h" #include "string_utils.h" #include "runprogram.h" int countPostgresStart = 0; /* * service_postgres_start starts "postgres" in a sub-process. Rather than using * pg_ctl start, which forks off a deamon, we want to control the sub-process * and maintain it as a process child of pg_autoctl. */ bool service_postgres_start(void *context, pid_t *pid) { PostgresSetup *pgSetup = (PostgresSetup *) context; /* Flush stdio channels just before fork, to avoid double-output problems */ fflush(stdout); fflush(stderr); /* time to create the node_active sub-process */ pid_t fpid = fork(); switch (fpid) { case -1: { log_error("Failed to fork the postgres supervisor process"); return false; } case 0: { (void) set_ps_title("postgres"); log_trace("service_postgres_start: EXEC postgres"); bool listen = true; /* execv() the postgres binary directly, as a sub-process */ (void) pg_ctl_postgres(pgSetup->pg_ctl, pgSetup->pgdata, pgSetup->pgport, pgSetup->listen_addresses, listen); /* unexpected */ log_fatal("BUG: returned from service_keeper_runprogram()"); exit(EXIT_CODE_INTERNAL_ERROR); } default: { int timeout = 10; /* wait for Postgres for 10s */ int logLevel = ++countPostgresStart == 1 ? LOG_INFO : LOG_DEBUG; log_debug("pg_autoctl started postgres in subprocess %d", fpid); *pid = fpid; /* we're starting postgres, reset the cached value for the pid */ pgSetup->pidFile.pid = 0; bool pgIsReady = pg_setup_wait_until_is_ready(pgSetup, timeout, logLevel); /* * If Postgres failed to start the least we can do is log the * "startup.log" file prominently to the user now. 
*/ if (!pgIsReady) { (void) pg_log_startup(pgSetup->pgdata, LOG_ERROR); } else if (log_get_level() <= LOG_DEBUG) { /* * If postgres started successfully we only log startup * messages in DEBUG or TRACE loglevel. Otherwise we get might * see this confusing error, but harmless error message: * ERROR: database "postgres" already exists */ (void) pg_log_startup(pgSetup->pgdata, LOG_DEBUG); } return pgIsReady; } } } /* * service_postgres_stop stops the postgres service, using pg_ctl stop. */ bool service_postgres_stop(Service *service) { PostgresSetup *pgSetup = (PostgresSetup *) service->context; log_info("Stopping pg_autoctl postgres service"); if (!pg_ctl_stop(pgSetup->pg_ctl, pgSetup->pgdata)) { log_error("Failed to stop Postgres, see above for details"); return false; } /* cache invalidation */ service->pid = 0; return true; } /* * service_postgres_reload signal Postgres with a SIGHUP */ void service_postgres_reload(Service *service) { log_info("Reloading pg_autoctl postgres service [%d]", service->pid); if (kill(service->pid, SIGHUP) != 0) { log_error("Failed to send SIGHUP to Postgres pid %d: %m", service->pid); } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/service_postgres.h000066400000000000000000000011451414244367200243150ustar00rootroot00000000000000/* * src/bin/pg_autoctl/service_postgres.h * Utilities to start/stop the pg_autoctl service on a monitor node. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #ifndef SERVICE_POSTGRES_H #define SERVICE_POSTGRES_H #include #include #include "keeper.h" #include "keeper_config.h" #include "supervisor.h" extern int countPostgresStart; bool service_postgres_start(void *context, pid_t *pid); bool service_postgres_stop(Service *service); void service_postgres_reload(Service *service); #endif /* SERVICE_POSTGRES_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/service_postgres_ctl.c000066400000000000000000000321241414244367200251530ustar00rootroot00000000000000/* * src/bin/pg_autoctl/postgres_service_ctl.c * Utilities to start/stop the pg_autoctl service. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include #include "cli_common.h" #include "cli_root.h" #include "defaults.h" #include "log.h" #include "monitor.h" #include "monitor_config.h" #include "pgsetup.h" #include "pidfile.h" #include "primary_standby.h" #include "service_postgres.h" #include "service_postgres_ctl.h" #include "signals.h" #include "supervisor.h" #include "state.h" #include "string_utils.h" #include "runprogram.h" static bool shutdownSequenceInProgress = false; static bool ensure_postgres_status(LocalPostgresServer *postgres, Service *service); static bool ensure_postgres_status_stopped(LocalPostgresServer *postgres, Service *service); static bool ensure_postgres_status_running(LocalPostgresServer *postgres, Service *service, bool ensurePostgresSubprocess); /* * service_postgres_ctl_start starts a subprocess that implements the postgres * service depending on the current assigned and goal state of the keeper. 
*/ bool service_postgres_ctl_start(void *context, pid_t *pid) { /* Flush stdio channels just before fork, to avoid double-output problems */ fflush(stdout); fflush(stderr); /* time to create the node_active sub-process */ pid_t fpid = fork(); switch (fpid) { case -1: { log_error("Failed to fork the postgres controller process"); return false; } case 0: { /* here we call execv() so we never get back */ (void) service_postgres_ctl_runprogram(); /* unexpected */ log_fatal("BUG: returned from service_keeper_runprogram()"); exit(EXIT_CODE_INTERNAL_ERROR); } default: { log_debug( "pg_autoctl started postgres controller in subprocess %d", fpid); *pid = fpid; return true; } } } /* * service_postgres_ctl_runprogram runs the postgres controller service: * * $ pg_autoctl do service postgres --pgdata ... */ void service_postgres_ctl_runprogram() { char *args[12]; int argsIndex = 0; char command[BUFSIZE]; /* * use --pgdata option rather than the config. * * On macOS when using /tmp, the file path is then redirected to being * /private/tmp when using realpath(2) as we do in normalize_filename(). So * for that case to be supported, we explicitely re-use whatever PGDATA or * --pgdata was parsed from the main command line to start our sub-process. * * The pg_autoctl postgres controller is used both in the monitor context * and in the keeper context; which means it gets started from one of the * following top-level commands: * * - pg_autoctl create monitor * - pg_autoctl create postgres * - pg_autoctl run * * The monitor specific commands set monitorOptions, the generic and keeper * specific commands set keeperOptions. */ char *pgdata = IS_EMPTY_STRING_BUFFER(monitorOptions.pgSetup.pgdata) ? 
keeperOptions.pgSetup.pgdata : monitorOptions.pgSetup.pgdata; setenv(PG_AUTOCTL_DEBUG, "1", 1); args[argsIndex++] = (char *) pg_autoctl_program; args[argsIndex++] = "do"; args[argsIndex++] = "service"; args[argsIndex++] = "postgres"; args[argsIndex++] = "--pgdata"; args[argsIndex++] = pgdata; args[argsIndex++] = logLevelToString(log_get_level()); args[argsIndex] = NULL; /* we do not want to call setsid() when running this program. */ Program program = { 0 }; (void) initialize_program(&program, args, false); program.capture = false; /* redirect output, don't capture */ program.stdOutFd = STDOUT_FILENO; program.stdErrFd = STDERR_FILENO; /* log the exact command line we're using */ (void) snprintf_program_command_line(&program, command, BUFSIZE); log_info("%s", command); (void) execute_program(&program); } /* * service_postgres_ctl_loop loops over the current CTL state file and ensure * that Postgres is running when that's expected, or that Postgres is not * running when in a state where we should maintain Postgres down to avoid * split-brain situations. */ void service_postgres_ctl_loop(LocalPostgresServer *postgres) { PostgresSetup *pgSetup = &(postgres->postgresSetup); LocalExpectedPostgresStatus *localStatus = &(postgres->expectedPgStatus); KeeperStatePostgres *pgStatus = &(localStatus->state); /* * We re-use a service definition because that's handy for our code here, * but we implement our own policy for handling the service: the keeper * process might want Postgres to not be running at times, to avoid * split-brain situations. 
*/ Service postgresService = { SERVICE_NAME_POSTGRES, RP_PERMANENT, /* actually micro-managed in this loop */ -1, &service_postgres_start, (void *) pgSetup }; bool pgStatusPathIsReady = false; /* make sure to initialize the expected Postgres status to unknown */ pgStatus->pgExpectedStatus = PG_EXPECTED_STATUS_UNKNOWN; for (;;) { int status; /* we might have to reload, pass the signal down */ if (asked_to_reload) { (void) service_postgres_reload((void *) &postgresService); asked_to_reload = 0; } /* that's expected the shutdown sequence from the supervisor */ if (asked_to_stop || asked_to_stop_fast || asked_to_quit) { if (!shutdownSequenceInProgress) { shutdownSequenceInProgress = true; log_info("Postgres controller service received signal %s, " "terminating", signal_to_string(get_current_signal(SIGTERM))); } if (!ensure_postgres_status_stopped(postgres, &postgresService)) { log_error("Failed to stop Postgres, see above for details"); pg_usleep(100 * 1000); /* 100ms */ continue; } exit(EXIT_CODE_QUIT); } /* * This postgres controller process is running Postgres as a child * process and thus is responsible for calling waitpid() from time to * time. */ pid_t pid = waitpid(-1, &status, WNOHANG); switch (pid) { case -1: { /* if our PostgresService stopped, just continue */ if (errno != ECHILD) { log_error("Failed to call waitpid(): %m"); } break; } case 0: { /* * We're using WNOHANG, 0 means there are no stopped or exited * children, it's all good. It's the expected case when * everything is running smoothly, so enjoy and sleep for * awhile. */ break; } default: { if (pid != postgresService.pid) { /* might be one of our pg_controldata... */ char *verb = WIFEXITED(status) ? "exited" : "failed"; log_debug("waitpid(): process %d has %s", pid, verb); } /* * Postgres is not running anymore, the rest of the code will * handle that situation, just continue. 
*/ break; } } if (pg_setup_pgdata_exists(pgSetup)) { /* * If we have a PGDATA directory, now is a good time to initialize * our LocalPostgresServer structure and its file paths to point at * the right place: we need to normalize PGDATA to its realpath * location. */ if (!pgStatusPathIsReady) { /* initialize our Postgres state file path */ if (!local_postgres_set_status_path(postgres, false)) { /* highly unexpected */ log_error("Failed to build postgres state file pathname, " "see above for details."); /* maybe next round will have better luck? */ pg_usleep(100 * 1000); /* 100ms */ continue; } pgStatusPathIsReady = true; log_trace("Reading current postgres expected status from \"%s\"", localStatus->pgStatusPath); } } else if (!pgStatusPathIsReady) { /* * If PGDATA doesn't exists yet, we didn't have a chance to * normalize its filename and we might be reading the wrong file * for the Postgres expected status. So we first check if our * pgSetup reflects an existing on-disk instance and if not, update * it until it does. * * The keeper init process is reponsible for running pg_ctl initdb. * * Given that we have two processes working concurrently and * deciding at the same time what's next, we need to be cautious * about race conditions. We add extra checks around existence of * files to make sure we don't get started too early. */ PostgresSetup newPgSetup = { 0 }; bool missingPgdataIsOk = true; bool postgresNotRunningIsOk = true; if (pg_setup_init(&newPgSetup, pgSetup, missingPgdataIsOk, postgresNotRunningIsOk) && pg_setup_pgdata_exists(&newPgSetup) && pg_auto_failover_default_settings_file_exists(&newPgSetup)) { *pgSetup = newPgSetup; } pg_usleep(100 * 1000); /* 100ms */ continue; } /* * Maintain a Postgres service as a sub-process. * * Depending on the current state of the keeper, we need to either * ensure that Postgres is running, or that it is NOT running. 
To avoid * split-brain situations, we need to ensure Postgres is not running in * the DEMOTED state, for instance. * * Adding to that, during the `pg_autoctl create postgres` phase we * also need to start Postgres and sometimes even restart it. */ if (pgStatusPathIsReady && file_exists(localStatus->pgStatusPath)) { const char *filename = localStatus->pgStatusPath; if (!keeper_postgres_state_read(pgStatus, filename)) { /* errors have already been logged, will try again */ pg_usleep(100 * 1000); /* 100ms */ continue; } log_trace("service_postgres_ctl_loop: %s in %s", ExpectedPostgresStatusToString(pgStatus->pgExpectedStatus), filename); if (!ensure_postgres_status(postgres, &postgresService)) { pgStatusPathIsReady = false; } } pg_usleep(100 * 1000); /* 100ms */ } } /* * ensure_postgres_status ensures that the current keeper's state is met with * the current PostgreSQL status, at minimum that PostgreSQL is running when * it's expected to be, etc. * * The Postgres controller process (the code in this file) takes orders from * another process, either the monitor "listener" or the keeper "node active" * process. The orders are sent through a shared file containing the expected * status of the Postgres service. * * This process only reads the file, and the "other" process is responsible for * writing it: deleting a stale version of it at startup, creating it, updating * it. 
*/ static bool ensure_postgres_status(LocalPostgresServer *postgres, Service *service) { KeeperStatePostgres *pgStatus = &(postgres->expectedPgStatus.state); log_trace("ensure_postgres_status: %s", ExpectedPostgresStatusToString(pgStatus->pgExpectedStatus)); switch (pgStatus->pgExpectedStatus) { case PG_EXPECTED_STATUS_UNKNOWN: { /* do nothing */ return true; } case PG_EXPECTED_STATUS_STOPPED: { return ensure_postgres_status_stopped(postgres, service); } case PG_EXPECTED_STATUS_RUNNING: { return ensure_postgres_status_running(postgres, service, false); } case PG_EXPECTED_STATUS_RUNNING_AS_SUBPROCESS: { return ensure_postgres_status_running(postgres, service, true); } } /* make compiler happy */ return false; } /* * ensure_postgres_status_stopped ensures that Postgres is stopped. */ static bool ensure_postgres_status_stopped(LocalPostgresServer *postgres, Service *service) { PostgresSetup *pgSetup = &(postgres->postgresSetup); bool pgIsNotRunningIsOk = true; bool pgIsRunning = pg_setup_is_ready(pgSetup, pgIsNotRunningIsOk); if (pgIsRunning) { /* service_postgres_stop() logs about stopping Postgres */ log_debug("pg_autoctl: stop postgres (pid %d)", service->pid); return service_postgres_stop(service); } return true; } /* * ensure_postgres_status_running ensures that Postgres is running. */ static bool ensure_postgres_status_running(LocalPostgresServer *postgres, Service *service, bool ensurePostgresSubprocess) { PostgresSetup *pgSetup = &(postgres->postgresSetup); /* we might still be starting-up */ bool pgIsNotRunningIsOk = true; bool pgIsRunning = pg_setup_is_ready(pgSetup, pgIsNotRunningIsOk); bool restartPostgres = false; log_trace("ensure_postgres_status_running: %s", pgIsRunning ? 
"running" : "not running"); if (pgIsRunning) { if (ensurePostgresSubprocess && pgSetup->pidFile.pid != service->pid) { restartPostgres = true; log_warn("Postgres is already running with pid %d, " "which is not a sub-process of pg_autoctl, " "restarting Postgres", pgSetup->pidFile.pid); if (!service_postgres_stop(service)) { log_fatal("Failed to stop Postgres pid %d, " "see above for details", pgSetup->pidFile.pid); return false; } } else { return true; } } if (service_postgres_start(service->context, &(service->pid))) { if (countPostgresStart > 1) { log_warn("PostgreSQL was not running, restarted with pid %d", pgSetup->pidFile.pid); } if (restartPostgres) { log_warn("PostgreSQL had to be stopped and restarted, " "it is now running as a subprocess of pg_autoctl, " "with pid %d", pgSetup->pidFile.pid); } return true; } else { log_warn("Failed to start Postgres instance at \"%s\"", pgSetup->pgdata); return false; } return true; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/service_postgres_ctl.h000066400000000000000000000011151414244367200251540ustar00rootroot00000000000000/* * src/bin/pg_autoctl/service_postgres_ctl.h * Utilities to start/stop the pg_autoctl service on a keeper node. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef SERVICE_POSTGRES_CTL_H #define SERVICE_POSTGRES_CTL_H #include #include #include "keeper.h" #include "keeper_config.h" bool service_postgres_ctl_start(void *context, pid_t *pid); void service_postgres_ctl_runprogram(void); void service_postgres_ctl_loop(LocalPostgresServer *postgres); #endif /* SERVICE_POSTGRES_CTL_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/signals.c000066400000000000000000000116261414244367200223670ustar00rootroot00000000000000/* * src/bin/pg_autoctl/signals.c * Signal handlers for pg_autoctl, used in loop.c and pgsetup.c * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include "postgres_fe.h" /* pqsignal, portable sigaction wrapper */ #include "cli_root.h" #include "defaults.h" #include "lock_utils.h" #include "log.h" #include "signals.h" /* This flag controls termination of the main loop. */ volatile sig_atomic_t asked_to_stop = 0; /* SIGTERM */ volatile sig_atomic_t asked_to_stop_fast = 0; /* SIGINT */ volatile sig_atomic_t asked_to_reload = 0; /* SIGHUP */ volatile sig_atomic_t asked_to_quit = 0; /* SIGQUIT */ /* * set_signal_handlers sets our signal handlers for the 4 signals that we * specifically handle in pg_autoctl. */ void set_signal_handlers(bool exitOnQuit) { /* Establish a handler for signals. */ log_trace("set_signal_handlers%s", exitOnQuit ? " (exit on quit)" : ""); pqsignal(SIGHUP, catch_reload); pqsignal(SIGINT, catch_int); pqsignal(SIGTERM, catch_term); if (exitOnQuit) { pqsignal(SIGQUIT, catch_quit_and_exit); } else { pqsignal(SIGQUIT, catch_quit); } } /* * mask_signals prepares a pselect() call by masking all the signals we handle * in this part of the code, to avoid race conditions with setting our atomic * variables at signal handling. */ bool block_signals(sigset_t *mask, sigset_t *orig_mask) { int signals[] = { SIGHUP, SIGINT, SIGTERM, SIGQUIT, -1 }; if (sigemptyset(mask) == -1) { /* man sigemptyset sayth: No errors are defined. */ log_error("sigemptyset: %m"); return false; } for (int i = 0; signals[i] != -1; i++) { /* * The sigaddset() function may fail if: * * EINVAL The value of the signo argument is an invalid or unsupported * signal number * * This should never happen given the manual set of signals we are * processing here in this loop. 
*/ if (sigaddset(mask, signals[i]) == -1) { log_error("sigaddset: %m"); return false; } } if (sigprocmask(SIG_BLOCK, mask, orig_mask) == -1) { log_error("Failed to block signals: sigprocmask: %m"); return false; } return true; } /* * unblock_signals calls sigprocmask to re-establish the normal signal mask, in * order to allow our code to handle signals again. * * If we fail to unblock signals, then we won't be able to react to any * interruption, reload, or shutdown sequence, and we'd rather exit now. */ void unblock_signals(sigset_t *orig_mask) { /* restore signal masks (un block them) now */ if (sigprocmask(SIG_SETMASK, orig_mask, NULL) == -1) { log_fatal("Failed to restore signals: sigprocmask: %m"); exit(EXIT_CODE_INTERNAL_ERROR); } } /* * catch_reload receives the SIGHUP signal. */ void catch_reload(int sig) { asked_to_reload = 1; pqsignal(sig, catch_reload); } /* * catch_int receives the SIGINT signal. */ void catch_int(int sig) { asked_to_stop_fast = 1; pqsignal(sig, catch_int); } /* * catch_stop receives SIGTERM signal. */ void catch_term(int sig) { asked_to_stop = 1; pqsignal(sig, catch_term); } /* * catch_quit receives the SIGQUIT signal. */ void catch_quit(int sig) { /* default signal handler disposition is to core dump, we don't */ asked_to_quit = 1; pqsignal(sig, catch_quit); } /* * quit_and_exit exit(EXIT_CODE_QUIT) upon receiving the SIGQUIT signal. */ void catch_quit_and_exit(int sig) { /* default signal handler disposition is to core dump, we don't */ exit(EXIT_CODE_QUIT); } /* * get_current_signal returns the current signal to process and gives a prioriy * towards SIGQUIT, then SIGINT, then SIGTERM. 
*/ int get_current_signal(int defaultSignal) { if (asked_to_quit) { return SIGQUIT; } else if (asked_to_stop_fast) { return SIGINT; } else if (asked_to_stop) { return SIGTERM; } /* no termination signal to process at this time, return the default */ return defaultSignal; } /* * pick_stronger_signal returns the "stronger" signal among the two given * arguments. * * Signal processing have a priority or hierarchy of their own. Once we have * received and processed SIGQUIT we want to stay at this signal level. Once we * have received SIGINT we may upgrade to SIGQUIT, but we won't downgrade to * SIGTERM. */ int pick_stronger_signal(int sig1, int sig2) { if (sig1 == SIGQUIT || sig2 == SIGQUIT) { return SIGQUIT; } else if (sig1 == SIGINT || sig2 == SIGINT) { return SIGINT; } else { return SIGTERM; } } /* * signal_to_string is our own specialised function to display a signal. The * strsignal() output does not look like what we need. */ char * signal_to_string(int signal) { switch (signal) { case SIGQUIT: { return "SIGQUIT"; } case SIGTERM: { return "SIGTERM"; } case SIGINT: { return "SIGINT"; } case SIGHUP: { return "SIGHUP"; } default: return "unknown signal"; } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/signals.h000066400000000000000000000021251414244367200223660ustar00rootroot00000000000000/* * src/bin/pg_autoctl/signals.h * Signal handlers for pg_autoctl, used in loop.c and pgsetup.c * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef SIGNALS_H #define SIGNALS_H #include #include /* This flag controls termination of the main loop. 
*/ extern volatile sig_atomic_t asked_to_stop; /* SIGTERM */ extern volatile sig_atomic_t asked_to_stop_fast; /* SIGINT */ extern volatile sig_atomic_t asked_to_reload; /* SIGHUP */ extern volatile sig_atomic_t asked_to_quit; /* SIGQUIT */ #define CHECK_FOR_FAST_SHUTDOWN { if (asked_to_stop_fast) { break; } \ } void set_signal_handlers(bool exitOnQuit); bool block_signals(sigset_t *mask, sigset_t *orig_mask); void unblock_signals(sigset_t *orig_mask); void catch_reload(int sig); void catch_int(int sig); void catch_term(int sig); void catch_quit(int sig); void catch_quit_and_exit(int sig); int get_current_signal(int defaultSignal); int pick_stronger_signal(int sig1, int sig2); char * signal_to_string(int signal); #endif /* SIGNALS_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/state.c000066400000000000000000000577561414244367200220650ustar00rootroot00000000000000/* * src/bin/pg_autoctl/state.c * Keeper state functions * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include #include #include #include "postgres_fe.h" #include "libpq-fe.h" #include "parson.h" #include "defaults.h" #include "file_utils.h" #include "keeper_config.h" #include "keeper.h" #include "log.h" #include "pgctl.h" #include "pgsetup.h" #include "pgsql.h" #include "state.h" static bool keeper_state_is_readable(int pg_autoctl_state_version); static bool keeper_init_state_write(KeeperStateInit *initState, const char *filename); static bool keeper_postgres_state_write(KeeperStatePostgres *pgStatus, const char *filename); /* * keeper_state_read initializes our current state in-memory from disk. 
*/ bool keeper_state_read(KeeperStateData *keeperState, const char *filename) { char *content = NULL; long fileSize; log_debug("Reading current state from \"%s\"", filename); if (!read_file(filename, &content, &fileSize)) { log_error("Failed to read Keeper state from file \"%s\"", filename); return false; } int pg_autoctl_state_version = ((KeeperStateData *) content)->pg_autoctl_state_version; if (fileSize >= sizeof(KeeperStateData) && keeper_state_is_readable(pg_autoctl_state_version)) { *keeperState = *(KeeperStateData *) content; free(content); return true; } free(content); /* Looks like it's a mess. */ log_error("Keeper state file \"%s\" exists but is broken or wrong version", filename); return false; } /* * keeper_state_is_readable returns true if we can read a state file from the * given version of pg_autoctl. */ static bool keeper_state_is_readable(int pg_autoctl_state_version) { return pg_autoctl_state_version == PG_AUTOCTL_STATE_VERSION || (pg_autoctl_state_version == 1 && PG_AUTOCTL_STATE_VERSION == 2); } /* * The KeeperState data structure contains only direct values (int, long), not * a single pointer, so writing to disk is a single fwrite() instruction. * */ bool keeper_state_write(KeeperStateData *keeperState, const char *filename) { char buffer[PG_AUTOCTL_KEEPER_STATE_FILE_SIZE]; char tempFileName[MAXPGPATH]; /* we're going to write our contents to keeper.state.new first */ sformat(tempFileName, MAXPGPATH, "%s.new", filename); /* * The keeper process might have been stopped in immediate shutdown mode * (SIGQUIT) and left a stale state.new file around, or maybe another * situation led to a file at tempFileName existing already. Clean-up the * stage before preparing our new state file's content. 
*/ if (!unlink_file(tempFileName)) { /* errors have already been logged */ return false; } log_debug("Writing current state to \"%s\"", tempFileName); /* * Comment kept as is from PostgreSQL source code, function * RewriteControlFile() in postgresql/src/bin/pg_resetwal/pg_resetwal.c * * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding * the excess over sizeof(ControlFileData). This reduces the odds of * premature-EOF errors when reading pg_control. We'll still fail when we * check the contents of the file, but hopefully with a more specific * error than "couldn't read pg_control". */ memset(buffer, 0, PG_AUTOCTL_KEEPER_STATE_FILE_SIZE); /* * Explanation of IGNORE-BANNED: * memcpy is safe to use here. * we have a static assert that sizeof(KeeperStateData) is always * less than the buffer length PG_AUTOCTL_KEEPER_STATE_FILE_SIZE. * also KeeperStateData is a plain struct that does not contain * any pointers in it. Necessary comment about not using pointers * is added to the struct definition. 
*/ memcpy(buffer, keeperState, sizeof(KeeperStateData)); /* IGNORE-BANNED */ int fd = open(tempFileName, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); if (fd < 0) { log_fatal("Failed to create keeper state file \"%s\": %m", tempFileName); return false; } errno = 0; if (write(fd, buffer, PG_AUTOCTL_KEEPER_STATE_FILE_SIZE) != PG_AUTOCTL_KEEPER_STATE_FILE_SIZE) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) { errno = ENOSPC; } log_fatal("Failed to write keeper state file \"%s\": %m", tempFileName); return false; } if (fsync(fd) != 0) { log_fatal("fsync error: %m"); return false; } if (close(fd) != 0) { log_fatal("Failed to close file \"%s\": %m", tempFileName); return false; } log_debug("rename \"%s\" to \"%s\"", tempFileName, filename); /* now remove the old state file, and replace it with the new one */ if (rename(tempFileName, filename) != 0) { log_fatal("Failed to rename \"%s\" to \"%s\": %m", tempFileName, filename); return false; } return true; } /* * keeper_state_init initializes a new state structure with default values. */ void keeper_state_init(KeeperStateData *keeperState) { memset(keeperState, 0, sizeof(KeeperStateData)); keeperState->pg_autoctl_state_version = PG_AUTOCTL_STATE_VERSION; keeperState->current_node_id = -1; keeperState->current_group = 1; /* a node always starts in the init state and transitions from there */ keeperState->current_role = INIT_STATE; /* we do not know our assigned state yet */ keeperState->assigned_role = NO_STATE; /* we do not know the xlog lag of the secondary */ keeperState->xlog_lag = -1; } /* * keeper_state_create_file creates an initial state file from the given * postgres setup and group ID. */ bool keeper_state_create_file(const char *filename) { KeeperStateData keeperState; keeper_state_init(&keeperState); return keeper_state_write(&keeperState, filename); } /* * log_keeper_state dumps the current in memory state to the logs. 
*/ void log_keeper_state(KeeperStateData *keeperState) { const char *current_role = NodeStateToString(keeperState->current_role); const char *assigned_role = NodeStateToString(keeperState->assigned_role); char timestring[MAXCTIMESIZE]; log_trace("state.pg_control_version: %u", keeperState->pg_control_version); log_trace("state.system_identifier: %" PRIu64, keeperState->system_identifier); log_trace("state.pg_autoctl_state_version: %d", keeperState->pg_autoctl_state_version); log_trace("state.current_node_id: %d", keeperState->current_node_id); log_trace("state.current_group: %d", keeperState->current_group); log_trace("state.current_nodes_version: %" PRIu64, keeperState->current_nodes_version); log_trace("state.current_role: %s", current_role); log_trace("state.assigned_role: %s", assigned_role); log_trace("state.last_monitor_contact: %s", epoch_to_string(keeperState->last_monitor_contact, timestring)); log_trace("state.last_secondary_contact: %s", epoch_to_string(keeperState->last_secondary_contact, timestring)); log_trace("state.xlog_lag : %" PRId64, keeperState->xlog_lag); log_trace("state.keeper_is_paused: %d", keeperState->keeper_is_paused); log_trace("state.pg_version: %d", keeperState->pg_version); } /* * print_keeper_state prints the current in-memory state of the keeper to given * FILE output (stdout, stderr, etc). */ void print_keeper_state(KeeperStateData *keeperState, FILE *stream) { const char *current_role = NodeStateToString(keeperState->current_role); const char *assigned_role = NodeStateToString(keeperState->assigned_role); char timestring[MAXCTIMESIZE]; /* * First, the roles. */ fformat(stream, "Current Role: %s\n", current_role); fformat(stream, "Assigned Role: %s\n", assigned_role); /* * Now, other nodes situation, are we in a network partition. 
*/ fformat(stream, "Last Monitor Contact: %s\n", epoch_to_string(keeperState->last_monitor_contact, timestring)); fformat(stream, "Last Secondary Contact: %s\n", epoch_to_string(keeperState->last_secondary_contact, timestring)); /* * pg_autoctl information. */ fformat(stream, "pg_autoctl state version: %d\n", keeperState->pg_autoctl_state_version); fformat(stream, "group: %d\n", keeperState->current_group); fformat(stream, "node id: %d\n", keeperState->current_node_id); fformat(stream, "nodes version: %" PRIu64 "\n", keeperState->current_nodes_version); /* * PostgreSQL bits. */ fformat(stream, "PostgreSQL Version: %u\n", keeperState->pg_control_version); fformat(stream, "PostgreSQL CatVersion: %u\n", keeperState->catalog_version_no); fformat(stream, "PostgreSQL System Id: %" PRIu64 "\n", keeperState->system_identifier); fflush(stream); } /* * keeperStateAsJSON */ bool keeperStateAsJSON(KeeperStateData *keeperState, JSON_Value *js) { JSON_Object *jsobj = json_value_get_object(js); const char *current_role = NodeStateToString(keeperState->current_role); const char *assigned_role = NodeStateToString(keeperState->assigned_role); char timestring[MAXCTIMESIZE] = { 0 }; json_object_set_string(jsobj, "current_role", current_role); json_object_set_string(jsobj, "assigned_role", assigned_role); json_object_set_number(jsobj, "version", (double) keeperState->pg_autoctl_state_version); json_object_set_number(jsobj, "groupId", (double) keeperState->current_group); json_object_set_number(jsobj, "nodeId", (double) keeperState->current_node_id); json_object_set_string(jsobj, "last_monitor_contact", epoch_to_string(keeperState->last_monitor_contact, timestring)); json_object_set_string(jsobj, "last_secondary_contact", epoch_to_string(keeperState->last_secondary_contact, timestring)); json_object_set_number(jsobj, "pgversion", (double) keeperState->pg_control_version); return true; } /* * print_keeper_init_state prints the given initilization state of the keeper * to given FILE 
output (stdout, stderr, etc). */ void print_keeper_init_state(KeeperStateInit *initState, FILE *stream) { fformat(stream, "Postgres state at keeper init: %s\n", PreInitPostgreInstanceStateToString(initState->pgInitState)); fflush(stream); } /* * NodeStateToString converts a NodeState ENUM value into a string for use in * user reporting. */ const char * NodeStateToString(NodeState s) { switch (s) { case NO_STATE: { return "unknown"; } case INIT_STATE: { return "init"; } case SINGLE_STATE: { return "single"; } case PRIMARY_STATE: { return "primary"; } case WAIT_PRIMARY_STATE: { return "wait_primary"; } case WAIT_STANDBY_STATE: { return "wait_standby"; } case DEMOTED_STATE: { return "demoted"; } case DEMOTE_TIMEOUT_STATE: { return "demote_timeout"; } case DRAINING_STATE: { return "draining"; } case SECONDARY_STATE: { return "secondary"; } case CATCHINGUP_STATE: { return "catchingup"; } case PREP_PROMOTION_STATE: { return "prepare_promotion"; } case STOP_REPLICATION_STATE: { return "stop_replication"; } case MAINTENANCE_STATE: { return "maintenance"; } case JOIN_PRIMARY_STATE: { return "join_primary"; } case APPLY_SETTINGS_STATE: { return "apply_settings"; } case PREPARE_MAINTENANCE_STATE: { return "prepare_maintenance"; } case WAIT_MAINTENANCE_STATE: { return "wait_maintenance"; } case REPORT_LSN_STATE: { return "report_lsn"; } case FAST_FORWARD_STATE: { return "fast_forward"; } case JOIN_SECONDARY_STATE: { return "join_secondary"; } case DROPPED_STATE: { return "dropped"; } case ANY_STATE: { return "#any state#"; } default: return "Unknown State"; } } /* * NodeStateFromString converts a string representation of a node state into * the corresponding internal ENUM value. 
*/ NodeState NodeStateFromString(const char *str) { if (strcmp(str, "unknown") == 0) { return NO_STATE; } else if (strcmp(str, "init") == 0) { return INIT_STATE; } else if (strcmp(str, "single") == 0) { return SINGLE_STATE; } else if (strcmp(str, "primary") == 0) { return PRIMARY_STATE; } else if (strcmp(str, "wait_primary") == 0) { return WAIT_PRIMARY_STATE; } else if (strcmp(str, "wait_standby") == 0) { return WAIT_STANDBY_STATE; } else if (strcmp(str, "demoted") == 0) { return DEMOTED_STATE; } else if (strcmp(str, "demote_timeout") == 0) { return DEMOTE_TIMEOUT_STATE; } else if (strcmp(str, "draining") == 0) { return DRAINING_STATE; } else if (strcmp(str, "secondary") == 0) { return SECONDARY_STATE; } else if (strcmp(str, "catchingup") == 0) { return CATCHINGUP_STATE; } else if (strcmp(str, "prepare_promotion") == 0) { return PREP_PROMOTION_STATE; } else if (strcmp(str, "stop_replication") == 0) { return STOP_REPLICATION_STATE; } else if (strcmp(str, "maintenance") == 0) { return MAINTENANCE_STATE; } else if (strcmp(str, "join_primary") == 0) { return JOIN_PRIMARY_STATE; } else if (strcmp(str, "apply_settings") == 0) { return APPLY_SETTINGS_STATE; } else if (strcmp(str, "prepare_maintenance") == 0) { return PREPARE_MAINTENANCE_STATE; } else if (strcmp(str, "wait_maintenance") == 0) { return WAIT_MAINTENANCE_STATE; } else if (strcmp(str, "report_lsn") == 0) { return REPORT_LSN_STATE; } else if (strcmp(str, "fast_forward") == 0) { return FAST_FORWARD_STATE; } else if (strcmp(str, "join_secondary") == 0) { return JOIN_SECONDARY_STATE; } else if (strcmp(str, "dropped") == 0) { return DROPPED_STATE; } else { log_fatal("Failed to parse state string \"%s\"", str); return NO_STATE; } return NO_STATE; } /* * epoch_to_string converts a number of seconds from epoch into a date time * string. * * This string is stored in buffer. On error NULL is returned else the buffer * is returned. The buffer should (at least) be MAXCTIMESIZE large. 
*/ const char * epoch_to_string(uint64_t seconds, char *buffer) { if (seconds <= 0) { strlcpy(buffer, "0", MAXCTIMESIZE); return buffer; } char *result = ctime_r((time_t *) &seconds, buffer); if (result == NULL) { log_error("Failed to convert epoch %" PRIu64 " to string: %m", seconds); return NULL; } if (strlen(result) != 0 && result[strlen(result) - 1] == '\n') { /* * ctime_r normally returns a string that ends with \n, which we don't * want. We strip it by replacing it with a null string terminator. */ result[strlen(result) - 1] = '\0'; } return buffer; } /* * PreInitPostgreInstanceStateToString returns the string that represents the * init state of the local PostgreSQL instance. */ char * PreInitPostgreInstanceStateToString(PreInitPostgreInstanceState pgInitState) { switch (pgInitState) { case PRE_INIT_STATE_EMPTY: { return "PGDATA does not exist"; } case PRE_INIT_STATE_EXISTS: { return "PGDATA exists"; } case PRE_INIT_STATE_RUNNING: { return "PostgreSQL is running"; } case PRE_INIT_STATE_PRIMARY: { return "PostgreSQL is running and a primary server"; } default: return "unknown"; } /* keep compiler happy */ return "unknown"; } /* * keeper_init_state_create create our pg_autoctl.init file. * * This file is created when entering keeper init and deleted only when the * init has been successful. This allows the code to take smarter decisions and * decipher in between a previous init having failed halfway through or * initializing from scratch in conditions not supported (pre-existing and * running cluster, etc). 
*/ bool keeper_init_state_create(KeeperStateInit *initState, PostgresSetup *pgSetup, const char *filename) { if (!keeper_init_state_discover(initState, pgSetup, filename)) { /* errors have already been logged */ return false; } log_info("Writing keeper init state file at \"%s\"", filename); log_debug("keeper_init_state_create: version = %d", initState->pg_autoctl_state_version); log_debug("keeper_init_state_create: pgInitState = %s", PreInitPostgreInstanceStateToString(initState->pgInitState)); return keeper_init_state_write(initState, filename); } /* * keeper_init_state_write writes our pg_autoctl.init file. */ static bool keeper_init_state_write(KeeperStateInit *initState, const char *filename) { char buffer[PG_AUTOCTL_KEEPER_STATE_FILE_SIZE] = { 0 }; memset(buffer, 0, PG_AUTOCTL_KEEPER_STATE_FILE_SIZE); /* * Explanation of IGNORE-BANNED: * memcpy is safe to use here. * we have a static assert that sizeof(KeeperStateInit) is always * less than the buffer length PG_AUTOCTL_KEEPER_STATE_FILE_SIZE. * also KeeperStateData is a plain struct that does not contain * any pointers in it. Necessary comment about not using pointers * is added to the struct definition. 
*/ memcpy(buffer, initState, sizeof(KeeperStateInit)); /* IGNORE-BANNED */ int fd = open(filename, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); if (fd < 0) { log_fatal("Failed to create keeper init state file \"%s\": %m", filename); return false; } errno = 0; if (write(fd, buffer, PG_AUTOCTL_KEEPER_STATE_FILE_SIZE) != PG_AUTOCTL_KEEPER_STATE_FILE_SIZE) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) { errno = ENOSPC; } log_fatal("Failed to write keeper init state file \"%s\": %m", filename); return false; } if (fsync(fd) != 0) { log_fatal("fsync error: %m"); return false; } close(fd); return true; } /* * keeper_init_state_discover discovers the current KeeperStateInit from the * command line options, by checking everything we can about the possibly * existing Postgres instance. */ bool keeper_init_state_discover(KeeperStateInit *initState, PostgresSetup *pgSetup, const char *filename) { PostgresSetup newPgSetup = { 0 }; bool missingPgdataIsOk = true; bool pgIsNotRunningIsOk = true; initState->pg_autoctl_state_version = PG_AUTOCTL_STATE_VERSION; if (!pg_setup_init(&newPgSetup, pgSetup, missingPgdataIsOk, pgIsNotRunningIsOk)) { log_fatal("Failed to initialize the keeper init state, " "see above for details"); return false; } if (pg_setup_role(pgSetup) == POSTGRES_ROLE_PRIMARY) { initState->pgInitState = PRE_INIT_STATE_PRIMARY; } else if (pg_setup_is_running(pgSetup)) { initState->pgInitState = PRE_INIT_STATE_RUNNING; } else if (pg_setup_pgdata_exists(pgSetup)) { initState->pgInitState = PRE_INIT_STATE_EXISTS; } else { initState->pgInitState = PRE_INIT_STATE_EMPTY; } return true; } /* * keeper_init_state_read reads the information kept in the keeper init file. 
*/ bool keeper_init_state_read(KeeperStateInit *initState, const char *filename) { char *content = NULL; long fileSize; log_debug("Reading current init state from \"%s\"", filename); if (!read_file(filename, &content, &fileSize)) { log_error("Failed to read Keeper state from file \"%s\"", filename); return false; } int pg_autoctl_state_version = ((KeeperStateInit *) content)->pg_autoctl_state_version; if (fileSize >= sizeof(KeeperStateInit) && pg_autoctl_state_version == PG_AUTOCTL_STATE_VERSION) { *initState = *(KeeperStateInit *) content; free(content); return true; } free(content); /* Looks like it's a mess. */ log_error("Keeper init state file \"%s\" exists but " "is broken or wrong version (%d)", filename, pg_autoctl_state_version); return false; } /* * ExpectedPostgresStatusToString return the string that represents our * expected PostgreSQL state. */ char * ExpectedPostgresStatusToString(ExpectedPostgresStatus pgExpectedStatus) { switch (pgExpectedStatus) { case PG_EXPECTED_STATUS_UNKNOWN: { return "unknown"; } case PG_EXPECTED_STATUS_STOPPED: { return "Postgres should be stopped"; } case PG_EXPECTED_STATUS_RUNNING: { return "Postgres should be running"; } case PG_EXPECTED_STATUS_RUNNING_AS_SUBPROCESS: { return "Postgres should be running as a pg_autoctl subprocess"; } } /* make compiler happy */ return "unknown"; } /* * keeper_set_postgres_state_unknown updates the Postgres expected status file * to unknown. */ bool keeper_set_postgres_state_unknown(KeeperStatePostgres *pgStatus, const char *filename) { pgStatus->pgExpectedStatus = PG_EXPECTED_STATUS_UNKNOWN; return keeper_postgres_state_update(pgStatus, filename); } /* * keeper_set_postgres_state_running updates the Postgres expected status file * to running. 
*/ bool keeper_set_postgres_state_running(KeeperStatePostgres *pgStatus, const char *filename) { pgStatus->pgExpectedStatus = PG_EXPECTED_STATUS_RUNNING; return keeper_postgres_state_update(pgStatus, filename); } /* * keeper_set_postgres_state_running updates the Postgres expected status file * to running as subprocess. */ bool keeper_set_postgres_state_running_as_subprocess(KeeperStatePostgres *pgStatus, const char *filename) { pgStatus->pgExpectedStatus = PG_EXPECTED_STATUS_RUNNING_AS_SUBPROCESS; return keeper_postgres_state_update(pgStatus, filename); } /* * keeper_set_postgres_state_stopped updates the Postgres expected status file * to stopped. */ bool keeper_set_postgres_state_stopped(KeeperStatePostgres *pgStatus, const char *filename) { pgStatus->pgExpectedStatus = PG_EXPECTED_STATUS_STOPPED; return keeper_postgres_state_update(pgStatus, filename); } /* * keeper_postgres_state_create creates our pg_autoctl.pg file. */ bool keeper_postgres_state_update(KeeperStatePostgres *pgStatus, const char *filename) { pgStatus->pg_autoctl_state_version = PG_AUTOCTL_STATE_VERSION; log_debug("Writing keeper postgres expected state file at \"%s\"", filename); log_debug("keeper_postgres_state_create: version = %d", pgStatus->pg_autoctl_state_version); log_debug("keeper_postgres_state_create: ExpectedPostgresStatus = %s", ExpectedPostgresStatusToString(pgStatus->pgExpectedStatus)); return keeper_postgres_state_write(pgStatus, filename); } /* * keeper_postgres_state_write writes our pg_autoctl.init file. */ static bool keeper_postgres_state_write(KeeperStatePostgres *pgStatus, const char *filename) { char buffer[PG_AUTOCTL_KEEPER_STATE_FILE_SIZE] = { 0 }; log_trace("keeper_postgres_state_write %s in %s", ExpectedPostgresStatusToString(pgStatus->pgExpectedStatus), filename); memset(buffer, 0, PG_AUTOCTL_KEEPER_STATE_FILE_SIZE); /* * Explanation of IGNORE-BANNED: * memcpy is safe to use here. 
* we have a static assert that sizeof(KeeperStateInit) is always * less than the buffer length PG_AUTOCTL_KEEPER_STATE_FILE_SIZE. * also KeeperStateData is a plain struct that does not contain * any pointers in it. Necessary comment about not using pointers * is added to the struct definition. */ memcpy(buffer, pgStatus, sizeof(KeeperStatePostgres)); /* IGNORE-BANNED */ int fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); if (fd < 0) { log_fatal( "Failed to create keeper postgres expected status file \"%s\": %m", filename); return false; } errno = 0; if (write(fd, buffer, PG_AUTOCTL_KEEPER_STATE_FILE_SIZE) != PG_AUTOCTL_KEEPER_STATE_FILE_SIZE) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) { errno = ENOSPC; } log_fatal( "Failed to write keeper postgres expected status file \"%s\": %m", filename); return false; } if (fsync(fd) != 0) { log_fatal("fsync error: %m"); return false; } close(fd); return true; } /* * keeper_postgres_state_read reads the information kept in the keeper postgres * file. */ bool keeper_postgres_state_read(KeeperStatePostgres *pgStatus, const char *filename) { char *content = NULL; long fileSize; if (!read_file(filename, &content, &fileSize)) { log_error("Failed to read postgres expected status from file \"%s\"", filename); return false; } int pg_autoctl_state_version = ((KeeperStatePostgres *) content)->pg_autoctl_state_version; if (fileSize >= sizeof(KeeperStateInit) && pg_autoctl_state_version == PG_AUTOCTL_STATE_VERSION) { *pgStatus = *(KeeperStatePostgres *) content; free(content); return true; } free(content); /* Looks like it's a mess. 
*/ log_error("Keeper postgres expected status file \"%s\" exists but " "is broken or wrong version (%d)", filename, pg_autoctl_state_version); return false; } pg_auto_failover-1.6.3/src/bin/pg_autoctl/state.h000066400000000000000000000176771414244367200220700ustar00rootroot00000000000000/* * src/bin/pg_autoctl/state.h * Keeper state data structure and function definitions * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef STATE_H #define STATE_H #include #include "parson.h" #include "pgsetup.h" /* * We need 80 bytes, but we'll mimic what PostgreSQL is doing with the handling * of its super important pg_control file, see the following files: * * postgresql/src/include/catalog/pg_control.h * postgresql/src/bin/pg_resetwal/pg_resetwal.c */ #define PG_AUTOCTL_KEEPER_STATE_FILE_SIZE 1024 /* * The keeper State Machine handle the following possible states: */ typedef enum { NO_STATE = 0, INIT_STATE, SINGLE_STATE, PRIMARY_STATE, WAIT_PRIMARY_STATE, WAIT_STANDBY_STATE, DEMOTED_STATE, DEMOTE_TIMEOUT_STATE, DRAINING_STATE, SECONDARY_STATE, CATCHINGUP_STATE, PREP_PROMOTION_STATE, STOP_REPLICATION_STATE, MAINTENANCE_STATE, JOIN_PRIMARY_STATE, APPLY_SETTINGS_STATE, PREPARE_MAINTENANCE_STATE, WAIT_MAINTENANCE_STATE, REPORT_LSN_STATE, FAST_FORWARD_STATE, JOIN_SECONDARY_STATE, DROPPED_STATE, /* Allow some wildcard-matching transitions (from ANY state to) */ ANY_STATE = 128 } NodeState; #define MAX_NODE_STATE_LEN 19 /* "prepare_maintenance" */ /* * ANY_STATE matches with any state, as its name implies: */ #define state_matches(x, y) (x == ANY_STATE || y == ANY_STATE || x == y) /* * PostgreSQL prepared transaction names are up to 200 bytes. */ #define PREPARED_TRANSACTION_NAMELEN 200 /* * The Keeper's state is composed of information from three different sources: * - the PostgreSQL instance we keep, * - the pg_auto_failover Monitor, via the pgautofailover.node_active protocol, * - the Keeper itself. 
* * The state is authoritative only for entries that are maintained by the * keeper itself, that's: * - current_role * - last_secondary_contact * - xlog_location note: should we keep that? * - keeper_is_paused * * Note: The struct is serialized/serialiazed to/from state file. Therefore * keeping the memory layout the same is important. Please * - do not change the order of fields * - do not add a new field in between, always add to the end * - do not use any pointers * * The nodeId used to be a 32 bits integer on the monitor, and has been * upgraded to a bigint (64 bits). That said, the on-disk state file still * works internally with a 32 bits number for the nodeId. * * When that's needed, we could create a compatibility function that knows how * to read the old state with an int32_t and then fill-in the new struct with a * 664 bits number instead, and serialize that to disk transparently. As it is * not expected to find nodeId in the wild, this work has not been done yet. */ typedef struct { int pg_autoctl_state_version; /* PostgreSQL instance information, from pg_ctl and pg_controldata */ int pg_version; uint32_t pg_control_version; /* PG_CONTROL_VERSION */ uint32_t catalog_version_no; /* see catversion.h */ uint64_t system_identifier; /* Information we get from the Monitor */ int current_node_id; int current_group; NodeState assigned_role; uint64_t current_nodes_version; uint64_t last_monitor_contact; /* keeper's current state, authoritative */ NodeState current_role; uint64_t last_secondary_contact; int64_t xlog_lag; int keeper_is_paused; } KeeperStateData; _Static_assert(sizeof(KeeperStateData) < PG_AUTOCTL_KEEPER_STATE_FILE_SIZE, "Size of KeeperStateData is larger than expected. " "Please review PG_AUTOCTL_KEEPER_STATE_FILE_SIZE"); /* * The init file contains the status of the target Postgres instance when the * pg_autoctl create command ran the first time. 
We need to be able to make * init time decision again if we're interrupted half-way and later want to * proceed. The instruction for the user to proceed in that case is to run the * pg_autoctl create command again. * * We also update the init file with the current stage of the initialisation * process. This allows communication to happen between the init process and * the Postgres FSM supervisor process. The Postgres FSM supervisors knows it * must start Postgres when reaching init stage 2. */ typedef enum { PRE_INIT_STATE_UNKNOWN = 0, PRE_INIT_STATE_EMPTY, PRE_INIT_STATE_EXISTS, PRE_INIT_STATE_RUNNING, PRE_INIT_STATE_PRIMARY } PreInitPostgreInstanceState; /* * Note: The struct is serialized/serialiazed to/from state file. Therefore * keeping the memory layout the same is important. Please * - do not change the order of fields * - do not add a new field in between, always add to the end * - do not use any pointers */ typedef struct { int pg_autoctl_state_version; PreInitPostgreInstanceState pgInitState; } KeeperStateInit; _Static_assert(sizeof(KeeperStateInit) < PG_AUTOCTL_KEEPER_STATE_FILE_SIZE, "Size of KeeperStateInit is larger than expected. " "Please review PG_AUTOCTL_KEEPER_STATE_FILE_SIZE"); /* * pg_autoctl manages Postgres as a child process. The FSM loop runs in the * node-active sub-process, and that's where decisions are made depending on * the current state and transition whether Postgres should be running or not. * * The communication between the node-active process and the Postgres * start/stop controller process is done by means of the Postgres state file, * which is basically a boolean. That said, we want to make sure we read the * file content correctly, so 0 is unknown. */ typedef enum { PG_EXPECTED_STATUS_UNKNOWN = 0, PG_EXPECTED_STATUS_STOPPED, PG_EXPECTED_STATUS_RUNNING, PG_EXPECTED_STATUS_RUNNING_AS_SUBPROCESS } ExpectedPostgresStatus; /* * Note: This struct is serialized/deserialized to/from state file. 
Therefore * keeping the memory layout the same is important. Please * - do not change the order of fields * - do not add a new field in between, always add to the end * - do not use any pointers */ typedef struct { int pg_autoctl_state_version; ExpectedPostgresStatus pgExpectedStatus; } KeeperStatePostgres; _Static_assert(sizeof(KeeperStatePostgres) < PG_AUTOCTL_KEEPER_STATE_FILE_SIZE, "Size of KeeperStatePostgres is larger than expected. " "Please review PG_AUTOCTL_KEEPER_STATE_FILE_SIZE"); const char * NodeStateToString(NodeState s); NodeState NodeStateFromString(const char *str); const char * epoch_to_string(uint64_t seconds, char *buffer); void keeper_state_init(KeeperStateData *keeperState); bool keeper_state_create_file(const char *filename); bool keeper_state_read(KeeperStateData *keeperState, const char *filename); bool keeper_state_write(KeeperStateData *keeperState, const char *filename); void log_keeper_state(KeeperStateData *keeperState); void print_keeper_state(KeeperStateData *keeperState, FILE *fp); bool keeperStateAsJSON(KeeperStateData *keeperState, JSON_Value *js); void print_keeper_init_state(KeeperStateInit *initState, FILE *stream); char * PreInitPostgreInstanceStateToString(PreInitPostgreInstanceState pgInitState); bool keeper_init_state_create(KeeperStateInit *initState, PostgresSetup *pgSetup, const char *filename); bool keeper_init_state_read(KeeperStateInit *initState, const char *filename); bool keeper_init_state_discover(KeeperStateInit *initState, PostgresSetup *pgSetup, const char *filename); char * ExpectedPostgresStatusToString(ExpectedPostgresStatus pgExpectedStatus); bool keeper_set_postgres_state_unknown(KeeperStatePostgres *pgStatus, const char *filename); bool keeper_set_postgres_state_running(KeeperStatePostgres *pgStatus, const char *filename); bool keeper_set_postgres_state_running_as_subprocess(KeeperStatePostgres *pgStatus, const char *filename); bool keeper_set_postgres_state_stopped(KeeperStatePostgres *pgStatus, const 
char *filename); bool keeper_postgres_state_update(KeeperStatePostgres *pgStatus, const char *filename); bool keeper_postgres_state_read(KeeperStatePostgres *pgStatus, const char *filename); #endif /* STATE_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/string_utils.c000066400000000000000000000201101414244367200234410ustar00rootroot00000000000000/* * src/bin/pg_autoctl/string_utils.c * Implementations of utility functions for string handling * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include #include #include #include "postgres_fe.h" #include "pqexpbuffer.h" #include "defaults.h" #include "file_utils.h" #include "log.h" #include "parsing.h" #include "string_utils.h" /* * intToString converts an int to an IntString, which contains a decimal string * representation of the integer. */ IntString intToString(int64_t number) { IntString intString; intString.intValue = number; sformat(intString.strValue, INTSTRING_MAX_DIGITS, "%" PRId64, number); return intString; } /* * converts given string to 64 bit integer value. * returns 0 upon failure and sets error flag */ bool stringToInt(const char *str, int *number) { char *endptr; if (str == NULL) { return false; } if (number == NULL) { return false; } errno = 0; long long int n = strtoll(str, &endptr, 10); if (str == endptr) { return false; } else if (errno != 0) { return false; } else if (*endptr != '\0') { return false; } else if (n < INT_MIN || n > INT_MAX) { return false; } *number = n; return true; } /* * converts given string to 64 bit integer value. 
* returns 0 upon failure and sets error flag */ bool stringToInt64(const char *str, int64_t *number) { char *endptr; if (str == NULL) { return false; } if (number == NULL) { return false; } errno = 0; long long int n = strtoll(str, &endptr, 10); if (str == endptr) { return false; } else if (errno != 0) { return false; } else if (*endptr != '\0') { return false; } else if (n < INT64_MIN || n > INT64_MAX) { return false; } *number = n; return true; } /* * converts given string to 64 bit unsigned integer value. * returns 0 upon failure and sets error flag */ bool stringToUInt(const char *str, unsigned int *number) { char *endptr; if (str == NULL) { return false; } if (number == NULL) { return false; } errno = 0; unsigned long long n = strtoull(str, &endptr, 10); if (str == endptr) { return false; } else if (errno != 0) { return false; } else if (*endptr != '\0') { return false; } else if (n > UINT_MAX) { return false; } *number = n; return true; } /* * converts given string to 64 bit unsigned integer value. * returns 0 upon failure and sets error flag */ bool stringToUInt64(const char *str, uint64_t *number) { char *endptr; if (str == NULL) { return false; } if (number == NULL) { return false; } errno = 0; unsigned long long n = strtoull(str, &endptr, 10); if (str == endptr) { return false; } else if (errno != 0) { return false; } else if (*endptr != '\0') { return false; } else if (n > UINT64_MAX) { return false; } *number = n; return true; } /* * converts given string to short value. 
* returns 0 upon failure and sets error flag */ bool stringToShort(const char *str, short *number) { char *endptr; if (str == NULL) { return false; } if (number == NULL) { return false; } errno = 0; long long int n = strtoll(str, &endptr, 10); if (str == endptr) { return false; } else if (errno != 0) { return false; } else if (*endptr != '\0') { return false; } else if (n < SHRT_MIN || n > SHRT_MAX) { return false; } *number = n; return true; } /* * converts given string to unsigned short value. * returns 0 upon failure and sets error flag */ bool stringToUShort(const char *str, unsigned short *number) { char *endptr; if (str == NULL) { return false; } if (number == NULL) { return false; } errno = 0; unsigned long long n = strtoull(str, &endptr, 10); if (str == endptr) { return false; } else if (errno != 0) { return false; } else if (*endptr != '\0') { return false; } else if (n > USHRT_MAX) { return false; } *number = n; return true; } /* * converts given string to 32 bit integer value. * returns 0 upon failure and sets error flag */ bool stringToInt32(const char *str, int32_t *number) { char *endptr; if (str == NULL) { return false; } if (number == NULL) { return false; } errno = 0; long long int n = strtoll(str, &endptr, 10); if (str == endptr) { return false; } else if (errno != 0) { return false; } else if (*endptr != '\0') { return false; } else if (n < INT32_MIN || n > INT32_MAX) { return false; } *number = n; return true; } /* * converts given string to 32 bit unsigned int value. 
* returns 0 upon failure and sets error flag */ bool stringToUInt32(const char *str, uint32_t *number) { char *endptr; if (str == NULL) { return false; } if (number == NULL) { return false; } errno = 0; unsigned long long n = strtoull(str, &endptr, 10); if (str == endptr) { return false; } else if (errno != 0) { return false; } else if (*endptr != '\0') { return false; } else if (n > UINT32_MAX) { return false; } *number = n; return true; } /* * converts given string to a double precision float value. * returns 0 upon failure and sets error flag */ bool stringToDouble(const char *str, double *number) { char *endptr; if (str == NULL) { return false; } if (number == NULL) { return false; } errno = 0; double n = strtod(str, &endptr); if (str == endptr) { return false; } else if (errno != 0) { return false; } else if (*endptr != '\0') { return false; } else if (n > DBL_MAX) { return false; } *number = n; return true; } /* * IntervalToString prepares a string buffer to represent a given interval * value given as a double precision float number. 
*/ bool IntervalToString(double seconds, char *buffer, size_t size) { if (seconds < 1.0) { /* when we have < 1s, we round to 1s */ sformat(buffer, size, " %ds", 1); } else if (seconds < 60.0) { int s = (int) seconds; sformat(buffer, size, "%2ds", s); } else if (seconds < (60.0 * 60.0)) { int mins = (int) (seconds / 60.0); int secs = (int) (seconds - (mins * 60.0)); sformat(buffer, size, "%2dm%02ds", mins, secs); } else if (seconds < (24.0 * 60.0 * 60.0)) { int hours = (int) (seconds / (60.0 * 60.0)); int mins = (int) ((seconds - (hours * 60.0 * 60.0)) / 60.0); sformat(buffer, size, "%2dh%02dm", hours, mins); } else { int days = (int) (seconds / (24.0 * 60.0 * 60.0)); int hours = (int) ((seconds - (days * 24.0 * 60.0 * 60.0)) / (60.0 * 60.0)); sformat(buffer, size, "%2dd%02dh", days, hours); } return true; } /* * splitLines prepares a multi-line error message in a way that calling code * can loop around one line at a time and call log_error() or log_warn() on * individual lines. */ int splitLines(char *errorMessage, char **linesArray, int size) { int lineNumber = 0; char *currentLine = errorMessage; if (errorMessage == NULL) { return 0; } do { char *newLinePtr = strchr(currentLine, '\n'); if (newLinePtr == NULL) { if (strlen(currentLine) > 0) { linesArray[lineNumber++] = currentLine; } currentLine = NULL; } else { *newLinePtr = '\0'; linesArray[lineNumber++] = currentLine; currentLine = ++newLinePtr; } } while (currentLine != NULL && *currentLine != '\0' && lineNumber < size); return lineNumber; } /* * processBufferCallback is a function callback to use with the subcommands.c * library when we want to output a command's output as it's running, such as * when running a pg_basebackup command. 
*/ void processBufferCallback(const char *buffer, bool error) { char *outLines[BUFSIZE] = { 0 }; int lineCount = splitLines((char *) buffer, outLines, BUFSIZE); int lineNumber = 0; for (lineNumber = 0; lineNumber < lineCount; lineNumber++) { if (strneq(outLines[lineNumber], "")) { /* * pg_basebackup and other utilities write their progress output on * stderr, we don't want to have ERROR message when it's all good. * As a result we always target INFO log level here. */ log_info("%s", outLines[lineNumber]); } } } pg_auto_failover-1.6.3/src/bin/pg_autoctl/string_utils.h000066400000000000000000000022711414244367200234560ustar00rootroot00000000000000/* * src/bin/pg_autoctl/string_utils.h * Utility functions for string handling * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef STRING_UTILS_H #define STRING_UTILS_H #include /* maximum decimal int64 length with minus and NUL */ #define INTSTRING_MAX_DIGITS 21 typedef struct IntString { int64_t intValue; char strValue[INTSTRING_MAX_DIGITS]; } IntString; IntString intToString(int64_t number); bool stringToInt(const char *str, int *number); bool stringToUInt(const char *str, unsigned int *number); bool stringToInt64(const char *str, int64_t *number); bool stringToUInt64(const char *str, uint64_t *number); bool stringToShort(const char *str, short *number); bool stringToUShort(const char *str, unsigned short *number); bool stringToInt32(const char *str, int32_t *number); bool stringToUInt32(const char *str, uint32_t *number); bool stringToDouble(const char *str, double *number); bool IntervalToString(double seconds, char *buffer, size_t size); int splitLines(char *errorMessage, char **linesArray, int size); void processBufferCallback(const char *buffer, bool error); #endif /* STRING_UTILS_h */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/supervisor.c000066400000000000000000000566201414244367200231530ustar00rootroot00000000000000/* * src/bin/pg_autoctl/supervisor.c 
* Supervisor for services run in sub-processes. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #include #include #include #include #include #include #include #include "postgres_fe.h" #include "pqexpbuffer.h" #include "cli_root.h" #include "defaults.h" #include "env_utils.h" #include "fsm.h" #include "keeper.h" #include "keeper_config.h" #include "keeper_pg_init.h" #include "log.h" #include "monitor.h" #include "pgctl.h" #include "pidfile.h" #include "state.h" #include "supervisor.h" #include "signals.h" #include "string_utils.h" static bool supervisor_init(Supervisor *supervisor); static SupervisorExitMode supervisor_loop(Supervisor *supervisor); static bool supervisor_find_service(Supervisor *supervisor, pid_t pid, Service **result); static void supervisor_stop_subprocesses(Supervisor *supervisor); static void supervisor_stop_other_services(Supervisor *supervisor, pid_t pid); static bool supervisor_signal_process_group(int signal); static void supervisor_reload_services(Supervisor *supervisor); static void supervisor_handle_signals(Supervisor *supervisor); static void supervisor_shutdown_sequence(Supervisor *supervisor); static bool supervisor_restart_service(Supervisor *supervisor, Service *service, int status); static bool supervisor_may_restart(Service *service); static bool supervisor_update_pidfile(Supervisor *supervisor); /* * supervisor_start starts given services as sub-processes and then supervise * them. */ bool supervisor_start(Service services[], int serviceCount, const char *pidfile) { int serviceIndex = 0; bool success = true; Supervisor supervisor = { services, serviceCount, { 0 }, -1 }; /* copy the pidfile over to our supervisor structure */ strlcpy(supervisor.pidfile, pidfile, MAXPGPATH); /* * Create our PID file, or quit now if another pg_autoctl instance is * runnning. 
*/ if (!supervisor_init(&supervisor)) { log_fatal("Failed to setup pg_autoctl pidfile and signal handlers"); return false; } /* * Start all the given services, in order. * * If we fail to start one of the given services, then we SIGQUIT the * services we managed to start before, in reverse order of starting-up, * and stop here. */ for (serviceIndex = 0; serviceIndex < serviceCount; serviceIndex++) { Service *service = &(services[serviceIndex]); log_debug("Starting pg_autoctl %s service", service->name); bool started = (*service->startFunction)(service->context, &(service->pid)); if (started) { uint64_t now = time(NULL); RestartCounters *counters = &(service->restartCounters); counters->count = 1; counters->position = 0; counters->startTime[counters->position] = now; log_info("Started pg_autoctl %s service with pid %d", service->name, service->pid); } else { int idx = 0; log_error("Failed to start service %s, " "stopping already started services and pg_autoctl", service->name); for (idx = serviceIndex - 1; idx > 0; idx--) { if (kill(services[idx].pid, SIGQUIT) != 0) { log_error("Failed to send SIGQUIT to service %s with pid %d", services[idx].name, services[idx].pid); } } /* we return false always, even if supervisor_stop is successful */ (void) supervisor_stop(&supervisor); return false; } } /* * We need to update our pid file with the PID for every service. */ if (!supervisor_update_pidfile(&supervisor)) { log_fatal("Failed to update pidfile \"%s\", stopping all services now", supervisor.pidfile); supervisor.exitMode = SUPERVISOR_EXIT_ERROR; supervisor.shutdownSequenceInProgress = true; (void) supervisor_stop_subprocesses(&supervisor); return false; } /* now supervise sub-processes and implement retry strategy */ switch (supervisor_loop(&supervisor)) { case SUPERVISOR_EXIT_FATAL: { log_fatal("A subprocess has reported a fatal error, stopping now. 
" "See above for details."); success = false; break; } case SUPERVISOR_EXIT_ERROR: { log_fatal("Something went wrong in sub-process supervision, " "stopping now. See above for details."); success = false; break; } case SUPERVISOR_EXIT_CLEAN: { success = true; break; } } return supervisor_stop(&supervisor) && success; } /* * service_supervisor calls waitpid() in a loop until the sub processes that * implement our main activities have stopped, and then it cleans-up the PID * file. */ static SupervisorExitMode supervisor_loop(Supervisor *supervisor) { int subprocessCount = supervisor->serviceCount; bool firstLoop = true; /* wait until all subprocesses are done */ while (subprocessCount > 0) { pid_t pid; int status; /* Check that we still own our PID file, or quit now */ (void) check_pidfile(supervisor->pidfile, supervisor->pid); /* If necessary, now is a good time to reload services */ if (asked_to_reload) { log_info("pg_autoctl received a SIGHUP signal, " "reloading configuration"); (void) supervisor_reload_services(supervisor); } if (firstLoop) { firstLoop = false; } else { /* avoid busy looping on waitpid(WNOHANG) */ pg_usleep(100 * 1000); /* 100 ms */ } /* ignore errors */ pid = waitpid(-1, &status, WNOHANG); switch (pid) { case -1: { if (errno == ECHILD) { /* no more childrens */ if (asked_to_stop || asked_to_stop_fast || asked_to_quit) { /* off we go */ log_info("Internal subprocesses are done, stopping"); return true; } log_fatal("Unexpected ECHILD error from waitpid()"); return false; } else { log_debug("Failed to call waitpid(): %m"); } break; } case 0: { /* * We're using WNOHANG, 0 means there are no stopped or exited * children, it's all good. It's the expected case when * everything is running smoothly, so enjoy and sleep for * awhile. 
*/ /* handle SIGTERM and SIGINT if we've received them */ (void) supervisor_handle_signals(supervisor); /* if we're in a shutdown sequence, make sure we terminate */ if (supervisor->shutdownSequenceInProgress) { (void) supervisor_shutdown_sequence(supervisor); } break; } default: { Service *dead = NULL; /* map the dead child pid to the known dead internal service */ if (!supervisor_find_service(supervisor, pid, &dead)) { log_error("Unknown subprocess died with pid %d", pid); break; } /* one child process is no more */ --subprocessCount; /* apply the service restart policy */ if (supervisor_restart_service(supervisor, dead, status)) { ++subprocessCount; } break; } } } /* we track in the main loop if it's a cleanExit or not */ return supervisor->exitMode; } /* * supervisor_find_service loops over the SubProcess array to find given pid and * return its entry in the array. */ static bool supervisor_find_service(Supervisor *supervisor, pid_t pid, Service **result) { int serviceCount = supervisor->serviceCount; int serviceIndex = 0; for (serviceIndex = 0; serviceIndex < serviceCount; serviceIndex++) { if (pid == supervisor->services[serviceIndex].pid) { *result = &(supervisor->services[serviceIndex]); return true; } } return false; } /* * supervisor_reload_services sends SIGHUP to all our services. */ static void supervisor_reload_services(Supervisor *supervisor) { int serviceCount = supervisor->serviceCount; int serviceIndex = 0; for (serviceIndex = 0; serviceIndex < serviceCount; serviceIndex++) { Service *service = &(supervisor->services[serviceIndex]); log_info("Reloading service \"%s\" by signaling pid %d with SIGHUP", service->name, service->pid); if (kill(service->pid, SIGHUP) != 0) { log_error("Failed to send SIGHUP to service %s with pid %d", service->name, service->pid); } } /* reset our signal handling facility */ asked_to_reload = 0; } /* * supervisor_stop_subprocesses calls the stopFunction for all the registered * services to initiate the shutdown sequence. 
*/ static void supervisor_stop_subprocesses(Supervisor *supervisor) { int signal = get_current_signal(SIGTERM); int serviceCount = supervisor->serviceCount; int serviceIndex = 0; for (serviceIndex = 0; serviceIndex < serviceCount; serviceIndex++) { Service *service = &(supervisor->services[serviceIndex]); if (kill(service->pid, signal) != 0) { log_error("Failed to send signal %s to service %s with pid %d", strsignal(signal), service->name, service->pid); } } } /* * supervisor_stop_other_subprocesses sends the QUIT signal to other known * sub-processes when on of does is reported dead. */ static void supervisor_stop_other_services(Supervisor *supervisor, pid_t pid) { int signal = get_current_signal(SIGTERM); int serviceCount = supervisor->serviceCount; int serviceIndex = 0; /* * In case of unexpected stop (bug), we stop the other processes too. * Someone might then notice (such as systemd) and restart the whole * thing again. */ if (!(asked_to_stop || asked_to_stop_fast)) { for (serviceIndex = 0; serviceIndex < serviceCount; serviceIndex++) { Service *service = &(supervisor->services[serviceIndex]); if (service->pid != pid) { if (kill(service->pid, signal) != 0) { log_error("Failed to send signal %s to service %s with pid %d", signal_to_string(signal), service->name, service->pid); } } } } } /* * supervisor_signal_process_group sends a signal to our own process group, * which we are the leader of. * * That's used when we have received a signal already (asked_to_stop || * asked_to_stop_fast) and our sub-processes are still running after a while. 
*/ static bool supervisor_signal_process_group(int signal) { pid_t pid = getpid(); pid_t pgrp = getpgid(pid); if (pgrp == -1) { log_fatal("Failed to get the process group id of pid %d: %m", pid); return false; } if (killpg(pgrp, signal) != 0) { log_error("Failed to send %s to the keeper's pid %d: %m", signal_to_string(signal), pgrp); return false; } return true; } /* * supervisor_init initializes our PID file and sets our signal handlers. */ static bool supervisor_init(Supervisor *supervisor) { bool exitOnQuit = false; log_trace("supervisor_init"); /* Establish a handler for signals. */ (void) set_signal_handlers(exitOnQuit); /* Check that the keeper service is not already running */ if (read_pidfile(supervisor->pidfile, &(supervisor->pid))) { log_fatal("An instance of pg_autoctl is already running with PID %d, " "as seen in pidfile \"%s\"", supervisor->pid, supervisor->pidfile); return false; } /* Ok, we're going to start. Time to create our PID file. */ supervisor->pid = getpid(); if (!create_pidfile(supervisor->pidfile, supervisor->pid)) { log_fatal("Failed to write our PID to \"%s\"", supervisor->pidfile); return false; } return true; } /* * supervisor_stop stops the service and removes the pid file. 
*/ bool supervisor_stop(Supervisor *supervisor) { log_info("Stop pg_autoctl"); if (!remove_pidfile(supervisor->pidfile)) { log_error("Failed to remove pidfile \"%s\"", supervisor->pidfile); return false; } return true; } /* * If we have received a signal that instructs a shutdown, such as SIGTERM or * SIGINT, then we need to do one of these things: * * - first time we receive the signal, begin a shutdown sequence for all * services and the main supervisor itself, * * - when receiving the signal again, if it's a SIGTERM continue the shutdown * sequence, * * - when receiving a SIGINT forward it to our services so as to finish as fast * as we can, and from then on always use SIGINT (to that end we use * supervisor->shutdownSignal) * * Sending SIGTERM and then later SIGINT if the process is still running is a * classic way to handle service shutdown. */ static void supervisor_handle_signals(Supervisor *supervisor) { int signal = get_current_signal(SIGTERM); const char *signalStr = signal_to_string(signal); /* if no signal has been received, we have nothing to do here */ if (!(asked_to_stop || asked_to_stop_fast || asked_to_quit)) { return; } /* * Once we have received and processed SIGQUIT we want to stay at this * signal level. Once we have received SIGINT we may upgrade to SIGQUIT, * but we won't downgrade to SIGTERM. 
*/ supervisor->shutdownSignal = pick_stronger_signal(supervisor->shutdownSignal, signal); log_info("pg_autoctl received signal %s, terminating", signalStr); /* the first time we receive a signal, set the shutdown properties */ if (!supervisor->shutdownSequenceInProgress) { supervisor->exitMode = SUPERVISOR_EXIT_CLEAN; supervisor->shutdownSequenceInProgress = true; } /* forward the signal to all our service to terminate them */ (void) supervisor_stop_subprocesses(supervisor); /* allow for processing signals again: reset signal variables */ switch (signal) { case SIGINT: { asked_to_stop_fast = 0; break; } case SIGTERM: { asked_to_stop = 0; break; } case SIGQUIT: { asked_to_quit = 0; break; } } } /* * supervisor_shutdown_sequence handles the shutdown sequence of the supervisor * and insist towards registered services that now is the time to shutdown when * they fail to do so timely. * * The stoppingLoopCounter is zero on the first loop and we do nothing, when * it's 1 we have been waiting once without any child process reported absent * by waitpid(), tell the user we are waiting. * * At 50 loops (typically we add a 100ms wait per loop), send either SIGTERM or * SIGINT. * * At every 100 loops, send SIGINT. */ static void supervisor_shutdown_sequence(Supervisor *supervisor) { if (supervisor->stoppingLoopCounter == 1) { log_info("Waiting for subprocesses to terminate."); } /* * If we've been waiting for quite a while for sub-processes to terminate. * Let's signal again all our process group ourselves and see what happens * next. */ if (supervisor->stoppingLoopCounter == 50) { log_info("pg_autoctl services are still running, " "signaling them with %s.", signal_to_string(supervisor->shutdownSignal)); if (!supervisor_signal_process_group(supervisor->shutdownSignal)) { log_warn("Still waiting for subprocesses to terminate."); } } /* * Wow it's been a very long time now... 
*/ if (supervisor->stoppingLoopCounter > 0 && supervisor->stoppingLoopCounter % 100 == 0) { log_info("pg_autoctl services are still running, " "signaling them with SIGINT."); /* raise the signal from SIGTERM to SIGINT now */ supervisor->shutdownSignal = pick_stronger_signal(supervisor->shutdownSignal, SIGINT); if (!supervisor_signal_process_group(supervisor->shutdownSignal)) { log_warn("Still waiting for subprocesses to terminate."); } } /* increment our counter */ supervisor->stoppingLoopCounter++; } /* * supervisor_restart_service restarts given service and maintains its MaxR and * MaxT counters. */ static bool supervisor_restart_service(Supervisor *supervisor, Service *service, int status) { uint64_t now = time(NULL); int logLevel = LOG_ERROR; RestartCounters *counters = &(service->restartCounters); /* * If we're in the middle of a shutdown sequence, we won't have to restart * services and apply any restart strategy etc. */ if (supervisor->shutdownSequenceInProgress) { log_trace("supervisor_restart_service: shutdownSequenceInProgress"); return false; } /* refrain from an ERROR message for a TEMPORARY service */ if (service->policy == RP_TEMPORARY) { logLevel = LOG_INFO; } /* when a sub-process has quit and we're not shutting down, warn about it */ else if (WIFEXITED(status) && WEXITSTATUS(status) == EXIT_CODE_QUIT) { logLevel = LOG_WARN; } if (WIFEXITED(status)) { int returnCode = WEXITSTATUS(status); /* sometimes we don't want to restart even a PERMANENT service */ if (returnCode == EXIT_CODE_DROPPED) { supervisor->exitMode = SUPERVISOR_EXIT_CLEAN; supervisor->shutdownSequenceInProgress = true; (void) supervisor_stop_other_services(supervisor, service->pid); return false; } else if (returnCode == EXIT_CODE_FATAL) { supervisor->exitMode = SUPERVISOR_EXIT_FATAL; supervisor->shutdownSequenceInProgress = true; (void) supervisor_stop_other_services(supervisor, service->pid); return false; } /* general case, log and continue to restart the service */ 
log_level(logLevel, "pg_autoctl service %s exited with exit status %d", service->name, returnCode); } else if (WIFSIGNALED(status)) { int signal = WTERMSIG(status); log_level(logLevel, "pg_autoctl service %s exited after receiving signal %s", service->name, strsignal(signal)); } else if (WIFSTOPPED(status)) { /* well that's unexpected, we're not using WUNTRACED */ log_level(logLevel, "pg_autoctl service %s has been stopped and can be restarted", service->name); return false; } /* * We don't restart temporary processes at all: we're done already. */ if (service->policy == RP_TEMPORARY) { return true; } /* * Check that we are allowed to restart: apply MaxR/MaxT as per the * tracking we do in the counters ring buffer. */ if (supervisor_may_restart(service)) { /* update our ring buffer: move our clock hand */ int position = (counters->position + 1) % SUPERVISOR_SERVICE_MAX_RETRY; /* we have restarted once more */ counters->count += 1; counters->position = position; counters->startTime[counters->position] = now; } else { /* exit with a non-zero exit code, and process with shutdown sequence */ supervisor->exitMode = SUPERVISOR_EXIT_ERROR; supervisor->shutdownSequenceInProgress = true; (void) supervisor_stop_other_services(supervisor, service->pid); return false; } /* * When a transient service has quit happily (with a zero exit status), we * just shutdown the whole pg_autoctl. We consider this a clean shutdown. * * The main use case here is with the initialization of a node: unless * using the --run option, we want to shutdown as soon as the * initialisation is done. 
* * That's when using the "create" subcommand as in: * * pg_autoctl create monitor * pg_autoctl create postgres */ if (service->policy == RP_TRANSIENT && WIFEXITED(status) && WEXITSTATUS(status) == EXIT_CODE_QUIT) { /* exit with a happy exit code, and process with shutdown sequence */ supervisor->exitMode = SUPERVISOR_EXIT_CLEAN; supervisor->shutdownSequenceInProgress = true; (void) supervisor_stop_other_services(supervisor, service->pid); return false; } /* * Now the service RestartPolicy is either RP_PERMANENT, and we need to * restart it no matter what, or RP_TRANSIENT with a failure status * (non-zero return code), and we need to start the service in that case * too. */ log_info("Restarting service %s", service->name); bool restarted = (*service->startFunction)(service->context, &(service->pid)); if (!restarted) { log_fatal("Failed to restart service %s", service->name); /* exit with a non-zero exit code, and process with shutdown sequence */ supervisor->exitMode = SUPERVISOR_EXIT_ERROR; supervisor->shutdownSequenceInProgress = true; (void) supervisor_stop_other_services(supervisor, service->pid); return false; } /* * Now we have restarted the service, it has a new PID and we need to * update our PID file with the new information. Failing to update the PID * file is a fatal error: the `pg_autoctl restart` command can't work then. */ if (!supervisor_update_pidfile(supervisor)) { log_fatal("Failed to update pidfile \"%s\", stopping all services now", supervisor->pidfile); supervisor->exitMode = SUPERVISOR_EXIT_ERROR; supervisor->shutdownSequenceInProgress = true; (void) supervisor_stop_subprocesses(supervisor); return false; } return true; } /* * supervisor_count_restarts returns true when we have restarted more than * SUPERVISOR_SERVICE_MAX_RETRY in the last SUPERVISOR_SERVICE_MAX_TIME period * of time. 
*/ static bool supervisor_may_restart(Service *service) { uint64_t now = time(NULL); RestartCounters *counters = &(service->restartCounters); int position = counters->position; char timestring[BUFSIZE] = { 0 }; log_debug("supervisor_may_restart: service \"%s\" restarted %d times, " "most recently at %s, %d seconds ago", service->name, counters->count, epoch_to_string(counters->startTime[position], timestring), (int) (now - counters->startTime[position])); /* until we have restarted MaxR times, we know we can restart */ if (counters->count <= SUPERVISOR_SERVICE_MAX_RETRY) { return true; } /* * When we have restarted more than MaxR times, the only case when we can't * restart again is if the oldest entry in the counters startTime array is * older than our MaxT. * * The oldest entry in the ring buffer is the one just after the current * one: */ position = (position + 1) % SUPERVISOR_SERVICE_MAX_RETRY; uint64_t oldestRestartTime = counters->startTime[position]; if ((now - oldestRestartTime) <= SUPERVISOR_SERVICE_MAX_TIME) { log_fatal("pg_autoctl service %s has already been " "restarted %d times in the last %d seconds, " "stopping now", service->name, SUPERVISOR_SERVICE_MAX_RETRY, (int) (now - oldestRestartTime)); return false; } return true; } /* * supervisor_update_pidfile creates a pidfile with all our PIDs in there. 
*/ static bool supervisor_update_pidfile(Supervisor *supervisor) { int serviceCount = supervisor->serviceCount; int serviceIndex = 0; PQExpBuffer content = createPQExpBuffer(); if (content == NULL) { log_error("Failed to allocate memory to update our PID file"); return false; } if (!prepare_pidfile_buffer(content, supervisor->pid)) { /* errors have already been logged */ destroyPQExpBuffer(content); return false; } /* now add a line per service */ for (serviceIndex = 0; serviceIndex < serviceCount; serviceIndex++) { Service *service = &(supervisor->services[serviceIndex]); /* one line per service, pid space name */ appendPQExpBuffer(content, "%d %s\n", service->pid, service->name); } bool success = write_file(content->data, content->len, supervisor->pidfile); destroyPQExpBuffer(content); return success; } /* * supervisor_find_service_pid reads the pidfile contents and process it line * by line to find the pid of the given service name. */ bool supervisor_find_service_pid(const char *pidfile, const char *serviceName, pid_t *pid) { long fileSize = 0L; char *fileContents = NULL; char *fileLines[BUFSIZE] = { 0 }; int lineNumber; if (!file_exists(pidfile)) { return false; } if (!read_file(pidfile, &fileContents, &fileSize)) { return false; } int lineCount = splitLines(fileContents, fileLines, BUFSIZE); for (lineNumber = 0; lineNumber < lineCount; lineNumber++) { char *separator = NULL; /* skip first lines, see pidfile.h (where we count from 1) */ if ((lineNumber + 1) < PIDFILE_LINE_FIRST_SERVICE) { continue; } if ((separator = strchr(fileLines[lineNumber], ' ')) == NULL) { log_error("Failed to find first space separator in line: \"%s\"", fileLines[lineNumber]); continue; } if (streq(serviceName, separator + 1)) { *separator = '\0'; stringToInt(fileLines[lineNumber], pid); free(fileContents); return true; } } free(fileContents); return false; } 
pg_auto_failover-1.6.3/src/bin/pg_autoctl/supervisor.h000066400000000000000000000102251414244367200231470ustar00rootroot00000000000000/* * src/bin/pg_autoctl/supervisor.h * Utilities to start/stop the pg_autoctl services. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef SUPERVISOR_H #define SUPERVISOR_H #include #include /* * pg_autoctl runs sub-processes as "services", and we need to use the same * service names in several places: * * - the main pidfile, * - the per-service name for the pidfile is derived from this, * - the pg_autoctl do service getpid|restart commands */ #define SERVICE_NAME_POSTGRES "postgres" #define SERVICE_NAME_KEEPER "node-active" #define SERVICE_NAME_MONITOR "listener" /* * At pg_autoctl create time we use a transient service to initialize our local * node. When using the --run option, the transient service is terminated and * we start the permanent service with the name defined above. */ #define SERVICE_NAME_KEEPER_INIT "node-init" #define SERVICE_NAME_MONITOR_INIT "monitor-init" /* * Our supervisor process may retart a service sub-process when it quits, * depending on the exit status and the restart policy that has been choosen: * * - A permanent child process is always restarted. * * - A temporary child process is never restarted. * * - A transient child process is restarted only if it terminates abnormally, * that is, with an exit code other EXIT_CODE_QUIT (zero). */ typedef enum { RP_PERMANENT = 0, RP_TEMPORARY, RP_TRANSIENT } RestartPolicy; /* * Supervisor restart strategy. * * The idea is to restart processes that have failed, so that we can stay * available without external intervention. Sometimes though if the * configuration is wrong or the data directory damaged beyond repair or for * some reasons, the service can't be restarted. 
* * This strategy is inspired by http://erlang.org/doc/man/supervisor.html and * http://erlang.org/doc/design_principles/sup_princ.html#maximum-restart-intensity * * If more than MaxR number of restarts occur in the last MaxT seconds, the * supervisor terminates all the child processes and then itself. The * termination reason for the supervisor itself in that case will be * shutdown. * * SUPERVISOR_SERVICE_MAX_RETRY is MaxR, SUPERVISOR_SERVICE_MAX_TIME is MaxT. */ #define SUPERVISOR_SERVICE_MAX_RETRY 5 #define SUPERVISOR_SERVICE_MAX_TIME 300 /* in seconds */ /* * We use a "ring buffer" of the MaxR most recent retries. * * With an array of SUPERVISOR_SERVICE_MAX_RETRY we can track this amount of * retries and compare the oldest one with the current time to decide if we are * allowed to restart or now, applying MaxT. */ typedef struct RestartCounters { int count; /* how many restarts including first start */ int position; /* array index */ uint64_t startTime[SUPERVISOR_SERVICE_MAX_RETRY]; } RestartCounters; /* * The supervisor works with an array of Service entries. Each service defines * its behavior thanks to a start function, a stop function, and a reload * function. Those are called at different points to adjust to the situation as * seen by the supervisor. * * In particular, services may be started more than once when they fail. */ typedef struct Service { char name[NAMEDATALEN]; /* Service name for the user */ RestartPolicy policy; /* Should we restart the service? 
*/ pid_t pid; /* Service PID */ bool (*startFunction)(void *context, pid_t *pid); void *context; /* Service Context (Monitor or Keeper struct) */ RestartCounters restartCounters; } Service; typedef enum { SUPERVISOR_EXIT_ERROR = 0, SUPERVISOR_EXIT_CLEAN, SUPERVISOR_EXIT_FATAL } SupervisorExitMode; typedef struct Supervisor { Service *services; int serviceCount; char pidfile[MAXPGPATH]; pid_t pid; SupervisorExitMode exitMode; bool shutdownSequenceInProgress; int shutdownSignal; int stoppingLoopCounter; } Supervisor; bool supervisor_start(Service services[], int serviceCount, const char *pidfile); bool supervisor_stop(Supervisor *supervisor); bool supervisor_find_service_pid(const char *pidfile, const char *serviceName, pid_t *pid); #endif /* SUPERVISOR_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/system_utils.c000066400000000000000000000061351414244367200234720ustar00rootroot00000000000000/* * src/bin/pg_autoctl/hardware_utils.c * Utility functions for getting CPU and Memory information. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #if defined(__linux__) #include #else #include #include #include #endif #include #include "log.h" #include "file_utils.h" #include "system_utils.h" #if defined(__linux__) static bool get_system_info_linux(SystemInfo *sysInfo); #endif #if defined(__APPLE__) || defined(BSD) static bool get_system_info_bsd(SystemInfo *sysInfo); #endif /* * get_system_info probes for system information and fills the given SystemInfo * structure with what we found: number of CPUs and total amount of memory. 
*/ bool get_system_info(SystemInfo *sysInfo) { #if defined(__APPLE__) || defined(BSD) return get_system_info_bsd(sysInfo); #elif defined(__linux__) return get_system_info_linux(sysInfo); #else log_error("Failed to get system information: " "Operating System not supported"); return false; #endif } /* * On Linux, use sysinfo(2) and getnprocs(3) */ #if defined(__linux__) static bool get_system_info_linux(SystemInfo *sysInfo) { struct sysinfo linuxSysInfo = { 0 }; if (sysinfo(&linuxSysInfo) != 0) { log_error("Failed to call sysinfo(): %m"); return false; } sysInfo->ncpu = get_nprocs(); sysInfo->totalram = linuxSysInfo.totalram; return true; } #endif /* * FreeBSD, OpenBSD, and darwin use the sysctl(3) API. */ #if defined(__APPLE__) || defined(BSD) static bool get_system_info_bsd(SystemInfo *sysInfo) { unsigned int ncpu = 0; /* the API requires an integer here */ int ncpuMIB[2] = { CTL_HW, HW_NCPU }; #if defined(HW_MEMSIZE) int ramMIB[2] = { CTL_HW, HW_MEMSIZE }; /* MacOS */ #elif defined(HW_PHYSMEM64) int ramMIB[2] = { CTL_HW, HW_PHYSMEM64 }; /* OpenBSD */ #else int ramMIB[2] = { CTL_HW, HW_PHYSMEM }; /* FreeBSD */ #endif size_t cpuSize = sizeof(ncpu); size_t memSize = sizeof(sysInfo->totalram); if (sysctl(ncpuMIB, 2, &ncpu, &cpuSize, NULL, 0) == -1) { log_error("Failed to probe number of CPUs: %m"); return false; } sysInfo->ncpu = (unsigned short) ncpu; if (sysctl(ramMIB, 2, &(sysInfo->totalram), &memSize, NULL, 0) == -1) { log_error("Failed to probe Physical Memory: %m"); return false; } return true; } #endif /* * pretty_print_bytes pretty prints bytes in a human readable form. Given * 17179869184 it places the string "16 GB" in the given buffer. 
*/ void pretty_print_bytes(char *buffer, size_t size, uint64_t bytes) { const char *suffixes[7] = { "B", /* Bytes */ "kB", /* Kilo */ "MB", /* Mega */ "GB", /* Giga */ "TB", /* Tera */ "PB", /* Peta */ "EB" /* Exa */ }; uint sIndex = 0; long double count = bytes; while (count >= 10240 && sIndex < 7) { sIndex++; count /= 1024; } /* forget about having more precision, Postgres wants integers here */ sformat(buffer, size, "%d %s", (int) count, suffixes[sIndex]); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/system_utils.h000066400000000000000000000011741414244367200234750ustar00rootroot00000000000000/* * src/bin/pg_autoctl/system_utils.h * Utility functions for getting CPU and Memory information. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef SYSTEM_UTILS_H #define SYSTEM_UTILS_H #include /* taken from sysinfo(2) on Linux */ typedef struct SystemInfo { uint64_t totalram; /* Total usable main memory size */ unsigned short ncpu; /* Number of current processes */ } SystemInfo; bool get_system_info(SystemInfo *sysInfo); void pretty_print_bytes(char *buffer, size_t size, uint64_t bytes); #endif /* SYSTEM_UTILS_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/systemd_config.c000066400000000000000000000111221414244367200237330ustar00rootroot00000000000000/* * src/bin/pg_autoctl/systemd_config.c * Keeper configuration functions * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include "postgres_fe.h" #include "cli_root.h" #include "defaults.h" #include "ini_file.h" #include "systemd_config.h" #include "log.h" #include "runprogram.h" #define OPTION_SYSTEMD_DESCRIPTION(config) \ make_strbuf_option_default("Unit", "Description", NULL, true, BUFSIZE, \ config->Description, "pg_auto_failover") #define OPTION_SYSTEMD_WORKING_DIRECTORY(config) \ make_strbuf_option_default("Service", "WorkingDirectory", \ NULL, true, BUFSIZE, \ config->WorkingDirectory, "/var/lib/postgresql") #define OPTION_SYSTEMD_ENVIRONMENT_PGDATA(config) \ make_strbuf_option_default("Service", "Environment", \ NULL, true, BUFSIZE, \ config->EnvironmentPGDATA, \ "PGDATA=/var/lib/postgresql/11/pg_auto_failover") #define OPTION_SYSTEMD_USER(config) \ make_strbuf_option_default("Service", "User", NULL, true, BUFSIZE, \ config->User, "postgres") #define OPTION_SYSTEMD_EXECSTART(config) \ make_strbuf_option_default("Service", "ExecStart", NULL, true, BUFSIZE, \ config->ExecStart, "/usr/bin/pg_autoctl run") #define OPTION_SYSTEMD_RESTART(config) \ make_strbuf_option_default("Service", "Restart", NULL, true, BUFSIZE, \ config->Restart, "always") #define OPTION_SYSTEMD_STARTLIMITBURST(config) \ make_int_option_default("Service", "StartLimitBurst", NULL, true, \ &(config->StartLimitBurst), 20) #define OPTION_SYSTEMD_EXECRELOAD(config) \ make_strbuf_option_default("Service", "ExecReload", NULL, true, BUFSIZE, \ config->ExecReload, "/usr/bin/pg_autoctl reload") #define OPTION_SYSTEMD_WANTEDBY(config) \ make_strbuf_option_default("Install", "WantedBy", NULL, true, BUFSIZE, \ config->WantedBy, "multi-user.target") #define SET_INI_OPTIONS_ARRAY(config) \ { \ OPTION_SYSTEMD_DESCRIPTION(config), \ OPTION_SYSTEMD_WORKING_DIRECTORY(config), \ OPTION_SYSTEMD_ENVIRONMENT_PGDATA(config), \ OPTION_SYSTEMD_USER(config), \ OPTION_SYSTEMD_EXECSTART(config), \ OPTION_SYSTEMD_RESTART(config), \ OPTION_SYSTEMD_STARTLIMITBURST(config), \ 
OPTION_SYSTEMD_EXECRELOAD(config), \ OPTION_SYSTEMD_WANTEDBY(config), \ INI_OPTION_LAST \ } /* * systemd_config_init initializes a SystemdServiceConfig with the default * values. */ void systemd_config_init(SystemdServiceConfig *config, const char *pgdata) { IniOption systemdOptions[] = SET_INI_OPTIONS_ARRAY(config); /* time to setup config->pathnames.systemd */ sformat(config->pathnames.systemd, MAXPGPATH, "/etc/systemd/system/%s", KEEPER_SYSTEMD_FILENAME); /* * In its operations pg_autoctl might remove PGDATA and replace it with a * new directory, at pg_basebackup time. It turns out that systemd does not * like that, at all. Let's assign WorkingDirectory to a safe place, like * the HOME of the USER running the service. * * Also we expect to be running the service with the user that owns the * PGDATA directory, rather than the current user. After all, the command * * $ pg_autoctl show systemd -q | sudo tee /etc/systemd/system/... * * Might be ran as root. */ struct stat pgdataStat; if (stat(config->pgSetup.pgdata, &pgdataStat) != 0) { log_error("Failed to grab file stat(1) for \"%s\": %m", config->pgSetup.pgdata); exit(EXIT_CODE_INTERNAL_ERROR); } struct passwd *pw = getpwuid(pgdataStat.st_uid); if (pw) { log_debug("username found in passwd: %s's HOME is \"%s\"", pw->pw_name, pw->pw_dir); strlcpy(config->WorkingDirectory, pw->pw_dir, MAXPGPATH); } /* adjust defaults to known values from the config */ sformat(config->EnvironmentPGDATA, BUFSIZE, "'PGDATA=%s'", config->pgSetup.pgdata); /* adjust the user to the owner of PGDATA */ strlcpy(config->User, pw->pw_name, NAMEDATALEN); /* adjust the program to the current full path of argv[0] */ sformat(config->ExecStart, BUFSIZE, "%s run", pg_autoctl_program); sformat(config->ExecReload, BUFSIZE, "%s reload", pg_autoctl_program); if (!ini_validate_options(systemdOptions)) { log_error("Please review your setup options per above messages"); exit(EXIT_CODE_BAD_CONFIG); } } /* * keeper_config_write write the current config to 
given STREAM. */ bool systemd_config_write(FILE *stream, SystemdServiceConfig *config) { IniOption systemdOptions[] = SET_INI_OPTIONS_ARRAY(config); return write_ini_to_stream(stream, systemdOptions); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/systemd_config.h000066400000000000000000000017571414244367200237550ustar00rootroot00000000000000/* * src/bin/pg_autoctl/systemd_config.h * Keeper integration with systemd service configuration file * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * */ #ifndef SYSTEMD_CONFIG_H #define SYSTEMD_CONFIG_H #include #include #include "config.h" typedef struct SystemdServiceConfig { ConfigFilePaths pathnames; /* UNIT */ char Description[BUFSIZE]; /* Service */ char WorkingDirectory[MAXPGPATH]; char EnvironmentPGDATA[BUFSIZE]; char User[NAMEDATALEN]; char ExecStart[BUFSIZE]; char Restart[BUFSIZE]; int StartLimitBurst; char ExecReload[BUFSIZE]; /* Install */ char WantedBy[BUFSIZE]; /* PostgreSQL setup */ PostgresSetup pgSetup; } SystemdServiceConfig; void systemd_config_init(SystemdServiceConfig *config, const char *pgdata); bool systemd_config_write_file(SystemdServiceConfig *config); bool systemd_config_write(FILE *stream, SystemdServiceConfig *config); #endif /* SYSTEMD_CONFIG_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/watch.c000066400000000000000000001022111414244367200220240ustar00rootroot00000000000000/* * src/bin/pg_autoctl/watch.c * Implementation of a CLI to show events, states, and URI from the * pg_auto_failover monitor. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #include #include #include #include #include #include #include #include #include #include "postgres_fe.h" #include "cli_common.h" #include "commandline.h" #include "defaults.h" #include "env_utils.h" #include "ipaddr.h" #include "keeper_config.h" #include "keeper.h" #include "monitor_config.h" #include "monitor_pg_init.h" #include "monitor.h" #include "nodestate_utils.h" #include "parsing.h" #include "pgctl.h" #include "pghba.h" #include "pgsetup.h" #include "pgsql.h" #include "pidfile.h" #include "state.h" #include "string_utils.h" #include "watch.h" #include "watch_colspecs.h" volatile sig_atomic_t window_size_changed = 0; /* SIGWINCH */ static bool cli_watch_update_from_monitor(WatchContext *context); static bool cli_watch_process_keys(WatchContext *context); static int print_watch_header(WatchContext *context, int r); static int print_watch_footer(WatchContext *context); static int print_nodes_array(WatchContext *context, int r, int c); static int print_events_array(WatchContext *context, int r, int c); static void print_current_time(WatchContext *context, int r); static ColPolicy * pick_column_policy(WatchContext *context); static bool compute_column_spec_lens(WatchContext *context); static int compute_column_size(ColumnType type, NodeAddressHeaders *headers); static void print_column_headers(WatchContext *context, ColPolicy *policy, int r, int c); static void print_node_state(WatchContext *context, ColPolicy *policy, int index, int r, int c); static EventColPolicy * pick_event_column_policy(WatchContext *context); static bool compute_event_column_spec_lens(WatchContext *context); static bool compute_events_sizes(WatchContext *context); static int compute_event_column_size(EventColumnType type, MonitorEventsHeaders *headers); static void print_events_headers(WatchContext *context, EventColPolicy *policy, int r, int c); static int print_event(WatchContext *context, EventColPolicy *policy, int index, int r, int c); static void 
watch_set_state_attributes(NodeState state, bool toggle); static void clear_line_at(int row); /* * catch_sigwinch is registered as the SIGWINCH signal handler. */ static void catch_sigwinch(int sig) { window_size_changed = 1; pqsignal(sig, catch_sigwinch); } /* * watch_main_loop takes over the terminal window and displays the state and * events in there, refreshing the output often, as when using the watch(1) * command, or similar to what top(1) would be doing. */ void cli_watch_main_loop(WatchContext *context) { WatchContext previous = { 0 }; int step = -1; /* the main loop */ for (;;) { instr_time start; instr_time duration; INSTR_TIME_SET_CURRENT(start); /* * First, update the data that we want to display, and process key * strokes. We are going to update our data set twice per second, and * we want to react to key strokes and other events much faster than * this, every 50ms. */ step = (step + 1) % 10; (void) cli_watch_update(context, step); if (context->shouldExit) { break; } /* now display the context we have */ if (context->couldContactMonitor) { (void) cli_watch_render(context, &previous); } else if (!context->cookedMode) { /* get back to "cooked" terminal mode, showing stderr logs */ context->cookedMode = true; def_prog_mode(); endwin(); } /* and then sleep for the rest of the 50 ms */ INSTR_TIME_SET_CURRENT(duration); INSTR_TIME_SUBTRACT(duration, start); int sleepMs = 50 - INSTR_TIME_GET_MILLISEC(duration); if (sleepMs > 0) { pg_usleep(sleepMs * 1000); } /* update the previous context */ previous = *context; } (void) cli_watch_end_window(context); } /* * watch_init_window takes care of displaying information on the current * interactive terminal window, handled with the ncurses API. 
*/ void cli_watch_init_window(WatchContext *context) { struct winsize size = { 0 }; int ioctl_result = 0; if ((ioctl_result = ioctl(STDOUT_FILENO, TIOCGWINSZ, (char *) &size)) >= 0) { resize_term(size.ws_row, size.ws_col); } initscr(); /* Start curses mode */ cbreak(); /* Line buffering disabled */ intrflush(stdscr, FALSE); /* No flushing on interrupts */ keypad(stdscr, TRUE); /* We get F1, F2 etc.. */ noecho(); /* Don't echo() while we do getch */ nonl(); /* Do not translate RETURN key */ nodelay(stdscr, TRUE); /* Non blocking getch() variants */ curs_set(0); /* Do not display the cursor */ pqsignal(SIGWINCH, catch_sigwinch); refresh(); /* get the current size of the window */ getmaxyx(stdscr, context->rows, context->cols); } /* * watch_end_window finishes our ncurses session and gives control back. */ void cli_watch_end_window(WatchContext *context) { refresh(); endwin(); } /* * watch_update updates the context to be displayed on the terminal window. */ bool cli_watch_update(WatchContext *context, int step) { /* only update data from the monitor at step 0 */ if (step == 0) { context->couldContactMonitor = cli_watch_update_from_monitor(context); } /* now process any key pressed by the user */ bool processKeys = cli_watch_process_keys(context); /* failure to process keys signals we should exit now */ context->shouldExit = (processKeys == false); return true; } /* * cli_watch_update_from_monitor fetches the data to display from the * pg_auto_failover monitor database. */ static bool cli_watch_update_from_monitor(WatchContext *context) { Monitor *monitor = &(context->monitor); CurrentNodeStateArray *nodesArray = &(context->nodesArray); MonitorEventsArray *eventsArray = &(context->eventsArray); /* * We use a transaction despite being read-only, because we want to re-use * a single connection to the monitor. 
*/ PGSQL *pgsql = &(monitor->pgsql); pgsql->connectionStatementType = PGSQL_CONNECTION_MULTI_STATEMENT; if (!monitor_get_current_state(monitor, context->formation, context->groupId, nodesArray)) { /* errors have already been logged */ return false; } if (!monitor_get_formation_number_sync_standbys( monitor, context->formation, &(context->number_sync_standbys))) { /* errors have already been logged */ return false; } if (!monitor_get_last_events(monitor, context->formation, context->groupId, EVENTS_BUFFER_COUNT, eventsArray)) { /* errors have already been logged */ return false; } /* time to finish our connection */ pgsql_finish(pgsql); return true; } /* Capture CTRL + a key */ #define ctrl(x) ((x) & 0x1f) /* * cli_watch_process_keys processes the user input. */ static bool cli_watch_process_keys(WatchContext *context) { int ch; /* * Reset our move from the last update session. We need to keep the END * movement set in between update calls, though, because this one is * handled on a line-by-line basis, and is not reflected on the value of * context->startCol. 
*/ if (context->move != WATCH_MOVE_FOCUS_END) { context->move = WATCH_MOVE_FOCUS_NONE; } do { /* we have setup ncurses in non-blocking behaviour */ ch = getch(); if (ch == KEY_F(1) || ch == 'q') { return false; } else if (ch == KEY_RESIZE || window_size_changed == 1) { struct winsize size = { 0 }; window_size_changed = 0; /* get current terminal rows and columns and resize our display */ int ioctl_result = ioctl(STDOUT_FILENO, TIOCGWINSZ, (char *) &size); if (ioctl_result >= 0) { if (context->rows != size.ws_row || context->cols != size.ws_col) { context->rows = size.ws_row; context->cols = size.ws_col; resizeterm(context->rows, context->cols); } } } /* left and right moves are conditionnal / relative */ else if (ch == KEY_LEFT || ch == ctrl('b') || ch == 'h') { if (context->move == WATCH_MOVE_FOCUS_NONE) { context->move = WATCH_MOVE_FOCUS_LEFT; if (context->startCol > 0) { /* move by half the description column */ context->startCol -= (context->cols - 21) / 2; if (context->startCol < 0) { context->startCol = 0; } } } else if (context->move == WATCH_MOVE_FOCUS_RIGHT) { context->move = WATCH_MOVE_FOCUS_NONE; } } /* left and right moves are conditionnal / relative */ else if (ch == KEY_RIGHT || ch == ctrl('f') || ch == 'l') { if (context->move == WATCH_MOVE_FOCUS_NONE) { context->move = WATCH_MOVE_FOCUS_RIGHT; /* move by half the description column */ context->startCol += (context->cols - 21) / 2; } else if (context->move == WATCH_MOVE_FOCUS_LEFT) { context->move = WATCH_MOVE_FOCUS_NONE; } } /* home and end moves are unconditionnal / absolute */ else if (ch == KEY_HOME || ch == ctrl('a') || ch == '0') { context->move = WATCH_MOVE_FOCUS_HOME; context->startCol = 0; } else if (ch == KEY_END || ch == ctrl('e') || ch == '$') { context->move = WATCH_MOVE_FOCUS_END; } /* up is C-p in Emacs, k in vi(m) */ else if (ch == KEY_UP || ch == ctrl('p') || ch == 'k') { context->move = WATCH_MOVE_FOCUS_UP; if (context->selectedRow > 0) { --context->selectedRow; } } /* page up, which 
is also C-u in the terminal with less/more etc */ else if (ch == KEY_PPAGE || ch == ctrl('u')) { if (context->selectedRow > 0 && context->selectedRow <= 6) { context->selectedRow = 1; } else if (context->selectedRow > 6) { context->selectedRow -= 5; } } /* down is C-n in Emacs, j in vi(m) */ else if (ch == KEY_DOWN || ch == ctrl('n') || ch == 'j') { context->move = WATCH_MOVE_FOCUS_DOWN; if (context->selectedRow < context->rows) { ++context->selectedRow; } } /* page down, which is also C-d in the terminal with less/more etc */ else if (ch == KEY_NPAGE || ch == ctrl('d')) { if (context->selectedRow < context->rows && context->selectedRow >= (context->rows - 6)) { context->selectedRow = context->rows - 1; } else if (context->selectedRow < (context->rows - 6)) { context->selectedRow += 5; } } /* cancel current selected row */ else if (ch == KEY_DL || ch == KEY_DC) { context->selectedRow = 0; context->selectedArea = 0; } } while (ch != ERR); return true; } /* * watch_render displays the context on the terminal window. */ bool cli_watch_render(WatchContext *context, WatchContext *previous) { int printedRows = 0; /* on the first call to render, initialize the ncurses terminal control */ if (!context->initialized) { (void) cli_watch_init_window(context); context->initialized = true; } /* * When we fail to contact the monitor, we switch the terminal back to * cookedMode so that the usual stderr logs are visible. In that case the * render function is not called. When cli_watch_render() is called again, * it means we could contact the monitor and get an update, and we need to * take control of the terminal again. 
*/ if (context->cookedMode) { reset_prog_mode(); refresh(); context->cookedMode = false; } /* adjust selected row to fit the selected area */ int nodeHeaderRow = 2; int firstNodeRow = nodeHeaderRow + 1; int lastNodeRow = firstNodeRow + context->nodesArray.count - 1; int eventHeaderRow = lastNodeRow + 2; /* blank line, evenzt headers */ int firstEventRow = eventHeaderRow + 1; int lastEventRow = firstEventRow + context->eventsArray.count - 1; if (lastEventRow > context->rows) { lastEventRow = context->rows; } /* first usage of the arrow keys select an area */ if (context->selectedArea == 0 && context->selectedRow > 0) { context->selectedArea = 1; } /* * Adjust the selectedRow position to make sure we always select a row * that's part of the data: avoid empty separation lines, avoid header * lines. * * We conceptually divide the screen in two areas: first, the nodes array * area, and then the events area. When scrolling away from an area we may * jump to the other area directly. */ if (context->selectedArea == 1) { if (context->selectedRow < firstNodeRow) { context->selectedRow = firstNodeRow; } else if (context->selectedRow > lastNodeRow) { context->selectedArea = 2; context->selectedRow = firstEventRow; } } else if (context->selectedArea == 2) { if (context->selectedRow < firstEventRow) { context->selectedArea = 1; context->selectedRow = lastNodeRow; } else if (context->selectedRow > lastEventRow) { context->selectedRow = lastEventRow; } } /* * Print the main header and then the nodes array. */ printedRows += print_watch_header(context, 0); /* skip empty lines and headers */ (void) clear_line_at(1); ++printedRows; int nodeRows = print_nodes_array(context, nodeHeaderRow, 0); printedRows += nodeRows; (void) clear_line_at(printedRows); /* * Now print the events array. 
Because that operation is more expensive, * and because most of the times there is no event happening, we compare * the current context with the previous one and avoid this part of the * code entirely when we figure out that we would only redisplay what's * already visible on the terminal. */ if (context->rows != previous->rows || context->cols != previous->cols || context->selectedRow != previous->selectedRow || context->selectedArea != previous->selectedArea || context->startCol != previous->startCol || context->cookedMode != previous->cookedMode || context->eventsArray.count != previous->eventsArray.count || (context->eventsArray.events[0].eventId != previous->eventsArray.events[0].eventId)) { (void) clear_line_at(++printedRows); printedRows += print_events_array(context, eventHeaderRow, 0); /* clean the remaining rows that we didn't use for displaying events */ if (printedRows < context->rows) { for (int r = printedRows; r < context->rows; r++) { (void) clear_line_at(r); } } } /* now display the footer */ (void) print_watch_footer(context); refresh(); return true; } /* * print_watch_header prints the first line of the screen, with the current * formation that's being displayed, the number_sync_standbys, and the current * time. */ static int print_watch_header(WatchContext *context, int r) { int c = 0; (void) print_current_time(context, r); mvprintw(r, c, "Formation: "); /* that's 11 chars */ c += 11; attron(A_BOLD); mvprintw(r, c, "%s", context->formation); attroff(A_BOLD); c += strlen(context->formation); /* * Check if we have enough room for a full label here: * - add 9 cols for the date at the end of the line * - add 18 cols for the label " - Sync Standbys: " * - add 3 cols for the number itself (e.g. 
"1") */ if (context->cols > (c + 9 + 18 + 3)) { mvprintw(r, c, " - Sync Standbys: "); /* that's 18 chars */ c += 18; } else { mvprintw(r, c, " - nss: "); /* that's 8 chars */ c += 8; } attron(A_BOLD); mvprintw(r, c, "%d", context->number_sync_standbys); attroff(A_BOLD); /* we only use one row */ return 1; } /* * print_current_time prints the current time on the far right of the first * line of the screen. */ static void print_current_time(WatchContext *context, int r) { uint64_t now = time(NULL); char timestring[MAXCTIMESIZE] = { 0 }; /* make sure we start with an empty line */ (void) clear_line_at(0); /* format the current time to be user-friendly */ epoch_to_string(now, timestring); /* "Wed Jun 30 21:49:08 1993" -> "21:49:08" */ timestring[11 + 8] = '\0'; clear_line_at(r); mvprintw(r, context->cols - 9, "%s", timestring + 11); } /* * print_watch_footer prints the last line of the screen, an help message. */ static int print_watch_footer(WatchContext *context) { int r = context->rows - 1; char *help = "Press F1 to exit"; attron(A_STANDOUT); mvprintw(r, context->cols - strlen(help), help); attroff(A_STANDOUT); /* we only use one row */ return 1; } /* * print_nodes_array prints a nodes array at the given position (r, c) in a * window of size (context->rows, context->cols). 
*/ static int print_nodes_array(WatchContext *context, int r, int c) { CurrentNodeStateArray *nodesArray = &(context->nodesArray); int lines = 0; int currentRow = r; (void) compute_column_spec_lens(context); ColPolicy *columnPolicy = pick_column_policy(context); if (columnPolicy == NULL) { clear(); mvprintw(0, 0, "Window too small: %dx%d", context->rows, context->cols); refresh(); return false; } /* display the headers */ clear_line_at(currentRow); (void) print_column_headers(context, columnPolicy, currentRow++, c); ++lines; /* display the data */ for (int index = 0; index < nodesArray->count; index++) { bool selected = currentRow == context->selectedRow; clear_line_at(currentRow); if (selected) { attron(A_REVERSE); } (void) print_node_state(context, columnPolicy, index, currentRow++, c); if (selected) { attroff(A_REVERSE); } ++lines; if (context->rows <= currentRow) { break; } } return lines; } /* * pick_column_spec chooses which column spec should be used depending on the * current size (rows, cols) of the display, and given update column specs with * the actual lenghts of the data to be displayed. */ static ColPolicy * pick_column_policy(WatchContext *context) { ColPolicy *bestPolicy = NULL; for (int i = 0; i < ColumnPoliciesCount; i++) { /* minimal, terse, verbose, full */ ColPolicy *policy = &(ColumnPolicies[i]); if (policy->totalSize <= context->cols && bestPolicy == NULL) { bestPolicy = policy; } else if (policy->totalSize <= context->cols && policy->totalSize >= bestPolicy->totalSize) { bestPolicy = policy; } } return bestPolicy; } /* * compute_column_spec_lens computes the len of each known column * specification, given the actual data to print. 
*/ static bool compute_column_spec_lens(WatchContext *context) { CurrentNodeStateArray *nodesArray = &(context->nodesArray); PgInstanceKind firstNodeKind = NODE_KIND_UNKNOWN; if (nodesArray->count > 0) { firstNodeKind = nodesArray->nodes[0].pgKind; } (void) nodestatePrepareHeaders(nodesArray, firstNodeKind); for (int i = 0; i < ColumnPoliciesCount; i++) { /* minimal, terse, verbose, full */ ColPolicy *policy = &(ColumnPolicies[i]); /* reset last computed size */ policy->totalSize = 0; for (int col = 0; policy->specs[col].type != COLUMN_TYPE_LAST; col++) { ColumnType cType = policy->specs[col].type; int headerLen = strlen(policy->specs[col].name); int dataLen = compute_column_size(cType, &(nodesArray->headers)); /* the column header name might be larger than the data */ int len = headerLen > dataLen ? headerLen : dataLen; policy->specs[col].len = len; policy->totalSize += len + 1; /* add one space between columns */ } /* remove extra space after last column */ policy->totalSize -= 1; } return true; } /* * compute_column_size returns the size needed to display a given column type * given the pre-computed size of the nodes array header, where the alignment * with the rest of the array is taken in consideration. 
*/ static int compute_column_size(ColumnType type, NodeAddressHeaders *headers) { switch (type) { case COLUMN_TYPE_NAME: { return headers->maxNameSize; } case COLUMN_TYPE_ID: { return headers->maxNodeSize; } case COLUMN_TYPE_REPLICATION_QUORUM: { /* that one is going to be "yes" or "no" */ return 3; } case COLUMN_TYPE_CANDIDATE_PRIORITY: { /* that's an integer in the range 0..100 */ return 3; } case COLUMN_TYPE_HOST_PORT: { return headers->maxHostSize; } case COLUMN_TYPE_TLI_LSN: { return headers->maxLSNSize; } case COLUMN_TYPE_CONN_HEALTH: { return headers->maxHealthSize; } case COLUMN_TYPE_CONN_HEALTH_LAG: { /* that's an interval in seconds/mins/hours/days: XXuYYu */ return 7; } case COLUMN_TYPE_CONN_REPORT_LAG: { /* that's an interval in seconds/mins/hours/days: XXuYYu */ return 7; } case COLUMN_TYPE_REPORTED_STATE: { return headers->maxStateSize; } case COLUMN_TYPE_ASSIGNED_STATE: { return headers->maxStateSize; } default: { log_fatal("BUG: compute_column_size(%d)", type); exit(EXIT_CODE_INTERNAL_ERROR); } } /* keep compiler happy */ return 0; } /* * print_column_headers prints the headers of the selection column policy. */ static void print_column_headers(WatchContext *context, ColPolicy *policy, int r, int c) { int cc = c; clear_line_at(r); attron(A_STANDOUT); for (int col = 0; col < COLUMN_TYPE_LAST; col++) { int len = policy->specs[col].len; char *name = policy->specs[col].name; mvprintw(r, cc, "%*s ", len, name); cc += len + 1; } attroff(A_STANDOUT); } /* * print_node_state prints the given nodestate with the selected column policy. 
*/ static void print_node_state(WatchContext *context, ColPolicy *policy, int index, int r, int c) { CurrentNodeStateArray *nodesArray = &(context->nodesArray); CurrentNodeState *nodeState = &(nodesArray->nodes[index]); char hostport[BUFSIZE] = { 0 }; char composedId[BUFSIZE] = { 0 }; char tliLSN[BUFSIZE] = { 0 }; char connection[BUFSIZE] = { 0 }; char healthChar = nodestateHealthToChar(nodeState->health); (void) nodestatePrepareNode(&(nodesArray->headers), &(nodeState->node), nodeState->groupId, hostport, composedId, tliLSN); if (healthChar == ' ') { sformat(connection, BUFSIZE, "%s", nodestateConnectionType(nodeState)); } else { sformat(connection, BUFSIZE, "%s %c", nodestateConnectionType(nodeState), healthChar); } int cc = c; for (int col = 0; policy->specs[col].type != COLUMN_TYPE_LAST; col++) { ColumnType cType = policy->specs[col].type; int len = policy->specs[col].len; switch (cType) { case COLUMN_TYPE_NAME: { mvprintw(r, cc, "%*s", len, nodeState->node.name); break; } case COLUMN_TYPE_ID: { mvprintw(r, cc, "%*s", len, composedId); break; } case COLUMN_TYPE_REPLICATION_QUORUM: { mvprintw(r, cc, "%*s", len, nodeState->replicationQuorum ? 
"yes" : "no"); break; } case COLUMN_TYPE_CANDIDATE_PRIORITY: { mvprintw(r, cc, "%*d", len, nodeState->candidatePriority); break; } case COLUMN_TYPE_HOST_PORT: { mvprintw(r, cc, "%*s", len, hostport); break; } case COLUMN_TYPE_TLI_LSN: { mvprintw(r, cc, "%*s", len, tliLSN); break; } case COLUMN_TYPE_CONN_HEALTH: { if (nodeState->health != 1) { attron(A_REVERSE | A_BOLD); } mvprintw(r, cc, "%*s", len, connection); if (nodeState->health != 1) { attroff(A_REVERSE | A_BOLD); } break; } case COLUMN_TYPE_CONN_HEALTH_LAG: { char str[9] = { 0 }; (void) IntervalToString(nodeState->healthLag, str, sizeof(str)); mvprintw(r, cc, "%*s", len, str); break; } case COLUMN_TYPE_CONN_REPORT_LAG: { char str[9] = { 0 }; if (nodeState->reportLag > 10.0) { attron(A_REVERSE); } (void) IntervalToString(nodeState->reportLag, str, sizeof(str)); mvprintw(r, cc, "%*s", len, str); if (nodeState->reportLag > 10.0) { attroff(A_REVERSE); } break; } case COLUMN_TYPE_REPORTED_STATE: { watch_set_state_attributes(nodeState->reportedState, true); mvprintw(r, cc, "%*s", len, NodeStateToString(nodeState->reportedState)); watch_set_state_attributes(nodeState->reportedState, false); break; } case COLUMN_TYPE_ASSIGNED_STATE: { watch_set_state_attributes(nodeState->goalState, true); mvprintw(r, cc, "%*s", len, NodeStateToString(nodeState->goalState)); watch_set_state_attributes(nodeState->goalState, false); break; } default: { log_fatal("BUG: print_node_state(%d)", cType); exit(EXIT_CODE_INTERNAL_ERROR); } } cc += len; mvprintw(r, cc++, " "); } } /* * Routine used to set attributes to display node states. 
*/ static void watch_set_state_attributes(NodeState state, bool toggle) { switch (state) { /* states where Postgres is not running */ case DEMOTED_STATE: case DEMOTE_TIMEOUT_STATE: case DRAINING_STATE: case REPORT_LSN_STATE: { if (toggle) { attron(A_BOLD); } else { attroff(A_BOLD); } break; } /* states where the node is not participating in the failover */ case MAINTENANCE_STATE: case WAIT_MAINTENANCE_STATE: case PREPARE_MAINTENANCE_STATE: case WAIT_STANDBY_STATE: case DROPPED_STATE: { if (toggle) { attron(A_DIM | A_UNDERLINE); } else { attroff(A_DIM | A_UNDERLINE); } break; } default: { /* do not change attributes for most cases */ break; } } } /* * clear_line_at clears the line at given row number by displaying space * characters on the whole line. */ static void clear_line_at(int row) { move(row, 0); clrtoeol(); } /* * pick_event_column_policy chooses which column spec should be used depending * on the current size (rows, cols) of the display, and given update column * specs with the actual lenghts of the data to be displayed. */ static EventColPolicy * pick_event_column_policy(WatchContext *context) { EventColPolicy *bestPolicy = NULL; for (int i = 0; i < EventColumnPoliciesCount; i++) { /* minimal, terse, verbose, full */ EventColPolicy *policy = &(EventColumnPolicies[i]); if (policy->totalSize <= context->cols && bestPolicy == NULL) { bestPolicy = policy; } else if (policy->totalSize <= context->cols && policy->totalSize >= bestPolicy->totalSize) { bestPolicy = policy; } } return bestPolicy; } /* * compute_column_spec_lens computes the len of each known column * specification, given the actual data to print. 
*/ static bool compute_event_column_spec_lens(WatchContext *context) { MonitorEventsHeaders *headers = &(context->eventsHeaders); (void) compute_events_sizes(context); for (int i = 0; i < EventColumnPoliciesCount; i++) { /* minimal, terse, verbose, full */ EventColPolicy *policy = &(EventColumnPolicies[i]); /* reset last computed size */ policy->totalSize = 0; for (int col = 0; policy->specs[col].type != EVENT_COLUMN_TYPE_LAST; col++) { EventColumnType cType = policy->specs[col].type; int headerLen = strlen(policy->specs[col].name); int dataLen = compute_event_column_size(cType, headers); /* the column header name might be larger than the data */ int len = headerLen > dataLen ? headerLen : dataLen; policy->specs[col].len = len; policy->totalSize += len + 1; /* add one space between columns */ } /* remove extra space after last column */ policy->totalSize -= 1; } return true; } /* * compute events len properties (maximum length for the columns we have) */ static bool compute_events_sizes(WatchContext *context) { MonitorEventsArray *eventsArray = &(context->eventsArray); MonitorEventsHeaders *headers = &(context->eventsHeaders); for (int index = 0; index < eventsArray->count; index++) { MonitorEvent *event = &(eventsArray->events[index]); int idSize = log10(event->eventId) + 1; int nameSize = strlen(event->nodeName); int timeSize = 19; /* "YYYY-MM-DD HH:MI:SS" is 19 chars long */ int descSize = 60; /* desc. 
has horizontal scrolling */ if (headers->maxEventIdSize < idSize) { headers->maxEventIdSize = idSize; } if (headers->maxEventTimeSize < timeSize) { headers->maxEventTimeSize = timeSize; } if (headers->maxEventNodeNameSize < nameSize) { headers->maxEventNodeNameSize = nameSize; } if (headers->maxEventDescSize < descSize) { headers->maxEventDescSize = descSize; } } return true; } /* * compute_event_column_size returns the size needed to display a given column * type given the pre-computed size of the events array header, where the * alignment with the rest of the array is taken in consideration. */ static int compute_event_column_size(EventColumnType type, MonitorEventsHeaders *headers) { switch (type) { case EVENT_COLUMN_TYPE_ID: { return headers->maxEventIdSize; } case EVENT_COLUMN_TYPE_TIME: { return headers->maxEventTimeSize; } case EVENT_COLUMN_TYPE_NODE_NAME: { return headers->maxEventNodeNameSize; } case EVENT_COLUMN_TYPE_DESCRIPTION: { return headers->maxEventDescSize; } default: { log_fatal("BUG: compute_event_column_size(%d)", type); exit(EXIT_CODE_INTERNAL_ERROR); } } /* keep compiler happy */ return 0; } /* * print_events_array prints an events array at the given position (r, c) in a * window of size (context-rows, context->cols). */ static int print_events_array(WatchContext *context, int r, int c) { MonitorEventsArray *eventsArray = &(context->eventsArray); int lines = 0; int currentRow = r; int maxStartCol = 0; /* compute column sizes */ (void) compute_event_column_spec_lens(context); /* pick a display policy for the events table */ EventColPolicy *eventColumnPolicy = pick_event_column_policy(context); if (eventColumnPolicy == NULL) { clear(); mvprintw(0, 0, "Window too small: %dx%d", context->rows, context->cols); refresh(); return false; } /* display the events headers */ (void) print_events_headers(context, eventColumnPolicy, currentRow++, c); ++lines; int capacity = context->rows - currentRow; int start = eventsArray->count <= capacity ? 
0 : eventsArray->count - capacity; /* display most recent events first */ for (int index = eventsArray->count - 1; index >= start; index--) { bool selected = currentRow == context->selectedRow; clear_line_at(currentRow); if (selected) { attron(A_REVERSE); } int sc = print_event(context, eventColumnPolicy, index, currentRow, c); if (sc > maxStartCol) { maxStartCol = sc; } if (selected) { attroff(A_REVERSE); } if (context->rows < currentRow) { break; } ++currentRow; ++lines; } /* reset context->startCol to something sensible when it needs to be */ if (maxStartCol > 0 && maxStartCol < context->startCol) { context->startCol = maxStartCol; } return lines; } /* * print_node_state prints the given nodestate with the selected column policy. */ static int print_event(WatchContext *context, EventColPolicy *policy, int index, int r, int c) { MonitorEventsArray *eventsArray = &(context->eventsArray); MonitorEvent *event = &(eventsArray->events[index]); int cc = c; int startCol = context->startCol; for (int col = 0; policy->specs[col].type < EVENT_COLUMN_TYPE_LAST; col++) { EventColumnType cType = policy->specs[col].type; int len = policy->specs[col].len; switch (cType) { case EVENT_COLUMN_TYPE_ID: { mvprintw(r, cc, "%*d", len, event->eventId); break; } case EVENT_COLUMN_TYPE_TIME: { mvprintw(r, cc, "%*s", len, event->eventTime); break; } case EVENT_COLUMN_TYPE_NODE_NAME: { mvprintw(r, cc, "%*s", len, event->nodeName); break; } case EVENT_COLUMN_TYPE_DESCRIPTION: { char *text = event->description; int len = strlen(text); /* when KEY_END is used, ensure we see the end of text */ if (context->move == WATCH_MOVE_FOCUS_END) { /* * The eventTime format plus spacing takes up 21 chars * on-screen */ if (strlen(text) > (context->cols - cc)) { text = text + len - cc; } } else if (context->startCol > 0 && len > (context->cols - cc)) { /* * Shift our text following the current startCol, or if we * don't have that many chars in the text, then shift from * as much as we can in steps of 10 
increments. */ int step = (context->cols - cc) / 2; for (; startCol > 0; startCol -= step) { if (len >= startCol) { text = text + startCol; break; } } } mvprintw(r, cc, "%s%s", text == event->description ? " " : " -- ", text); break; } default: { log_fatal("BUG: print_event(%d)", cType); exit(EXIT_CODE_INTERNAL_ERROR); } } /* We know DESCRIPTION is the last column, and we skip computing its * actual size... so the len of this field is a static value (60). * Avoid printing the column separator in the middle of the actual * description text. */ if (cType != EVENT_COLUMN_TYPE_DESCRIPTION) { cc += len; mvprintw(r, cc, " "); cc += 2; } } return startCol; } /* * print_column_headers prints the headers of the selection column policy. */ static void print_events_headers(WatchContext *context, EventColPolicy *policy, int r, int c) { int cc = c; clear_line_at(r); attron(A_STANDOUT); for (int col = 0; col < EVENT_COLUMN_TYPE_LAST; col++) { int len = policy->specs[col].len; char *name = policy->specs[col].name; EventColumnType cType = policy->specs[col].type; /* the description field takes all that's left on the display */ if (cType == EVENT_COLUMN_TYPE_DESCRIPTION) { mvprintw(r, cc, " %-*s", context->cols - cc - 1, name); } else { mvprintw(r, cc, "%*s", len, name); } cc += len; mvprintw(r, cc, " "); cc += 2; } attroff(A_STANDOUT); } pg_auto_failover-1.6.3/src/bin/pg_autoctl/watch.h000066400000000000000000000035721414244367200220430ustar00rootroot00000000000000/* * src/bin/pg_autoctl/watch.h * Implementation of a CLI to show events, states, and URI from the * pg_auto_failover monitor. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #ifndef WATCH_H #define WATCH_H #include #include #include #include #include "defaults.h" #include "monitor.h" #include "nodestate_utils.h" typedef enum { WATCH_MOVE_FOCUS_NONE = 0, WATCH_MOVE_FOCUS_LEFT, WATCH_MOVE_FOCUS_RIGHT, WATCH_MOVE_FOCUS_HOME, WATCH_MOVE_FOCUS_END, WATCH_MOVE_FOCUS_UP, WATCH_MOVE_FOCUS_DOWN } WatchMoveFocus; /* compute max size of items to display for events */ typedef struct MonitorEventsHeaders { int maxEventIdSize; int maxEventTimeSize; int maxEventNodeNameSize; int maxEventDescSize; } MonitorEventsHeaders; #define EVENTS_BUFFER_COUNT 80 /* share a context between the update and render functions */ typedef struct WatchContext { /* state of the display */ int rows; int cols; int selectedRow; int selectedArea; /* area 1: node states, area 2: node events */ int startCol; WatchMoveFocus move; /* internal state */ bool initialized; bool cookedMode; bool shouldExit; /* true when q or F1 have been pressed */ bool couldContactMonitor; /* parameters used to fetch the data we display */ Monitor monitor; char formation[NAMEDATALEN]; int groupId; int number_sync_standbys; /* data to display */ CurrentNodeStateArray nodesArray; MonitorEventsArray eventsArray; MonitorEventsHeaders eventsHeaders; } WatchContext; void cli_watch_main_loop(WatchContext *context); void cli_watch_init_window(WatchContext *context); void cli_watch_end_window(WatchContext *context); bool cli_watch_update(WatchContext *context, int step); bool cli_watch_render(WatchContext *context, WatchContext *previous); #endif /* WATCH_H */ pg_auto_failover-1.6.3/src/bin/pg_autoctl/watch_colspecs.h000066400000000000000000000165721414244367200237420ustar00rootroot00000000000000/* * src/bin/pg_autoctl/watch.h * Implementation of a CLI to show events, states, and URI from the * pg_auto_failover monitor. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* */ #ifndef WATCH_COLSPECSH #define WATCH_COLSPECSH #include "watch.h" /* Column Specifications, so that we adapt to the actual/current screen size */ typedef enum { COLUMN_TYPE_NAME = 0, COLUMN_TYPE_ID, COLUMN_TYPE_REPLICATION_QUORUM, COLUMN_TYPE_CANDIDATE_PRIORITY, COLUMN_TYPE_HOST_PORT, COLUMN_TYPE_TLI_LSN, COLUMN_TYPE_CONN_HEALTH, COLUMN_TYPE_CONN_HEALTH_LAG, COLUMN_TYPE_CONN_REPORT_LAG, COLUMN_TYPE_REPORTED_STATE, COLUMN_TYPE_ASSIGNED_STATE, COLUMN_TYPE_LAST } ColumnType; typedef struct ColSpec { ColumnType type; char name[NAMEDATALEN]; int len; } ColSpec; #define MAX_COL_SPECS 12 typedef struct ColPolicy { char name[NAMEDATALEN]; int totalSize; ColSpec specs[MAX_COL_SPECS]; } ColPolicy; /* * A column policy is a list of column specifications. * * We have a static list of policies, and we pick one at run-time depending on * the current size of the terminal window and depending on the actual data * size to be displayed, which is also dynamic. */ ColPolicy ColumnPolicies[] = { { "minimal", 0, { { COLUMN_TYPE_ID, "Id", 0 }, { COLUMN_TYPE_REPORTED_STATE, "Reported State", 0 }, { COLUMN_TYPE_ASSIGNED_STATE, "Assigned State", 0 }, { COLUMN_TYPE_LAST, "", 0 } } }, { "very terse", 0, { { COLUMN_TYPE_NAME, "Name", 0 }, { COLUMN_TYPE_CONN_REPORT_LAG, "Report", 0 }, { COLUMN_TYPE_REPORTED_STATE, "Reported State", 0 }, { COLUMN_TYPE_ASSIGNED_STATE, "Assigned State", 0 }, { COLUMN_TYPE_LAST, "", 0 } } }, { "quite terse", 0, { { COLUMN_TYPE_NAME, "Name", 0 }, { COLUMN_TYPE_ID, "Id", 0 }, { COLUMN_TYPE_CONN_REPORT_LAG, "Report", 0 }, { COLUMN_TYPE_REPORTED_STATE, "Reported State", 0 }, { COLUMN_TYPE_ASSIGNED_STATE, "Assigned State", 0 }, { COLUMN_TYPE_LAST, "", 0 } } }, { "terse", 0, { { COLUMN_TYPE_NAME, "Name", 0 }, { COLUMN_TYPE_CONN_HEALTH, "Connection", 0 }, { COLUMN_TYPE_CONN_REPORT_LAG, "Report", 0 }, { COLUMN_TYPE_REPORTED_STATE, "Reported State", 0 }, { COLUMN_TYPE_ASSIGNED_STATE, "Assigned State", 0 }, { COLUMN_TYPE_LAST, "", 0 } } }, { "standard", 0, { { 
COLUMN_TYPE_NAME, "Name", 0 }, { COLUMN_TYPE_ID, "Id", 0 }, { COLUMN_TYPE_CONN_HEALTH, "Connection", 0 }, { COLUMN_TYPE_CONN_REPORT_LAG, "Last Report", 0 }, { COLUMN_TYPE_REPORTED_STATE, "Reported State", 0 }, { COLUMN_TYPE_ASSIGNED_STATE, "Assigned State", 0 }, { COLUMN_TYPE_LAST, "", 0 } } }, { "semi verbose", 0, { { COLUMN_TYPE_NAME, "Name", 0 }, { COLUMN_TYPE_ID, "Id", 0 }, { COLUMN_TYPE_CONN_HEALTH_LAG, "Check", 0 }, { COLUMN_TYPE_CONN_HEALTH, "Connection", 0 }, { COLUMN_TYPE_CONN_REPORT_LAG, "Report", 0 }, { COLUMN_TYPE_REPORTED_STATE, "Reported State", 0 }, { COLUMN_TYPE_ASSIGNED_STATE, "Assigned State", 0 }, { COLUMN_TYPE_LAST, "", 0 } } }, { "verbose", 0, { { COLUMN_TYPE_NAME, "Name", 0 }, { COLUMN_TYPE_ID, "Node", 0 }, { COLUMN_TYPE_REPLICATION_QUORUM, "Quorum", 0 }, { COLUMN_TYPE_CANDIDATE_PRIORITY, "Priority", 0 }, { COLUMN_TYPE_CONN_HEALTH_LAG, "Check", 0 }, { COLUMN_TYPE_CONN_HEALTH, "Connection", 0 }, { COLUMN_TYPE_CONN_REPORT_LAG, "Report", 0 }, { COLUMN_TYPE_REPORTED_STATE, "Reported State", 0 }, { COLUMN_TYPE_ASSIGNED_STATE, "Assigned State", 0 }, { COLUMN_TYPE_LAST, "", 0 } } }, { "almost full", 0, { { COLUMN_TYPE_NAME, "Name", 0 }, { COLUMN_TYPE_ID, "Node", 0 }, { COLUMN_TYPE_REPLICATION_QUORUM, "Quorum", 0 }, { COLUMN_TYPE_CANDIDATE_PRIORITY, "Priority", 0 }, { COLUMN_TYPE_TLI_LSN, "TLI: LSN", 0 }, { COLUMN_TYPE_CONN_HEALTH_LAG, "Check", 0 }, { COLUMN_TYPE_CONN_HEALTH, "Connection", 0 }, { COLUMN_TYPE_CONN_REPORT_LAG, "Report", 0 }, { COLUMN_TYPE_REPORTED_STATE, "Reported State", 0 }, { COLUMN_TYPE_ASSIGNED_STATE, "Assigned State", 0 }, { COLUMN_TYPE_LAST, "", 0 } } }, { "full", 0, { { COLUMN_TYPE_NAME, "Name", 0 }, { COLUMN_TYPE_ID, "Node", 0 }, { COLUMN_TYPE_REPLICATION_QUORUM, "Quorum", 0 }, { COLUMN_TYPE_CANDIDATE_PRIORITY, "Priority", 0 }, { COLUMN_TYPE_HOST_PORT, "Host:Port", 0 }, { COLUMN_TYPE_TLI_LSN, "TLI: LSN", 0 }, { COLUMN_TYPE_CONN_HEALTH_LAG, "Check", 0 }, { COLUMN_TYPE_CONN_HEALTH, "Connection", 0 }, { 
COLUMN_TYPE_CONN_REPORT_LAG, "Last Report", 0 }, { COLUMN_TYPE_REPORTED_STATE, "Reported State", 0 }, { COLUMN_TYPE_ASSIGNED_STATE, "Assigned State", 0 }, { COLUMN_TYPE_LAST, "", 0 } } }, { "fully verbose", 0, { { COLUMN_TYPE_NAME, "Name", 0 }, { COLUMN_TYPE_ID, "Node", 0 }, { COLUMN_TYPE_REPLICATION_QUORUM, "Quorum", 0 }, { COLUMN_TYPE_CANDIDATE_PRIORITY, "Priority", 0 }, { COLUMN_TYPE_HOST_PORT, "Host:Port", 0 }, { COLUMN_TYPE_TLI_LSN, "TLI: LSN", 0 }, { COLUMN_TYPE_CONN_HEALTH_LAG, "Last Check", 0 }, { COLUMN_TYPE_CONN_HEALTH, "Connection", 0 }, { COLUMN_TYPE_CONN_REPORT_LAG, "Last Report", 0 }, { COLUMN_TYPE_REPORTED_STATE, "Reported State", 0 }, { COLUMN_TYPE_ASSIGNED_STATE, "Assigned State", 0 }, { COLUMN_TYPE_LAST, "", 0 } } } }; int ColumnPoliciesCount = sizeof(ColumnPolicies) / sizeof(ColumnPolicies[0]); /* * Events column specifications and policies. */ typedef enum { EVENT_COLUMN_TYPE_ID = 0, EVENT_COLUMN_TYPE_TIME, EVENT_COLUMN_TYPE_FORMATION, EVENT_COLUMN_TYPE_NODE_ID, EVENT_COLUMN_TYPE_GROUP_ID, EVENT_COLUMN_TYPE_NODE_NAME, EVENT_COLUMN_TYPE_NODE_HOST, EVENT_COLUMN_TYPE_NODE_PORT, EVENT_COLUMN_TYPE_REPORTED_STATE, EVENT_COLUMN_TYPE_ASSIGNED_STATE, EVENT_COLUMN_TYPE_REPLICATION_STATE, EVENT_COLUMN_TYPE_TIMELINE, EVENT_COLUMN_TYPE_LSN, EVENT_COLUMN_TYPE_CANDIDATE_PRIORITY, EVENT_COLUMN_TYPE_REPLICATION_QUORUM, EVENT_COLUMN_TYPE_DESCRIPTION, EVENT_COLUMN_TYPE_LAST } EventColumnType; typedef struct EventColSpec { EventColumnType type; char name[NAMEDATALEN]; int len; } EventColSpec; #define MAX_EVENT_COL_SPECS 16 typedef struct EventColPolicy { char name[NAMEDATALEN]; int totalSize; EventColSpec specs[MAX_COL_SPECS]; } EventColPolicy; /* * A column policy is a list of column specifications. * * We have a static list of policies, and we pick one at run-time depending on * the current size of the terminal window and depending on the actual data * size to be displayed, which is also dynamic. 
*/ EventColPolicy EventColumnPolicies[] = { { "minimal", 0, { { EVENT_COLUMN_TYPE_ID, "Id", 0 }, { EVENT_COLUMN_TYPE_DESCRIPTION, "Description", 0 }, { EVENT_COLUMN_TYPE_LAST, "", 0 } } }, { "standard", 0, { { EVENT_COLUMN_TYPE_TIME, "Event Time", 0 }, { EVENT_COLUMN_TYPE_DESCRIPTION, "Description", 0 }, { EVENT_COLUMN_TYPE_LAST, "", 0 } } }, { "almost verbose", 0, { { EVENT_COLUMN_TYPE_ID, "Id", 0 }, { EVENT_COLUMN_TYPE_TIME, "Event Time", 0 }, { EVENT_COLUMN_TYPE_DESCRIPTION, "Description", 0 }, { EVENT_COLUMN_TYPE_LAST, "", 0 } } }, { "verbose", 0, { { EVENT_COLUMN_TYPE_ID, "Id", 0 }, { EVENT_COLUMN_TYPE_TIME, "Event Time", 0 }, { EVENT_COLUMN_TYPE_NODE_NAME, "Name", 0 }, { EVENT_COLUMN_TYPE_DESCRIPTION, "Description", 0 }, { EVENT_COLUMN_TYPE_LAST, "", 0 } } }, }; int EventColumnPoliciesCount = sizeof(EventColumnPolicies) / sizeof(EventColumnPolicies[0]); #endif /* WATCH_COLSPECSH */ pg_auto_failover-1.6.3/src/monitor/000077500000000000000000000000001414244367200173335ustar00rootroot00000000000000pg_auto_failover-1.6.3/src/monitor/.gitignore000066400000000000000000000000101414244367200213120ustar00rootroot00000000000000results pg_auto_failover-1.6.3/src/monitor/Makefile000066400000000000000000000041311414244367200207720ustar00rootroot00000000000000# Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the PostgreSQL License. 
EXTENSION = pgautofailover EXTVERSION = 1.6 SRC_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) DATA_built = $(EXTENSION)--$(EXTVERSION).sql DATA = $(EXTENSION)--1.0.sql $(patsubst ${SRC_DIR}%,%,$(wildcard ${SRC_DIR}$(EXTENSION)--*--*.sql)) # compilation configuration MODULE_big = $(EXTENSION) OBJS = $(patsubst ${SRC_DIR}%.c,%.o,$(wildcard ${SRC_DIR}*.c)) PG_CPPFLAGS = -std=c99 -Wall -Werror -Wno-unused-parameter -Iinclude -I$(libpq_srcdir) -g SHLIB_LINK = $(libpq) REGRESS = create_extension monitor workers dummy_update drop_extension upgrade PG_CONFIG ?= pg_config PGXS = $(shell $(PG_CONFIG) --pgxs) USE_PGXS = 1 .PHONY: cleanup-before-install DEFAULT_CFLAGS = -std=c99 -D_GNU_SOURCE -g DEFAULT_CFLAGS += $(shell $(PG_CONFIG) --cflags) DEFAULT_CFLAGS += -Wformat DEFAULT_CFLAGS += -Wall DEFAULT_CFLAGS += -Werror=implicit-int DEFAULT_CFLAGS += -Werror=implicit-function-declaration DEFAULT_CFLAGS += -Werror=return-type DEFAULT_CFLAGS += -Wno-declaration-after-statement # Needed for OSX DEFAULT_CFLAGS += -Wno-missing-braces ifdef USE_SECURITY_FLAGS # Flags taken from: https://liquid.microsoft.com/Web/Object/Read/ms.security/Requirements/Microsoft.Security.SystemsADM.10203#guide SECURITY_CFLAGS=-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -z noexecstack -fpic -shared -Wl,-z,relro -Wl,-z,now -Wformat -Wformat-security -Werror=format-security DEFAULT_CFLAGS += $(SECURITY_CFLAGS) endif override CFLAGS := $(DEFAULT_CFLAGS) $(CFLAGS) include $(PGXS) ifdef USE_SECURITY_FLAGS # Flags taken from: https://liquid.microsoft.com/Web/Object/Read/ms.security/Requirements/Microsoft.Security.SystemsADM.10203#guide SECURITY_BITCODE_CFLAGS=-fsanitize=safe-stack -fstack-protector-strong -flto -fPIC -Wformat -Wformat-security -Werror=format-security override BITCODE_CFLAGS := $(BITCODE_CFLAGS) $(SECURITY_BITCODE_CFLAGS) endif cleanup-before-install: rm -f $(DESTDIR)$(datadir)/$(datamoduledir)/pgautofailover* install: cleanup-before-install $(EXTENSION)--$(EXTVERSION).sql: 
$(EXTENSION).sql cat $^ > $@ pg_auto_failover-1.6.3/src/monitor/README.md000066400000000000000000000005751414244367200206210ustar00rootroot00000000000000# pg_auto_failover: the Monitor This directory contains the pg_auto_failover monitor. The monitor handles the group state machines of one or more groups of servers. Each group of servers implements a single Highly Available PostgreSQL Service. Several groups can be organized as part of the same formation. The pg_auto_failover monitor is implemented as a PostgreSQL extension. pg_auto_failover-1.6.3/src/monitor/conninfo.c000066400000000000000000000051371414244367200213160ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/conninfo.c * * This file contains functions to get the primary connection info from * recovery.conf. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * *------------------------------------------------------------------------- */ #include "postgres.h" #include "c.h" #include "fmgr.h" #include "funcapi.h" #include "libpq-fe.h" #include "miscadmin.h" #include "storage/fd.h" #include "utils/guc.h" #include "conninfo.h" /* named constants */ #define RECOVERY_COMMAND_FILE "recovery.conf" /* private function declarations */ static char * ReadPrimaryConnInfoFromRecoveryConf(void); /* * ReadPrimaryHostAddress reads the hostname and port as defined in recovery.conf * into the parameters. 
*/ int ReadPrimaryHostAddress(char **primaryName, char **primaryPort) { char *errorMessage = NULL; PQconninfoOption *currentOption = NULL; char *connInfo = ReadPrimaryConnInfoFromRecoveryConf(); if (connInfo == NULL) { return -1; } PQconninfoOption *options = PQconninfoParse(connInfo, &errorMessage); if (options == NULL) { pfree(connInfo); return -1; } for (currentOption = options; currentOption->keyword != NULL; currentOption++) { char *keyword = currentOption->keyword; char *value = currentOption->val; if (value == NULL) { continue; } if (strcmp(keyword, "host") == 0 || strcmp(keyword, "hostaddr") == 0) { *primaryName = pstrdup(value); } else if (strcmp(keyword, "port") == 0) { *primaryPort = pstrdup(value); } } PQconninfoFree(options); pfree(connInfo); return 0; } /* * ReadPrimaryConnInfoFromRecoveryConf gets the unaltered primary_conninfo * field from the recovery.conf file. */ static char * ReadPrimaryConnInfoFromRecoveryConf(void) { ConfigVariable *item = NULL; ConfigVariable *head = NULL; ConfigVariable *tail = NULL; char *primaryConnInfo = NULL; FILE *fd = AllocateFile(RECOVERY_COMMAND_FILE, "r"); if (fd == NULL) { ereport(LOG, (errcode_for_file_access(), errmsg("could not open recovery command file \"%s\": %m", RECOVERY_COMMAND_FILE))); return NULL; } /* * Since we're asking ParseConfigFp() to report errors as FATAL, there's * no need to check the return value. 
*/ (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail); FreeFile(fd); for (item = head; item; item = item->next) { if (strcmp(item->name, "primary_conninfo") == 0) { primaryConnInfo = pstrdup(item->value); } } FreeConfigVariables(head); return primaryConnInfo; } pg_auto_failover-1.6.3/src/monitor/conninfo.h000066400000000000000000000007541414244367200213230ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/conninfo.h * * Declarations for public functions and types related to reading * connection info from recovery.conf. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * *------------------------------------------------------------------------- */ #pragma once extern int ReadPrimaryHostAddress(char **primaryName, char **primaryPort); pg_auto_failover-1.6.3/src/monitor/expected/000077500000000000000000000000001414244367200211345ustar00rootroot00000000000000pg_auto_failover-1.6.3/src/monitor/expected/create_extension.out000066400000000000000000000003041414244367200252210ustar00rootroot00000000000000-- Copyright (c) Microsoft Corporation. All rights reserved. -- Licensed under the PostgreSQL License. create extension pgautofailover cascade; NOTICE: installing required extension "btree_gist" pg_auto_failover-1.6.3/src/monitor/expected/drop_extension.out000066400000000000000000000002061414244367200247230ustar00rootroot00000000000000-- Copyright (c) Microsoft Corporation. All rights reserved. -- Licensed under the PostgreSQL License. drop extension pgautofailover; pg_auto_failover-1.6.3/src/monitor/expected/dummy_update.out000066400000000000000000000014501414244367200243620ustar00rootroot00000000000000-- Copyright (c) Microsoft Corporation. All rights reserved. -- Licensed under the PostgreSQL License. 
select version from pg_available_extension_versions where name = 'pgautofailover' and version = 'dummy'; version --------- dummy (1 row) alter extension pgautofailover update to dummy; select installed_version from pg_available_extensions where name = 'pgautofailover'; installed_version ------------------- dummy (1 row) -- should error because installed extension isn't compatible with .so select * from pgautofailover.get_primary('unknown formation'); ERROR: loaded "pgautofailover" library version differs from installed extension version DETAIL: Loaded library requires 1.6, but the installed extension version is dummy. HINT: Run ALTER EXTENSION pgautofailover UPDATE and try again. pg_auto_failover-1.6.3/src/monitor/expected/monitor.out000066400000000000000000000167351414244367200233700ustar00rootroot00000000000000-- Copyright (c) Microsoft Corporation. All rights reserved. -- Licensed under the PostgreSQL License. \x on select * from pgautofailover.register_node('default', 'localhost', 9876, 'postgres'); -[ RECORD 1 ]---------------+------- assigned_node_id | 1 assigned_group_id | 0 assigned_group_state | single assigned_candidate_priority | 100 assigned_replication_quorum | t assigned_node_name | node_1 select * from pgautofailover.set_node_system_identifier(1, 6852685710417058800); -[ RECORD 1 ]-------- node_id | 1 node_name | node_1 node_host | localhost node_port | 9876 -- node_1 reports single select * from pgautofailover.node_active('default', 1, 0, current_group_role => 'single'); -[ RECORD 1 ]---------------+------- assigned_node_id | 1 assigned_group_id | 0 assigned_group_state | single assigned_candidate_priority | 100 assigned_replication_quorum | t -- register node_2 select * from pgautofailover.register_node('default', 'localhost', 9877, 'postgres'); -[ RECORD 1 ]---------------+------------- assigned_node_id | 2 assigned_group_id | 0 assigned_group_state | wait_standby assigned_candidate_priority | 100 assigned_replication_quorum | t assigned_node_name 
| node_2 -- node_2 reports wait_standby already select * from pgautofailover.node_active('default', 2, 0, current_group_role => 'wait_standby'); -[ RECORD 1 ]---------------+------------- assigned_node_id | 2 assigned_group_id | 0 assigned_group_state | wait_standby assigned_candidate_priority | 100 assigned_replication_quorum | t -- node_1 reports single again, and gets assigned wait_primary select * from pgautofailover.node_active('default', 1, 0, current_group_role => 'single'); -[ RECORD 1 ]---------------+------------- assigned_node_id | 1 assigned_group_id | 0 assigned_group_state | wait_primary assigned_candidate_priority | 100 assigned_replication_quorum | t -- node_1 now reports wait_primary select * from pgautofailover.node_active('default', 1, 0, current_group_role => 'wait_primary'); -[ RECORD 1 ]---------------+------------- assigned_node_id | 1 assigned_group_id | 0 assigned_group_state | wait_primary assigned_candidate_priority | 100 assigned_replication_quorum | t -- node_2 now reports wait_standby, gets assigned catchingup select * from pgautofailover.node_active('default', 2, 0, current_group_role => 'wait_standby'); -[ RECORD 1 ]---------------+----------- assigned_node_id | 2 assigned_group_id | 0 assigned_group_state | catchingup assigned_candidate_priority | 100 assigned_replication_quorum | t -- register node_3 concurrently to node2 (probably) doing pg_basebackup select * from pgautofailover.register_node('default', 'localhost', 9879, 'postgres'); -[ RECORD 1 ]---------------+------------- assigned_node_id | 3 assigned_group_id | 0 assigned_group_state | wait_standby assigned_candidate_priority | 100 assigned_replication_quorum | t assigned_node_name | node_3 select formationid, nodename, goalstate, reportedstate from pgautofailover.node; -[ RECORD 1 ]-+------------- formationid | default nodename | node_1 goalstate | wait_primary reportedstate | wait_primary -[ RECORD 2 ]-+------------- formationid | default nodename | node_2 goalstate | 
catchingup reportedstate | wait_standby -[ RECORD 3 ]-+------------- formationid | default nodename | node_3 goalstate | wait_standby reportedstate | init table pgautofailover.formation; -[ RECORD 1 ]--------+--------- formationid | default kind | pgsql dbname | postgres opt_secondary | t number_sync_standbys | 1 -- dump the pgautofailover.node table, omitting the timely columns select formationid, nodeid, groupid, nodehost, nodeport, goalstate, reportedstate, reportedpgisrunning, reportedrepstate from pgautofailover.node order by nodeid; -[ RECORD 1 ]-------+------------- formationid | default nodeid | 1 groupid | 0 nodehost | localhost nodeport | 9876 goalstate | wait_primary reportedstate | wait_primary reportedpgisrunning | t reportedrepstate | unknown -[ RECORD 2 ]-------+------------- formationid | default nodeid | 2 groupid | 0 nodehost | localhost nodeport | 9877 goalstate | catchingup reportedstate | wait_standby reportedpgisrunning | t reportedrepstate | unknown -[ RECORD 3 ]-------+------------- formationid | default nodeid | 3 groupid | 0 nodehost | localhost nodeport | 9879 goalstate | wait_standby reportedstate | init reportedpgisrunning | t reportedrepstate | async select * from pgautofailover.get_primary('unknown formation'); ERROR: group has no writable node right now select * from pgautofailover.get_primary(group_id => -10); ERROR: group has no writable node right now select * from pgautofailover.get_primary(); -[ RECORD 1 ]---+---------- primary_node_id | 1 primary_name | node_1 primary_host | localhost primary_port | 9876 select * from pgautofailover.get_primary('default', 0); -[ RECORD 1 ]---+---------- primary_node_id | 1 primary_name | node_1 primary_host | localhost primary_port | 9876 select * from pgautofailover.get_other_nodes(1); -[ RECORD 1 ]---+---------- node_id | 2 node_name | node_2 node_host | localhost node_port | 9877 node_lsn | 0/0 node_is_primary | f -[ RECORD 2 ]---+---------- node_id | 3 node_name | node_3 node_host | 
localhost node_port | 9879 node_lsn | 0/0 node_is_primary | f -- remove the primary node select pgautofailover.remove_node(1); -[ RECORD 1 ]-- remove_node | t table pgautofailover.formation; -[ RECORD 1 ]--------+--------- formationid | default kind | pgsql dbname | postgres opt_secondary | t number_sync_standbys | 0 select pgautofailover.remove_node(1, force => 'true'); -[ RECORD 1 ]-- remove_node | t -- dump the pgautofailover.node table, omitting the timely columns select formationid, nodeid, groupid, nodehost, nodeport, goalstate, reportedstate, reportedpgisrunning, reportedrepstate from pgautofailover.node order by nodeid; -[ RECORD 1 ]-------+------------- formationid | default nodeid | 2 groupid | 0 nodehost | localhost nodeport | 9877 goalstate | report_lsn reportedstate | wait_standby reportedpgisrunning | t reportedrepstate | unknown -[ RECORD 2 ]-------+------------- formationid | default nodeid | 3 groupid | 0 nodehost | localhost nodeport | 9879 goalstate | report_lsn reportedstate | init reportedpgisrunning | t reportedrepstate | async select * from pgautofailover.set_node_system_identifier(2, 6852685710417058800); -[ RECORD 1 ]-------- node_id | 2 node_name | node_2 node_host | localhost node_port | 9877 -- should fail as there's no primary at this point select pgautofailover.perform_failover(); ERROR: couldn't find the primary node in formation "default", group 0 pg_auto_failover-1.6.3/src/monitor/expected/upgrade.out000066400000000000000000000023551414244367200233210ustar00rootroot00000000000000-- Copyright (c) Microsoft Corporation. All rights reserved. -- Licensed under the PostgreSQL License. 
CREATE EXTENSION pgautofailover VERSION '1.0'; \dx pgautofailover List of installed extensions Name | Version | Schema | Description ----------------+---------+--------+------------------ pgautofailover | 1.0 | public | pg_auto_failover (1 row) ALTER EXTENSION pgautofailover UPDATE TO '1.1'; \dx pgautofailover List of installed extensions Name | Version | Schema | Description ----------------+---------+--------+------------------ pgautofailover | 1.1 | public | pg_auto_failover (1 row) ALTER EXTENSION pgautofailover UPDATE TO '1.2'; \dx pgautofailover List of installed extensions Name | Version | Schema | Description ----------------+---------+--------+------------------ pgautofailover | 1.2 | public | pg_auto_failover (1 row) ALTER EXTENSION pgautofailover UPDATE TO '1.3'; \dx pgautofailover List of installed extensions Name | Version | Schema | Description ----------------+---------+--------+------------------ pgautofailover | 1.3 | public | pg_auto_failover (1 row) DROP EXTENSION pgautofailover; pg_auto_failover-1.6.3/src/monitor/expected/workers.out000066400000000000000000000040041414244367200233570ustar00rootroot00000000000000-- Copyright (c) Microsoft Corporation. All rights reserved. -- Licensed under the PostgreSQL License. 
-- This only tests that names are assigned properly \x on -- create a citus formation select * from pgautofailover.create_formation('citus', 'citus', 'citus', true, 0); -[ RECORD 1 ]--------+------ formation_id | citus kind | citus dbname | citus opt_secondary | t number_sync_standbys | 0 -- register the first coordinator select * from pgautofailover.register_node('citus', 'localhost', 9876, dbname => 'citus', desired_group_id => 0, node_kind => 'coordinator'); -[ RECORD 1 ]---------------+-------------- assigned_node_id | 4 assigned_group_id | 0 assigned_group_state | single assigned_candidate_priority | 100 assigned_replication_quorum | t assigned_node_name | coordinator_4 select * from pgautofailover.set_node_system_identifier(4, 6862008014275870855); -[ RECORD 1 ]------------ node_id | 4 node_name | coordinator_4 node_host | localhost node_port | 9876 -- coordinator_1 reports single select * from pgautofailover.node_active('citus', 4, 0, current_group_role => 'single'); -[ RECORD 1 ]---------------+------- assigned_node_id | 4 assigned_group_id | 0 assigned_group_state | single assigned_candidate_priority | 100 assigned_replication_quorum | t -- register first worker select * from pgautofailover.register_node('citus', 'localhost', 9878, dbname => 'citus', desired_group_id => 1, node_kind => 'worker'); -[ RECORD 1 ]---------------+--------- assigned_node_id | 5 assigned_group_id | 1 assigned_group_state | single assigned_candidate_priority | 100 assigned_replication_quorum | t assigned_node_name | worker_5 pg_auto_failover-1.6.3/src/monitor/formation_metadata.c000066400000000000000000000504641414244367200233460ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/formation_metadata.c * * Implementation of functions related to formation metadata. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* *------------------------------------------------------------------------- */ #include "postgres.h" #include "fmgr.h" #include "funcapi.h" #include "miscadmin.h" #include "health_check.h" #include "metadata.h" #include "formation_metadata.h" #include "node_metadata.h" #include "notifications.h" #include "access/htup_details.h" #include "access/xlogdefs.h" #include "catalog/pg_type.h" #include "executor/spi.h" #include "nodes/makefuncs.h" #include "nodes/parsenodes.h" #include "parser/parse_type.h" #include "storage/lockdefs.h" #include "utils/builtins.h" #include "utils/syscache.h" PG_FUNCTION_INFO_V1(create_formation); PG_FUNCTION_INFO_V1(drop_formation); PG_FUNCTION_INFO_V1(enable_secondary); PG_FUNCTION_INFO_V1(disable_secondary); PG_FUNCTION_INFO_V1(set_formation_number_sync_standbys); Datum AutoFailoverFormationGetDatum(FunctionCallInfo fcinfo, AutoFailoverFormation *formation); /* * GetFormation returns an AutoFailoverFormation structure with the formationId * and its kind, when the formation has already been created, or NULL * otherwise. 
*/ AutoFailoverFormation * GetFormation(const char *formationId) { AutoFailoverFormation *formation = NULL; MemoryContext callerContext = CurrentMemoryContext; Oid argTypes[] = { TEXTOID /* formationid */ }; Datum argValues[] = { CStringGetTextDatum(formationId), /* formationid */ }; const int argCount = sizeof(argValues) / sizeof(argValues[0]); const char *selectQuery = "SELECT * FROM " AUTO_FAILOVER_FORMATION_TABLE " WHERE formationId = $1"; SPI_connect(); int spiStatus = SPI_execute_with_args(selectQuery, argCount, argTypes, argValues, NULL, false, 1); if (spiStatus != SPI_OK_SELECT) { elog(ERROR, "could not select from " AUTO_FAILOVER_FORMATION_TABLE); } if (SPI_processed > 0) { MemoryContext spiContext = MemoryContextSwitchTo(callerContext); TupleDesc tupleDescriptor = SPI_tuptable->tupdesc; HeapTuple heapTuple = SPI_tuptable->vals[0]; bool isNull = false; Datum formationId = heap_getattr(heapTuple, Anum_pgautofailover_formation_formationid, tupleDescriptor, &isNull); Datum kind = heap_getattr(heapTuple, Anum_pgautofailover_formation_kind, tupleDescriptor, &isNull); Datum dbname = heap_getattr(heapTuple, Anum_pgautofailover_formation_dbname, tupleDescriptor, &isNull); Datum opt_secondary = heap_getattr(heapTuple, Anum_pgautofailover_formation_opt_secondary, tupleDescriptor, &isNull); Datum number_sync_standbys = heap_getattr(heapTuple, Anum_pgautofailover_formation_number_sync_standbys, tupleDescriptor, &isNull); formation = (AutoFailoverFormation *) palloc0(sizeof(AutoFailoverFormation)); formation->formationId = TextDatumGetCString(formationId); formation->kind = FormationKindFromString(TextDatumGetCString(kind)); strlcpy(formation->dbname, NameStr(*DatumGetName(dbname)), NAMEDATALEN); formation->opt_secondary = DatumGetBool(opt_secondary); formation->number_sync_standbys = DatumGetInt32(number_sync_standbys); MemoryContextSwitchTo(spiContext); } else { formation = NULL; } SPI_finish(); return formation; } /* * create_formation inserts a new tuple in 
pgautofailover.formation table, of * the given formation kind. We know only two formation kind at the moment, * 'pgsql' and 'citus'. Support is only implemented for 'pgsql'. */ Datum create_formation(PG_FUNCTION_ARGS) { checkPgAutoFailoverVersion(); text *formationIdText = PG_GETARG_TEXT_P(0); char *formationId = text_to_cstring(formationIdText); text *formationKindText = PG_GETARG_TEXT_P(1); char *formationKindCString = text_to_cstring(formationKindText); FormationKind formationKind = FormationKindFromString(formationKindCString); Name formationDBNameName = PG_GETARG_NAME(2); bool formationOptionSecondary = PG_GETARG_BOOL(3); int formationNumberSyncStandbys = PG_GETARG_INT32(4); AddFormation(formationId, formationKind, formationDBNameName, formationOptionSecondary, formationNumberSyncStandbys); AutoFailoverFormation *formation = GetFormation(formationId); Datum resultDatum = AutoFailoverFormationGetDatum(fcinfo, formation); PG_RETURN_DATUM(resultDatum); } /* * drop_formation removes a formation from the pgautofailover.formation table, * and may only succeed when no nodes belong to target formation. This is * checked by the foreign key reference installed in the pgautofailover schema. */ Datum drop_formation(PG_FUNCTION_ARGS) { checkPgAutoFailoverVersion(); text *formationIdText = PG_GETARG_TEXT_P(0); char *formationId = text_to_cstring(formationIdText); RemoveFormation(formationId); PG_RETURN_VOID(); } /* * enable_secondary enables secondaries to be added to a formation. This is * done by changing the hassecondary field on pgautofailover.formation to true. * Subsequent nodes added to the formation will be assigned secondary of an * already running node as long as there are nodes without a secondary. 
*/ Datum enable_secondary(PG_FUNCTION_ARGS) { checkPgAutoFailoverVersion(); text *formationIdText = PG_GETARG_TEXT_P(0); char *formationId = text_to_cstring(formationIdText); SetFormationOptSecondary(formationId, true); PG_RETURN_VOID(); } /* * disable_secondary disables secondaries on a formation, it will only succeed * when no nodes of the formation are currently in the secondary role. This is * enforced by a trigger on the formation table. */ Datum disable_secondary(PG_FUNCTION_ARGS) { checkPgAutoFailoverVersion(); text *formationIdText = PG_GETARG_TEXT_P(0); char *formationId = text_to_cstring(formationIdText); SetFormationOptSecondary(formationId, false); PG_RETURN_VOID(); } /* * AddFormation adds given formationId and kind to the pgautofailover.formation * table. * * It returns nothing: either the INSERT happened and we have the exact same * information as given in the table, or it failed and we raise an exception * here. */ void AddFormation(const char *formationId, FormationKind kind, Name dbname, bool optionSecondary, int numberSyncStandbys) { Oid argTypes[] = { TEXTOID, /* formationid */ TEXTOID, /* kind */ NAMEOID, /* dbname */ BOOLOID, /* opt_secondary */ INT4OID /* number_sync_standbys */ }; Datum argValues[] = { CStringGetTextDatum(formationId), /* formationid */ CStringGetTextDatum(FormationKindToString(kind)), /* kind */ NameGetDatum(dbname), /* dbname */ BoolGetDatum(optionSecondary), /* opt_secondary */ Int32GetDatum(numberSyncStandbys) /* number_sync_standbys */ }; const int argCount = sizeof(argValues) / sizeof(argValues[0]); const char *insertQuery = "INSERT INTO " AUTO_FAILOVER_FORMATION_TABLE " (formationid, kind, dbname, opt_secondary, number_sync_standbys)" " VALUES ($1, $2, $3, $4, $5)"; SPI_connect(); int spiStatus = SPI_execute_with_args(insertQuery, argCount, argTypes, argValues, NULL, false, 0); if (spiStatus != SPI_OK_INSERT) { elog(ERROR, "could not insert into " AUTO_FAILOVER_FORMATION_TABLE); } SPI_finish(); } /* * RemoveFormation 
deletes a formation, erroring out if there are still nodes * attached to it. We use the foreign key declaration to protect against that * case. */ void RemoveFormation(const char *formationId) { Oid argTypes[] = { TEXTOID, /* formationId */ }; Datum argValues[] = { CStringGetTextDatum(formationId) /* formationId */ }; const int argCount = sizeof(argValues) / sizeof(argValues[0]); const char *deleteQuery = "DELETE FROM " AUTO_FAILOVER_FORMATION_TABLE " WHERE formationid = $1"; SPI_connect(); int spiStatus = SPI_execute_with_args(deleteQuery, argCount, argTypes, argValues, NULL, false, 0); if (spiStatus != SPI_OK_DELETE) { elog(ERROR, "could not delete from " AUTO_FAILOVER_FORMATION_TABLE); } if (SPI_processed == 0) { elog(ERROR, "couldn't find formation \"%s\"", formationId); } else if (SPI_processed > 1) { /* that's a primary key index corruption or something nastly here. */ elog(ERROR, "formation name \"%s\" belongs to several formations", formationId); } SPI_finish(); } /* * SetFormationKind updates the formation kind to be the one given. */ void SetFormationKind(const char *formationId, FormationKind kind) { Oid argTypes[] = { TEXTOID, /* formationKind */ TEXTOID /* formationId */ }; Datum argValues[] = { CStringGetTextDatum(FormationKindToString(kind)), /* formationKind */ CStringGetTextDatum(formationId) /* formationId */ }; const int argCount = sizeof(argValues) / sizeof(argValues[0]); const char *updateQuery = "UPDATE " AUTO_FAILOVER_FORMATION_TABLE " SET kind = $1" " WHERE formationid = $2"; SPI_connect(); int spiStatus = SPI_execute_with_args(updateQuery, argCount, argTypes, argValues, NULL, false, 0); if (spiStatus != SPI_OK_UPDATE) { elog(ERROR, "could not update " AUTO_FAILOVER_FORMATION_TABLE); } SPI_finish(); } /* * SetFormationDBName updates the formation dbname to be the one given. 
*/ void SetFormationDBName(const char *formationId, const char *dbname) { Oid argTypes[] = { TEXTOID, /* dbname */ TEXTOID /* formationId */ }; Datum argValues[] = { CStringGetTextDatum(dbname), /* dbname */ CStringGetTextDatum(formationId) /* formationId */ }; const int argCount = sizeof(argValues) / sizeof(argValues[0]); const char *updateQuery = "UPDATE " AUTO_FAILOVER_FORMATION_TABLE " SET dbname = $1" " WHERE formationid = $2"; SPI_connect(); int spiStatus = SPI_execute_with_args(updateQuery, argCount, argTypes, argValues, NULL, false, 0); if (spiStatus != SPI_OK_UPDATE) { elog(ERROR, "could not update " AUTO_FAILOVER_FORMATION_TABLE); } SPI_finish(); } /* * SetFormationOptSecondary updates the formation to enable or disable * secondary nodes for a formation. When enabling the user is responsible for * adding new nodes to actually add secondaries to the formation. When * disabling the user should have shutdown the secondary nodes before, command * errors otherwise. */ void SetFormationOptSecondary(const char *formationId, bool optSecondary) { Oid argTypes[] = { BOOLOID, /* opt_secondary */ TEXTOID /* formationId */ }; Datum argValues[] = { BoolGetDatum(optSecondary), /* opt_secondary */ CStringGetTextDatum(formationId) /* formationId */ }; const int argCount = sizeof(argValues) / sizeof(argValues[0]); const char *updateQuery = "UPDATE " AUTO_FAILOVER_FORMATION_TABLE " SET opt_secondary = $1" " WHERE formationid = $2"; SPI_connect(); int spiStatus = SPI_execute_with_args(updateQuery, argCount, argTypes, argValues, NULL, false, 0); if (spiStatus != SPI_OK_UPDATE) { elog(ERROR, "could not update " AUTO_FAILOVER_FORMATION_TABLE); } SPI_finish(); } /* * FormationKindFromString returns an enum value for FormationKind when given a * text representation of the value. 
*/ FormationKind FormationKindFromString(const char *kind) { FormationKind kindArray[] = { FORMATION_KIND_UNKNOWN, FORMATION_KIND_UNKNOWN, FORMATION_KIND_PGSQL, FORMATION_KIND_CITUS }; char *kindList[] = { "", "unknown", "pgsql", "citus", NULL }; for (int listIndex = 0; kindList[listIndex] != NULL; listIndex++) { char *candidate = kindList[listIndex]; if (strcmp(kind, candidate) == 0) { return kindArray[listIndex]; } } ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unknown formation kind \"%s\"", kind))); /* never happens, make compiler happy */ return FORMATION_KIND_UNKNOWN; } /* * FormationKindToString returns the string representation of a FormationKind. */ char * FormationKindToString(FormationKind kind) { switch (kind) { case FORMATION_KIND_UNKNOWN: { return "unknown"; } case FORMATION_KIND_PGSQL: { return "pgsql"; } case FORMATION_KIND_CITUS: { return "citus"; } default: ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unknown formation kind value %d", kind))); } /* keep compiler happy */ return ""; } /* * FormationKindFromNodeKindString returns a FormationKind value when given the * kind of a NODE in the formation: either standalone, coordinator, or worker. */ FormationKind FormationKindFromNodeKindString(const char *nodeKind) { FormationKind kindArray[] = { FORMATION_KIND_UNKNOWN, FORMATION_KIND_UNKNOWN, FORMATION_KIND_PGSQL, FORMATION_KIND_CITUS, FORMATION_KIND_CITUS }; char *kindList[] = { "", "unknown", "standalone", "coordinator", "worker", NULL }; for (int listIndex = 0; kindList[listIndex] != NULL; listIndex++) { char *candidate = kindList[listIndex]; if (strcmp(nodeKind, candidate) == 0) { return kindArray[listIndex]; } } ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unknown formation kind \"%s\"", nodeKind))); /* never happens, make compiler happy */ return FORMATION_KIND_UNKNOWN; } /* * IsCitusFormation returns whether the formation is a citus formation. 
*/ bool IsCitusFormation(AutoFailoverFormation *formation) { return formation->kind == FORMATION_KIND_CITUS; } /* * set_formation_number_sync_standbys sets number_sync_standbys property of a * formation. The function returns true on success. */ Datum set_formation_number_sync_standbys(PG_FUNCTION_ARGS) { checkPgAutoFailoverVersion(); text *formationIdText = PG_GETARG_TEXT_P(0); char *formationId = text_to_cstring(formationIdText); int number_sync_standbys = PG_GETARG_INT32(1); AutoFailoverFormation *formation = GetFormation(formationId); /* at the moment, only test with the number of standbys in group 0 */ int groupId = 0; int standbyCount = 0; char message[BUFSIZE] = { 0 }; if (formation == NULL) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unknown formation \"%s\"", formationId))); } LockFormation(formationId, ExclusiveLock); if (number_sync_standbys < 0) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid value for number_sync_standbys: \"%d\"", number_sync_standbys), errdetail("A non-negative integer is expected"))); } AutoFailoverNode *primaryNode = GetPrimaryNodeInGroup(formation->formationId, groupId); if (primaryNode == NULL) { ereport(ERROR, (errmsg("Couldn't find the primary node in formation \"%s\", " "group %d", formation->formationId, groupId))); } /* * We require a stable group state to apply new formation settings. * * The classic stable state is of course both reported and goal state being * "primary". That said, when number_sync_standbys is zero (0) and the * standby nodes are unavailable, then another stable state is when both * reported and goal state are "wait_primary". 
*/ if (!IsCurrentState(primaryNode, REPLICATION_STATE_PRIMARY) && !IsCurrentState(primaryNode, REPLICATION_STATE_WAIT_PRIMARY)) { ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cannot set number_sync_standbys when current " "goal state for primary " NODE_FORMAT " is \"%s\", and current reported state is \"%s\"", NODE_FORMAT_ARGS(primaryNode), ReplicationStateGetName(primaryNode->goalState), ReplicationStateGetName(primaryNode->reportedState)), errdetail("The primary node so must be in state \"primary\" " "or \"wait_primary\" " "to be able to apply configuration changes to " "its synchronous_standby_names setting"))); } /* set the formation property to see if that is a valid choice */ formation->number_sync_standbys = number_sync_standbys; if (!FormationNumSyncStandbyIsValid(formation, primaryNode, groupId, &standbyCount)) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid value for number_sync_standbys: \"%d\"", number_sync_standbys), errdetail("At least %d standby nodes are required, " "and only %d are currently participating in " "the replication quorum", number_sync_standbys + 1, standbyCount))); } /* SetFormationNumberSyncStandbys reports ERROR when returning false */ bool success = SetFormationNumberSyncStandbys(formationId, number_sync_standbys); /* and now ask the primary to change its settings */ LogAndNotifyMessage( message, BUFSIZE, "Setting goal state of " NODE_FORMAT " to apply_settings " "after updating number_sync_standbys to %d for formation %s.", NODE_FORMAT_ARGS(primaryNode), formation->number_sync_standbys, formation->formationId); SetNodeGoalState(primaryNode, REPLICATION_STATE_APPLY_SETTINGS, message); PG_RETURN_BOOL(success); } /* * FormationNumSyncStandbyIsValid returns true if the current setting for * number_sync_standbys on the given formation makes sense with the registered * standbys. 
*/ bool FormationNumSyncStandbyIsValid(AutoFailoverFormation *formation, AutoFailoverNode *primaryNode, int groupId, int *standbyCount) { ListCell *nodeCell = NULL; int count = 0; if (formation == NULL) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("the given formation must not be NULL"))); } List *standbyNodesGroupList = AutoFailoverOtherNodesList(primaryNode); foreach(nodeCell, standbyNodesGroupList) { AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell); if (node->replicationQuorum) { ++count; } } *standbyCount = count; /* * number_sync_standbys = 0 is a special case in our FSM, because we have * special handling of a missing standby then, switching to wait_primary to * disable synchronous replication when the standby is not available. * * For other values (N) of number_sync_standbys, we require N+1 known * standby nodes, so that you can lose a standby at any point in time and * still accept writes. That's the service availability trade-off and cost. */ if (formation->number_sync_standbys == 0) { return true; } return (formation->number_sync_standbys + 1) <= count; } /* * SetFormationNumberSyncStandbys sets numberSyncStandbys property * of a formation entry. Returns true if successfull. 
*/ bool SetFormationNumberSyncStandbys(const char *formationId, int numberSyncStandbys) { Oid argTypes[] = { INT4OID, /* numberSyncStandbys */ TEXTOID /* formationId */ }; Datum argValues[] = { Int32GetDatum(numberSyncStandbys), /* numberSyncStandbys */ CStringGetTextDatum(formationId) /* formationId */ }; const int argCount = sizeof(argValues) / sizeof(argValues[0]); const char *updateQuery = "UPDATE " AUTO_FAILOVER_FORMATION_TABLE " SET number_sync_standbys = $1" " WHERE formationid = $2"; SPI_connect(); int spiStatus = SPI_execute_with_args(updateQuery, argCount, argTypes, argValues, NULL, false, 0); SPI_finish(); if (spiStatus != SPI_OK_UPDATE) { elog(ERROR, "could not update " AUTO_FAILOVER_FORMATION_TABLE); return false; } return true; } /* * AutoFailoverFormationGetDatum prepares a Datum from given formation. * Caller is expected to provide fcinfo structure that contains compatible * call result type. */ Datum AutoFailoverFormationGetDatum(FunctionCallInfo fcinfo, AutoFailoverFormation *formation) { TupleDesc resultDescriptor = NULL; Datum values[5]; bool isNulls[5]; if (formation == NULL) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("the given formation must not be NULL"))); } memset(values, 0, sizeof(values)); memset(isNulls, false, sizeof(isNulls)); values[0] = CStringGetTextDatum(formation->formationId); values[1] = CStringGetTextDatum(FormationKindToString(formation->kind)); values[2] = CStringGetDatum(formation->dbname); values[3] = BoolGetDatum(formation->opt_secondary); values[4] = Int32GetDatum(formation->number_sync_standbys); TypeFuncClass resultTypeClass = get_call_result_type(fcinfo, NULL, &resultDescriptor); if (resultTypeClass != TYPEFUNC_COMPOSITE) { ereport(ERROR, (errmsg("return type must be a row type"))); } HeapTuple resultTuple = heap_form_tuple(resultDescriptor, values, isNulls); Datum resultDatum = HeapTupleGetDatum(resultTuple); PG_RETURN_DATUM(resultDatum); } 
pg_auto_failover-1.6.3/src/monitor/formation_metadata.h000066400000000000000000000044251414244367200233470ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * stc/monitor/formation_metadata.h * * Declarations for public functions and types related to formation metadata. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * *------------------------------------------------------------------------- */ #pragma once #include "access/xlogdefs.h" #include "datatype/timestamp.h" #include "health_check.h" #include "node_metadata.h" #include "replication_state.h" #define AUTO_FAILOVER_FORMATION_TABLE_NAME "formation" /* column indexes for pgautofailover.node */ #define Natts_pgautofailover_formation 4 #define Anum_pgautofailover_formation_formationid 1 #define Anum_pgautofailover_formation_kind 2 #define Anum_pgautofailover_formation_dbname 3 #define Anum_pgautofailover_formation_opt_secondary 4 #define Anum_pgautofailover_formation_number_sync_standbys 5 /* * AutoFailoverFormation represents a formation that is being managed by the * pg_auto_failover monitor. 
*/ typedef struct AutoFailoverFormation { char *formationId; FormationKind kind; char dbname[NAMEDATALEN]; bool opt_secondary; int number_sync_standbys; } AutoFailoverFormation; /* public function declarations */ extern AutoFailoverFormation * GetFormation(const char *formationId); extern void AddFormation(const char *formationId, FormationKind kind, Name dbname, bool optionSecondary, int numberSyncStandbys); extern void RemoveFormation(const char *formationId); extern void SetFormationKind(const char *formationId, FormationKind kind); extern void SetFormationDBName(const char *formationId, const char *dbname); extern void SetFormationOptSecondary(const char *formationId, bool optSecondary); extern bool IsCitusFormation(AutoFailoverFormation *formation); extern bool FormationNumSyncStandbyIsValid(AutoFailoverFormation *formation, AutoFailoverNode *primaryNode, int groupId, int *standbyCount); extern bool SetFormationNumberSyncStandbys(const char *formationId, int numberSyncStandbys); extern FormationKind FormationKindFromString(const char *kind); extern char * FormationKindToString(FormationKind kind); extern FormationKind FormationKindFromNodeKindString(const char *nodeKind); pg_auto_failover-1.6.3/src/monitor/group_state_machine.c000066400000000000000000001746011414244367200235300ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/group_state_machine.c * * Implementation of the state machine for fail-over within a group of * PostgreSQL nodes. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* *------------------------------------------------------------------------- */ #include "postgres.h" #include "fmgr.h" #include "funcapi.h" #include "miscadmin.h" #include "formation_metadata.h" #include "group_state_machine.h" #include "node_metadata.h" #include "notifications.h" #include "replication_state.h" #include "version_compat.h" #include "access/htup_details.h" #include "catalog/pg_enum.h" #include "commands/trigger.h" #include "nodes/makefuncs.h" #include "nodes/parsenodes.h" #include "parser/parse_type.h" #include "utils/builtins.h" #include "utils/rel.h" #include "utils/syscache.h" #include "utils/timestamp.h" /* * To communicate with the BuildCandidateList function, it's easier to handle a * structure with those bits of information to share: */ typedef struct CandidateList { int numberSyncStandbys; List *candidateNodesGroupList; List *mostAdvancedNodesGroupList; XLogRecPtr mostAdvancedReportedLSN; int candidateCount; int quorumCandidateCount; int missingNodesCount; } CandidateList; /* private function forward declarations */ static bool ProceedGroupStateForPrimaryNode(AutoFailoverNode *primaryNode); static bool ProceedGroupStateForMSFailover(AutoFailoverNode *activeNode, AutoFailoverNode *primaryNode); static bool ProceedWithMSFailover(AutoFailoverNode *activeNode, AutoFailoverNode *candidateNode); static bool BuildCandidateList(List *standbyNodesGroupList, CandidateList *candidateList); static AutoFailoverNode * SelectFailoverCandidateNode(CandidateList *candidateList, AutoFailoverNode *primaryNode); static bool PromoteSelectedNode(AutoFailoverNode *selectedNode, AutoFailoverNode *primaryNode, CandidateList *candidateList); static void AssignGoalState(AutoFailoverNode *pgAutoFailoverNode, ReplicationState state, char *description); static bool WalDifferenceWithin(AutoFailoverNode *secondaryNode, AutoFailoverNode *primaryNode, int64 delta); /* GUC variables */ int EnableSyncXlogThreshold = DEFAULT_XLOG_SEG_SIZE; int PromoteXlogThreshold = 
	DEFAULT_XLOG_SEG_SIZE;


/*
 * ProceedGroupState proceeds the state machines of the group of which
 * the given node is part.
 *
 * Returns true when this call decided on (and assigned) a next goal state
 * for one or more nodes of the group, false when no transition applied.
 * Primary-side decisions are delegated to ProceedGroupStateForPrimaryNode,
 * and multiple-standby failover orchestration to
 * ProceedGroupStateForMSFailover.
 */
bool
ProceedGroupState(AutoFailoverNode *activeNode)
{
	char *formationId = activeNode->formationId;
	int groupId = activeNode->groupId;

	AutoFailoverFormation *formation = GetFormation(formationId);

	List *nodesGroupList = AutoFailoverNodeGroup(formationId, groupId);
	int nodesCount = list_length(nodesGroupList);

	if (formation == NULL)
	{
		ereport(ERROR,
				(errmsg("Formation for %s could not be found",
						activeNode->formationId)));
	}

	/*
	 * If the active node just reached the DROPPED state, proceed to remove it
	 * from the pgautofailover.node table.
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_DROPPED))
	{
		char message[BUFSIZE] = { 0 };

		/* time to actually remove the current node */
		RemoveAutoFailoverNode(activeNode);

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Removing " NODE_FORMAT
			" from formation \"%s\" and group %d",
			NODE_FORMAT_ARGS(activeNode),
			activeNode->formationId,
			activeNode->groupId);

		return true;
	}

	/* node reports secondary/dropped */
	if (activeNode->goalState == REPLICATION_STATE_DROPPED)
	{
		return true;
	}

	/*
	 * A node in "maintenance" state can only get out of maintenance through an
	 * explicit call to stop_maintenance(), the FSM will not assign a new state
	 * to a node that is currently in maintenance.
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_MAINTENANCE))
	{
		return true;
	}

	/*
	 * A node that is alone in its group should be SINGLE.
	 *
	 * Exception arises when it used to be other nodes in the group, and the
	 * only node left has Candidate Priority of zero. In that case the setup is
	 * clear, it can't allow writes, so it can't be SINGLE. In that case, it
	 * should be REPORT_LSN, waiting for either a change of settings, or the
	 * introduction of a new node.
	 */
	if (nodesCount == 1 &&
		!IsCurrentState(activeNode, REPLICATION_STATE_SINGLE) &&
		activeNode->candidatePriority > 0)
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to single as there is no other node.",
			NODE_FORMAT_ARGS(activeNode));

		/* other node may have been removed */
		AssignGoalState(activeNode, REPLICATION_STATE_SINGLE, message);

		return true;
	}
	else if (nodesCount == 1 &&
			 !IsCurrentState(activeNode, REPLICATION_STATE_SINGLE) &&
			 activeNode->candidatePriority == 0)
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to report_lsn as there is no other node"
			" and candidate priority is %d.",
			NODE_FORMAT_ARGS(activeNode),
			activeNode->candidatePriority);

		/* other node may have been removed */
		AssignGoalState(activeNode, REPLICATION_STATE_REPORT_LSN, message);

		return true;
	}

	/*
	 * We separate out the FSM for the primary server, because that one needs
	 * to loop over every other node to take decisions. That induces some
	 * complexity that is best managed in a specialized function.
	 */
	if (IsInPrimaryState(activeNode))
	{
		return ProceedGroupStateForPrimaryNode(activeNode);
	}

	AutoFailoverNode *primaryNode =
		GetPrimaryOrDemotedNodeInGroup(formationId, groupId);

	/*
	 * We want to have a primaryNode around for most operations, but also need
	 * to support the case that the primaryNode has been dropped manually by a
	 * call to remove_node(). So we have two main cases to think about here:
	 *
	 * - we have two nodes, one of them has been removed, we catch that earlier
	 *   in this function and assign the remaining one with the SINGLE state,
	 *
	 * - we have more than two nodes in total, and the primary has just been
	 *   removed (maybe it was still marked unhealthy and the operator knows it
	 *   won't ever come back so called remove_node() already): in that case in
	 *   remove_node() we set all the other nodes to REPORT_LSN (unless they
	 *   are in MAINTENANCE), and we should be able to make progress with the
	 *   failover without a primary around.
	 *
	 * In all other cases we require a primaryNode to be identified.
	 */
	if (primaryNode == NULL && !IsFailoverInProgress(nodesGroupList))
	{
		ereport(ERROR,
				(errmsg("ProceedGroupState couldn't find the primary node "
						"in formation \"%s\", group %d",
						formationId, groupId),
				 errdetail("activeNode is " NODE_FORMAT
						   " in state %s",
						   NODE_FORMAT_ARGS(activeNode),
						   ReplicationStateGetName(activeNode->goalState))));
	}

	/* Multiple Standby failover is handled in its own function. */
	if (nodesCount > 2 && IsUnhealthy(primaryNode))
	{
		/*
		 * The WAIT_PRIMARY state encodes the fact that we know there is no
		 * failover candidate, so there's no point in orchestrating a failover,
		 * even though the primary node is currently not available.
		 *
		 * To be in the WAIT_PRIMARY means that the other nodes are all either
		 * unhealty or with candidate priority set to zero.
		 *
		 * Otherwise stop replication from the primary and proceed with
		 * candidate election for primary replacement, whenever we have at
		 * least one candidates for failover.
		 */
		List *candidateNodesList =
			AutoFailoverOtherNodesListInState(primaryNode,
											  REPLICATION_STATE_SECONDARY);

		int candidatesCount = CountHealthyCandidates(candidateNodesList);

		if (IsInPrimaryState(primaryNode) &&
			!IsCurrentState(primaryNode, REPLICATION_STATE_WAIT_PRIMARY) &&
			candidatesCount >= 1)
		{
			char message[BUFSIZE] = { 0 };

			LogAndNotifyMessage(
				message, BUFSIZE,
				"Setting goal state of " NODE_FORMAT
				" to draining after it became unhealthy.",
				NODE_FORMAT_ARGS(primaryNode));

			AssignGoalState(primaryNode, REPLICATION_STATE_DRAINING, message);
		}

		/*
		 * In a multiple standby system we can assign maintenance as soon as
		 * prepare_maintenance has been reached, at the same time than an
		 * election is triggered. This also allows the operator to disable
		 * maintenance on the old-primary and have it join the election.
		 */
		else if (IsCurrentState(primaryNode,
								REPLICATION_STATE_PREPARE_MAINTENANCE))
		{
			char message[BUFSIZE] = { 0 };

			LogAndNotifyMessage(
				message, BUFSIZE,
				"Setting goal state of " NODE_FORMAT
				" to maintenance after it converged to prepare_maintenance.",
				NODE_FORMAT_ARGS(primaryNode));

			AssignGoalState(primaryNode,
							REPLICATION_STATE_MAINTENANCE, message);
		}

		/*
		 * ProceedGroupStateForMSFailover chooses the failover candidate when
		 * there's more than one standby node around, by applying the
		 * candidatePriority and comparing the reportedLSN. The function also
		 * orchestrate fetching the missing WAL from the failover candidate if
		 * that's needed.
		 *
		 * When ProceedGroupStateForMSFailover returns true, it means it was
		 * successfull in driving the failover to the next step, and we should
		 * stop here. When it return false, it did nothing, and so we want to
		 * apply the common orchestration code for a failover.
		 */
		if (ProceedGroupStateForMSFailover(activeNode, primaryNode))
		{
			return true;
		}
	}

	/*
	 * when report_lsn and the promotion has been done already:
	 *  report_lsn -> secondary
	 *
	 * Let the main primary loop account for allSecondariesAreHealthy and only
	 * then decide to assign PRIMARY to the primaryNode.
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_REPORT_LSN) &&
		(IsCurrentState(primaryNode, REPLICATION_STATE_WAIT_PRIMARY) ||
		 IsCurrentState(primaryNode, REPLICATION_STATE_JOIN_PRIMARY)) &&
		IsHealthy(primaryNode))
	{
		char message[BUFSIZE] = { 0 };

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to secondary after " NODE_FORMAT
			" converged to %s and has been marked healthy.",
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(primaryNode),
			ReplicationStateGetName(primaryNode->reportedState));

		AssignGoalState(activeNode, REPLICATION_STATE_SECONDARY, message);

		return true;
	}

	/*
	 * when report_lsn and the promotion has been done already:
	 *  report_lsn -> secondary
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_REPORT_LSN) &&
		IsCurrentState(primaryNode, REPLICATION_STATE_PRIMARY) &&
		IsHealthy(primaryNode))
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to secondary after " NODE_FORMAT
			" got selected as the failover candidate.",
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(primaryNode));

		AssignGoalState(activeNode, REPLICATION_STATE_SECONDARY, message);

		return true;
	}

	/*
	 * When the candidate is done fast forwarding the locally missing WAL bits,
	 * it can be promoted.
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_FAST_FORWARD))
	{
		char message[BUFSIZE] = { 0 };

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to prepare_promotion",
			NODE_FORMAT_ARGS(activeNode));

		AssignGoalState(activeNode,
						REPLICATION_STATE_PREPARE_PROMOTION, message);

		return true;
	}

	/*
	 * There are other cases when we want to continue an already started
	 * failover.
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_REPORT_LSN) ||
		IsCurrentState(activeNode, REPLICATION_STATE_FAST_FORWARD))
	{
		return ProceedGroupStateForMSFailover(activeNode, primaryNode);
	}

	/*
	 * when primary node is ready for replication:
	 *  wait_standby -> catchingup
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_WAIT_STANDBY) &&
		(IsCurrentState(primaryNode, REPLICATION_STATE_WAIT_PRIMARY) ||
		 IsCurrentState(primaryNode, REPLICATION_STATE_JOIN_PRIMARY)))
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to catchingup after " NODE_FORMAT
			" converged to %s.",
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(primaryNode),
			ReplicationStateGetName(primaryNode->reportedState));

		/* start replication */
		AssignGoalState(activeNode, REPLICATION_STATE_CATCHINGUP, message);

		return true;
	}

	/*
	 * when primary node is ready for replication:
	 *  wait_standby -> catchingup
	 *  primary -> apply_settings
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_WAIT_STANDBY) &&
		IsCurrentState(primaryNode, REPLICATION_STATE_PRIMARY) &&
		activeNode->replicationQuorum)
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to catchingup and " NODE_FORMAT
			" to %s to edit synchronous_standby_names.",
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(primaryNode),
			ReplicationStateGetName(primaryNode->reportedState));

		/* start replication */
		AssignGoalState(activeNode, REPLICATION_STATE_CATCHINGUP, message);

		/* edit synchronous_standby_names to add the new standby now */
		AssignGoalState(primaryNode,
						REPLICATION_STATE_APPLY_SETTINGS, message);

		return true;
	}

	/*
	 * when primary node is ready for replication:
	 *  wait_standby -> catchingup
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_WAIT_STANDBY) &&
		IsCurrentState(primaryNode, REPLICATION_STATE_PRIMARY) &&
		!activeNode->replicationQuorum)
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT " to catchingup.",
			NODE_FORMAT_ARGS(activeNode));

		/* start replication */
		AssignGoalState(activeNode, REPLICATION_STATE_CATCHINGUP, message);

		return true;
	}

	/*
	 * when secondary caught up:
	 *       catchingup -> secondary
	 *  + wait_primary -> primary
	 *
	 * When we have multiple standby nodes and one of them is joining, or
	 * re-joining after maintenance, we have to edit the replication setting
	 * synchronous_standby_names on the primary. The transition from another
	 * state to PRIMARY includes that edit. If the primary already is in the
	 * primary state, we assign APPLY_SETTINGS to it to make sure its
	 * repication settings are updated now.
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_CATCHINGUP) &&
		(IsCurrentState(primaryNode, REPLICATION_STATE_WAIT_PRIMARY) ||
		 IsCurrentState(primaryNode, REPLICATION_STATE_JOIN_PRIMARY) ||
		 IsCurrentState(primaryNode, REPLICATION_STATE_PRIMARY)) &&
		IsHealthy(activeNode) &&
		activeNode->reportedTLI == primaryNode->reportedTLI &&
		WalDifferenceWithin(activeNode, primaryNode, EnableSyncXlogThreshold))
	{
		char message[BUFSIZE] = { 0 };

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to secondary after it caught up.",
			NODE_FORMAT_ARGS(activeNode));

		/* node is ready for promotion */
		AssignGoalState(activeNode, REPLICATION_STATE_SECONDARY, message);

		return true;
	}

	/*
	 * when primary fails:
	 *  secondary -> prepare_promotion
	 *  + primary -> draining
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_SECONDARY) &&
		IsInPrimaryState(primaryNode) &&
		IsUnhealthy(primaryNode) && IsHealthy(activeNode) &&
		activeNode->candidatePriority > 0 &&
		WalDifferenceWithin(activeNode, primaryNode, PromoteXlogThreshold))
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to draining and " NODE_FORMAT " to prepare_promotion "
			"after " NODE_FORMAT " became unhealthy.",
			NODE_FORMAT_ARGS(primaryNode),
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(primaryNode));

		/* keep reading until no more records are available */
		AssignGoalState(activeNode,
						REPLICATION_STATE_PREPARE_PROMOTION, message);

		/* shut down the primary */
		AssignGoalState(primaryNode, REPLICATION_STATE_DRAINING, message);

		return true;
	}

	/*
	 * when secondary is put to maintenance and there's no standby left
	 *  wait_maintenance -> maintenance
	 *  wait_primary
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_WAIT_MAINTENANCE) &&
		IsCurrentState(primaryNode, REPLICATION_STATE_WAIT_PRIMARY))
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to maintenance after " NODE_FORMAT
			" converged to wait_primary.",
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(primaryNode));

		/* secondary reached maintenance */
		AssignGoalState(activeNode, REPLICATION_STATE_MAINTENANCE, message);

		return true;
	}

	/*
	 * when secondary is in wait_maintenance state and goal state of primary is
	 * not wait_primary anymore, e.g. another node joined and made it primary
	 * again or it got demoted. Then we don't need to wait anymore and we can
	 * transition directly to maintenance.
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_WAIT_MAINTENANCE) &&
		primaryNode->goalState != REPLICATION_STATE_WAIT_PRIMARY)
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to maintenance after " NODE_FORMAT
			" got assigned %s as goal state.",
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(primaryNode),
			ReplicationStateGetName(primaryNode->goalState));

		/* secondary reached maintenance */
		AssignGoalState(activeNode, REPLICATION_STATE_MAINTENANCE, message);

		return true;
	}

	/*
	 * when primary is put to maintenance
	 *  prepare_promotion -> stop_replication
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_PREPARE_PROMOTION) &&
		IsCurrentState(primaryNode, REPLICATION_STATE_PREPARE_MAINTENANCE))
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to stop_replication after " NODE_FORMAT
			" converged to prepare_maintenance.",
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(primaryNode));

		/* promote the secondary */
		AssignGoalState(activeNode,
						REPLICATION_STATE_STOP_REPLICATION, message);

		return true;
	}

	/*
	 * when a worker blocked writes:
	 *  prepare_promotion -> wait_primary
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_PREPARE_PROMOTION) &&
		primaryNode &&
		IsCitusFormation(formation) && activeNode->groupId > 0)
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to wait_primary and " NODE_FORMAT
			" to demoted after the coordinator metadata was updated.",
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(primaryNode));

		/* node is now taking writes */
		AssignGoalState(activeNode, REPLICATION_STATE_WAIT_PRIMARY, message);

		/* done draining, node is presumed dead */
		AssignGoalState(primaryNode, REPLICATION_STATE_DEMOTED, message);

		return true;
	}

	/*
	 * when a worker blocked writes and the primary has been removed:
	 *  prepare_promotion -> wait_primary
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_PREPARE_PROMOTION) &&
		primaryNode == NULL &&
		IsCitusFormation(formation) && activeNode->groupId > 0)
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to wait_primary after the coordinator metadata was updated.",
			NODE_FORMAT_ARGS(activeNode));

		/* node is now taking writes */
		AssignGoalState(activeNode, REPLICATION_STATE_WAIT_PRIMARY, message);

		return true;
	}

	/*
	 * when node is seeing no more writes:
	 *  prepare_promotion -> stop_replication
	 *
	 * refrain from prepare_maintenance -> demote_timeout on the primary, which
	 * might happen here when secondary has reached prepare_promotion before
	 * primary has reached prepare_maintenance.
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_PREPARE_PROMOTION) &&
		primaryNode &&
		!IsInMaintenance(primaryNode))
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to demote_timeout and " NODE_FORMAT
			" to stop_replication after " NODE_FORMAT
			" converged to prepare_promotion.",
			NODE_FORMAT_ARGS(primaryNode),
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(activeNode));

		/* perform promotion to stop replication */
		AssignGoalState(activeNode,
						REPLICATION_STATE_STOP_REPLICATION, message);

		/* wait for possibly-alive primary to kill itself */
		AssignGoalState(primaryNode,
						REPLICATION_STATE_DEMOTE_TIMEOUT, message);

		return true;
	}

	/*
	 * when primary node has been removed and we are promoting one standby
	 *  prepare_promotion -> stop_replication
	 *
	 * NOTE(review): the transition actually assigned below is wait_primary,
	 * not stop_replication; the comment above looks stale — confirm intent.
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_PREPARE_PROMOTION) &&
		primaryNode == NULL)
	{
		char message[BUFSIZE] = { 0 };

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to wait_primary after " NODE_FORMAT
			" converged to prepare_promotion.",
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(activeNode));

		/* perform promotion to stop replication */
		AssignGoalState(activeNode, REPLICATION_STATE_WAIT_PRIMARY, message);

		return true;
	}

	/*
	 * when primary node is going to maintenance
	 *  stop_replication -> wait_primary
	 *  prepare_maintenance -> maintenance
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_STOP_REPLICATION) &&
		IsCurrentState(primaryNode, REPLICATION_STATE_PREPARE_MAINTENANCE))
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to wait_primary and " NODE_FORMAT " to maintenance.",
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(primaryNode));

		/* node is now taking writes */
		AssignGoalState(activeNode, REPLICATION_STATE_WAIT_PRIMARY, message);

		/* old primary node is now ready for maintenance operations */
		AssignGoalState(primaryNode, REPLICATION_STATE_MAINTENANCE, message);

		return true;
	}

	/*
	 * when drain time expires or primary reports it's drained:
	 *  draining -> demoted
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_STOP_REPLICATION) &&
		(IsCurrentState(primaryNode, REPLICATION_STATE_DEMOTE_TIMEOUT) ||
		 IsDrainTimeExpired(primaryNode)))
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to wait_primary and " NODE_FORMAT
			" to demoted after the demote timeout expired.",
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(primaryNode));

		/* node is now taking writes */
		AssignGoalState(activeNode, REPLICATION_STATE_WAIT_PRIMARY, message);

		/* done draining, node is presumed dead */
		AssignGoalState(primaryNode, REPLICATION_STATE_DEMOTED, message);

		return true;
	}

	/*
	 * when a worker blocked writes:
	 *  stop_replication -> wait_primary
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_STOP_REPLICATION) &&
		primaryNode &&
		IsCitusFormation(formation) && activeNode->groupId > 0)
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to wait_primary and " NODE_FORMAT
			" to demoted after the coordinator metadata was updated.",
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(primaryNode));

		/* node is now taking writes */
		AssignGoalState(activeNode, REPLICATION_STATE_WAIT_PRIMARY, message);

		/* done draining, node is presumed dead */
		AssignGoalState(primaryNode, REPLICATION_STATE_DEMOTED, message);

		return true;
	}

	/*
	 * when a worker blocked writes, and the primary has been dropped:
	 *  stop_replication -> wait_primary
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_STOP_REPLICATION) &&
		primaryNode == NULL &&
		IsCitusFormation(formation) && activeNode->groupId > 0)
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to wait_primary after the coordinator metadata was updated.",
			NODE_FORMAT_ARGS(activeNode));

		/* node is now taking writes */
		AssignGoalState(activeNode, REPLICATION_STATE_WAIT_PRIMARY, message);

		return true;
	}

	/*
	 * when a new primary is ready:
	 *  demoted -> catchingup
	 *
	 * We accept to move from demoted to catching up as soon as the primary
	 * node is has reported either wait_primary or join_primary, and even when
	 * it's already transitioning to primary, thanks to another standby
	 * concurrently making progress.
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_DEMOTED) &&
		IsHealthy(primaryNode) &&
		((primaryNode->reportedState == REPLICATION_STATE_WAIT_PRIMARY ||
		  primaryNode->reportedState == REPLICATION_STATE_JOIN_PRIMARY) &&
		 primaryNode->goalState == REPLICATION_STATE_PRIMARY))
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to catchingup after it converged to demotion and " NODE_FORMAT
			" converged to primary.",
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(primaryNode));

		/* it's safe to rejoin as a secondary */
		AssignGoalState(activeNode, REPLICATION_STATE_CATCHINGUP, message);

		return true;
	}

	/*
	 * when a new primary is ready:
	 *  demoted -> catchingup
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_DEMOTED) &&
		IsHealthy(primaryNode) &&
		(IsCurrentState(primaryNode, REPLICATION_STATE_JOIN_PRIMARY) ||
		 IsCurrentState(primaryNode, REPLICATION_STATE_WAIT_PRIMARY) ||
		 IsCurrentState(primaryNode, REPLICATION_STATE_PRIMARY)))
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to catchingup after it converged to demotion and " NODE_FORMAT
			" converged to %s.",
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(primaryNode),
			ReplicationStateGetName(primaryNode->reportedState));

		/* it's safe to rejoin as a secondary */
		AssignGoalState(activeNode, REPLICATION_STATE_CATCHINGUP, message);

		return true;
	}

	/*
	 * when a new primary is ready:
	 *  join_secondary -> secondary
	 *
	 * As there's no action to implement on the new selected primary for that
	 * step, we can make progress as soon as we want to.
	 *
	 * The primary could be in one of those states:
	 *  - wait_primary/wait_primary
	 *  - wait_primary/primary
	 *
	 * This transition also happens when a former primary node has been
	 * demoted, and a multiple standbys has taken effect, we have a new primary
	 * being promoted, and several standby nodes following the new primary.
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_JOIN_SECONDARY) &&
		primaryNode->reportedState == REPLICATION_STATE_WAIT_PRIMARY &&
		(primaryNode->goalState == REPLICATION_STATE_WAIT_PRIMARY ||
		 primaryNode->goalState == REPLICATION_STATE_PRIMARY))
	{
		char message[BUFSIZE] = { 0 };

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to secondary after " NODE_FORMAT
			" converged to wait_primary.",
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(primaryNode));

		/* it's safe to rejoin as a secondary */
		AssignGoalState(activeNode, REPLICATION_STATE_SECONDARY, message);

		/* compute next step for the primary depending on node settings */
		return ProceedGroupStateForPrimaryNode(primaryNode);
	}

	/*
	 * when a new secondary re-appears after a failover or at a "random" time
	 * in the FSM cycle, and the wait_primary or join_primary node has already
	 * made progress to primary.
	 *
	 *  join_secondary -> secondary
	 */
	if (IsCurrentState(activeNode, REPLICATION_STATE_JOIN_SECONDARY) &&
		IsCurrentState(primaryNode, REPLICATION_STATE_PRIMARY))
	{
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT
			" to secondary after " NODE_FORMAT
			" converged to primary.",
			NODE_FORMAT_ARGS(activeNode),
			NODE_FORMAT_ARGS(primaryNode));

		/* it's safe to rejoin as a secondary */
		AssignGoalState(activeNode, REPLICATION_STATE_SECONDARY, message);

		return true;
	}

	return false;
}


/*
 * Group State Machine when a primary node contacts the monitor.
 */
static bool
ProceedGroupStateForPrimaryNode(AutoFailoverNode *primaryNode)
{
	List *otherNodesGroupList = AutoFailoverOtherNodesList(primaryNode);
	int otherNodesCount = list_length(otherNodesGroupList);

	/*
	 * when a first "other" node wants to become standby:
	 *  single -> wait_primary
	 */
	if (IsCurrentState(primaryNode, REPLICATION_STATE_SINGLE))
	{
		ListCell *nodeCell = NULL;

		foreach(nodeCell, otherNodesGroupList)
		{
			AutoFailoverNode *otherNode =
				(AutoFailoverNode *) lfirst(nodeCell);

			if (IsCurrentState(otherNode, REPLICATION_STATE_WAIT_STANDBY))
			{
				char message[BUFSIZE];

				LogAndNotifyMessage(
					message, BUFSIZE,
					"Setting goal state of " NODE_FORMAT
					" to wait_primary after " NODE_FORMAT " joined.",
					NODE_FORMAT_ARGS(primaryNode),
					NODE_FORMAT_ARGS(otherNode));

				/* prepare replication slot and pg_hba.conf */
				AssignGoalState(primaryNode,
								REPLICATION_STATE_WAIT_PRIMARY, message);

				return true;
			}
		}
	}

	/*
	 * when secondary unhealthy:
	 *  secondary ➜ catchingup
	 *  primary ➜ wait_primary
	 *
	 * We only switch the primary to wait_primary when there's no healthy
	 * secondary anymore. In other cases, there's by definition at least one
	 * candidate for failover.
	 *
	 * Also we might lose a standby node while already in WAIT_PRIMARY, when
	 * all the left standby nodes are assigned a candidatePriority of zero.
	 */
	if (IsCurrentState(primaryNode, REPLICATION_STATE_PRIMARY) ||
		IsCurrentState(primaryNode, REPLICATION_STATE_WAIT_PRIMARY) ||
		IsCurrentState(primaryNode, REPLICATION_STATE_APPLY_SETTINGS))
	{
		/*
		 * We count our nodes in different ways, because of special cases we
		 * want to be able to address. We want to distinguish nodes that are in
		 * the replication quorum, nodes that are secondary, and nodes that are
		 * secondary but do not participate in the quorum.
		 *
		 * - replicationQuorumCount is the count of nodes with
		 *   replicationQuorum true, whether or not those nodes are currently
		 *   in the SECONDARY state.
		 *
		 * - secondaryNodesCount is the count of nodes that are currently in
		 *   the SECONDARY state.
		 *
		 * - secondaryQuorumNodesCount is the count of nodes that are both
		 *   setup to participate in the replication quorum and also currently
		 *   in the SECONDARY state.
		 */
		int replicationQuorumCount = otherNodesCount;
		int secondaryNodesCount = otherNodesCount;
		int secondaryQuorumNodesCount = otherNodesCount;

		AutoFailoverFormation *formation =
			GetFormation(primaryNode->formationId);

		ListCell *nodeCell = NULL;

		/* start from otherNodesCount and decrement per-node as we classify */
		foreach(nodeCell, otherNodesGroupList)
		{
			AutoFailoverNode *otherNode =
				(AutoFailoverNode *) lfirst(nodeCell);

			/*
			 * We force secondary nodes to catching-up even if the node is on
			 * its way to being a secondary... unless it is currently in the
			 * reportLSN or join_secondary state, because in those states
			 * Postgres is stopped, waiting for the new primary to be
			 * available.
			 */
			if (otherNode->goalState == REPLICATION_STATE_SECONDARY &&
				otherNode->reportedState != REPLICATION_STATE_REPORT_LSN &&
				otherNode->reportedState != REPLICATION_STATE_JOIN_SECONDARY &&
				IsUnhealthy(otherNode))
			{
				char message[BUFSIZE];

				--secondaryNodesCount;
				--secondaryQuorumNodesCount;

				LogAndNotifyMessage(
					message, BUFSIZE,
					"Setting goal state of " NODE_FORMAT
					" to catchingup after it became unhealthy.",
					NODE_FORMAT_ARGS(otherNode));

				/* other node is behind, no longer eligible for promotion */
				AssignGoalState(otherNode,
								REPLICATION_STATE_CATCHINGUP, message);
			}
			else if (!IsCurrentState(otherNode, REPLICATION_STATE_SECONDARY))
			{
				--secondaryNodesCount;
				--secondaryQuorumNodesCount;
			}
			/* at this point we are left with nodes in SECONDARY state */
			else if (IsCurrentState(otherNode, REPLICATION_STATE_SECONDARY) &&
					 !otherNode->replicationQuorum)
			{
				--secondaryQuorumNodesCount;
			}

			/* now separately count nodes setup with replication quorum */
			if (!otherNode->replicationQuorum)
			{
				--replicationQuorumCount;
			}
		}

		/*
		 * Special case first: when given a setup where all the nodes are async
		 * (replicationQuorumCount == 0) we allow the "primary" state in almost
		 * all cases, knowing that synchronous_standby_names is still going to
		 * be computed as ''.
		 *
		 * That said, if we don't have a single node in the SECONDARY state, we
		 * still want to switch to WAIT_PRIMARY to show that something
		 * unexpected is happening.
		 */
		if (replicationQuorumCount == 0)
		{
			Assert(formation->number_sync_standbys == 0);

			ReplicationState primaryGoalState =
				secondaryNodesCount == 0 ?
				REPLICATION_STATE_WAIT_PRIMARY : REPLICATION_STATE_PRIMARY;

			if (primaryNode->goalState != primaryGoalState)
			{
				char message[BUFSIZE] = { 0 };

				LogAndNotifyMessage(
					message, BUFSIZE,
					"Setting goal state of " NODE_FORMAT
					" to %s because none of the secondary nodes"
					" are healthy at the moment.",
					NODE_FORMAT_ARGS(primaryNode),
					ReplicationStateGetName(primaryGoalState));

				AssignGoalState(primaryNode, primaryGoalState, message);

				return true;
			}

			/* when all nodes are async, we're done here */
			return true;
		}

		/*
		 * Disable synchronous replication to maintain availability.
		 *
		 * Note that we implement here a trade-off between availability (of
		 * writes) against durability of the written data. In the case when
		 * there's a single standby in the group, pg_auto_failover choice is to
		 * maintain availability of the service, including writes.
		 *
		 * In the case when the user has setup a replication quorum of 1 or
		 * more, then pg_auto_failover does not get in the way. You get what
		 * you ask for, which is a strong guarantee on durability.
		 *
		 * To have number_sync_standbys == 1, you need to have at least 2
		 * standby servers. To get to a point where writes are not possible
		 * anymore, there needs to be a point in time where 2 of the 2 standby
		 * nodes are unavailable. In that case, pg_auto_failover does not
		 * change the configured trade-offs. Writes are blocked until one of
		 * the two defective standby nodes is available again.
		 */
		if (!IsCurrentState(primaryNode, REPLICATION_STATE_WAIT_PRIMARY) &&
			secondaryQuorumNodesCount == 0)
		{
			/*
			 * Allow wait_primary when number_sync_standbys = 0, otherwise
			 * block writes on the primary.
			 */
			ReplicationState primaryGoalState =
				formation->number_sync_standbys == 0 ?
				REPLICATION_STATE_WAIT_PRIMARY : REPLICATION_STATE_PRIMARY;

			if (primaryNode->goalState != primaryGoalState)
			{
				char message[BUFSIZE] = { 0 };

				LogAndNotifyMessage(
					message, BUFSIZE,
					"Setting goal state of " NODE_FORMAT
					" to %s because none of the standby nodes in the quorum"
					" are healthy at the moment.",
					NODE_FORMAT_ARGS(primaryNode),
					ReplicationStateGetName(primaryGoalState));

				AssignGoalState(primaryNode, primaryGoalState, message);

				return true;
			}
		}

		/*
		 * when a node is wait_primary and has at least one healthy candidate
		 * secondary
		 *  wait_primary ➜ primary
		 */
		if (IsCurrentState(primaryNode, REPLICATION_STATE_WAIT_PRIMARY) &&
			secondaryQuorumNodesCount > 0)
		{
			char message[BUFSIZE] = { 0 };

			LogAndNotifyMessage(
				message, BUFSIZE,
				"Setting goal state of " NODE_FORMAT
				" to primary now that we have %d healthy "
				" secondary nodes in the quorum.",
				NODE_FORMAT_ARGS(primaryNode),
				secondaryQuorumNodesCount);

			AssignGoalState(primaryNode, REPLICATION_STATE_PRIMARY, message);

			return true;
		}

		/*
		 * when a node has changed its replication settings:
		 *  apply_settings ➜ wait_primary
		 *  apply_settings ➜ primary
		 *
		 * Even when we don't currently have healthy standby nodes to failover
		 * to, if the number_sync_standbys is greater than zero that means the
		 * user wants to block writes on the primary, and we do that by
		 * switching to the primary state after having applied replication
		 * settings. Think
		 *
		 *  $ pg_autoctl set formation number-sync-standbys 1
		 *
		 * during an incident to stop the amount of potential data loss.
		 */
		if (IsCurrentState(primaryNode, REPLICATION_STATE_APPLY_SETTINGS))
		{
			char message[BUFSIZE] = { 0 };

			ReplicationState primaryGoalState =
				formation->number_sync_standbys == 0 &&
				secondaryQuorumNodesCount == 0 ?
				REPLICATION_STATE_WAIT_PRIMARY : REPLICATION_STATE_PRIMARY;

			LogAndNotifyMessage(
				message, BUFSIZE,
				"Setting goal state of " NODE_FORMAT
				" to %s after it applied replication properties change.",
				NODE_FORMAT_ARGS(primaryNode),
				ReplicationStateGetName(primaryGoalState));

			AssignGoalState(primaryNode, primaryGoalState, message);

			return true;
		}

		return true;
	}

	/*
	 * We don't use the join_primary state any more, though for backwards
	 * compatibility if a node reports JOIN_PRIMARY well then we assign PRIMARY
	 * to the node. After all it might be that an operator upgrades while a
	 * node is in JOIN_PRIMARY and we certainly want to be able to handle the
	 * situation.
	 */
	if (IsCurrentState(primaryNode, REPLICATION_STATE_JOIN_PRIMARY))
	{
		char message[BUFSIZE] = { 0 };

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting goal state of " NODE_FORMAT " to primary",
			NODE_FORMAT_ARGS(primaryNode));

		AssignGoalState(primaryNode, REPLICATION_STATE_PRIMARY, message);

		return true;
	}

	return false;
}


/*
 * ProceedGroupStateForMSFailover implements Group State Machine transition to
 * orchestrate a failover when we have more than one standby.
 *
 * This function is supposed to be called when the following pre-conditions are
 * met:
 *
 * - the primary node is not healthy
 * - there's more than one standby node registered in the system
 */
static bool
ProceedGroupStateForMSFailover(AutoFailoverNode *activeNode,
							   AutoFailoverNode *primaryNode)
{
	List *nodesGroupList = AutoFailoverNodeGroup(activeNode->formationId,
												 activeNode->groupId);

	CandidateList candidateList = { 0 };

	/*
	 * Done with the single standby code path, now we have several standby
	 * nodes that might all be candidate for failover, or just some of them.
	 *
	 * The first order of business though is to determine if a failover is
	 * currently happening, by looping over all the nodes in case one of them
	 * has already been selected as the failover candidate.
*/ AutoFailoverNode *nodeBeingPromoted = FindCandidateNodeBeingPromoted(nodesGroupList); /* * If a failover is in progress, continue driving it. */ if (nodeBeingPromoted != NULL) { char message[BUFSIZE] = { 0 }; List *knownUnreachableStates = list_make2_int(REPLICATION_STATE_REPORT_LSN, REPLICATION_STATE_PREPARE_PROMOTION); /* activeNode might be the failover candidate, proceed already */ if (nodeBeingPromoted->nodeId == activeNode->nodeId) { return ProceedWithMSFailover(activeNode, nodeBeingPromoted); } LogAndNotifyMessage( message, BUFSIZE, "Active " NODE_FORMAT " found failover candidate " NODE_FORMAT " being promoted (currently \"%s\"/\"%s\")", NODE_FORMAT_ARGS(activeNode), NODE_FORMAT_ARGS(nodeBeingPromoted), ReplicationStateGetName(nodeBeingPromoted->reportedState), ReplicationStateGetName(nodeBeingPromoted->goalState)); /* * The currently selected node might not be marked healthy at this time * because in REPORT_LSN we shut Postgres down. We still should proceed * with the previously selected node in that case. * * We really need to avoid having two candidates at the same time, and * again, at prepare_promotion point Postgres might not have been * started yet. */ if (IsStateIn(nodeBeingPromoted->reportedState, knownUnreachableStates) || IsHealthy(nodeBeingPromoted)) { elog(LOG, "Found candidate " NODE_FORMAT, NODE_FORMAT_ARGS(nodeBeingPromoted)); return ProceedWithMSFailover(activeNode, nodeBeingPromoted); } } /* * Now, have all our candidates for failover report the most recent LSN * they managed to receive. We build the list of nodes that we consider as * failover candidates into candidateNodesGroupList. * * When every one of the nodes in that list has reported its LSN position, * then we select a node from the just built candidateNodesGroupList to * promote. * * It might well be that in this call to node_active() only a part of the * candidates have reported their LSN position yet. 
Then we refrain from * selecting any in this round, expecting a future call to node_active() to * be the kicker. * * This design also allows for nodes to concurrently be put to maintenance * or get unhealthy: then the next call to node_active() might build a * different candidateNodesGroupList in which every node has reported their * LSN position, allowing progress to be made. */ char *formationId = activeNode->formationId; AutoFailoverFormation *formation = GetFormation(formationId); candidateList.numberSyncStandbys = formation->number_sync_standbys; BuildCandidateList(nodesGroupList, &candidateList); /* * Time to select a candidate? * * We reach this code when we don't have an healthy primary anymore, it's * been demoted or is draining now. Most probably it's dead. * * Before we enter the selection process, we must have collected the last * received LSN from ALL the standby nodes that are considered as a * candidate (thanks to the FSM transition secondary -> report_lsn), and * now we need to select one of the failover candidates. */ if (candidateList.missingNodesCount > 0) { char message[BUFSIZE] = { 0 }; LogAndNotifyMessage( message, BUFSIZE, "Failover still in progress after %d nodes reported their LSN " "and we are waiting for %d nodes to report, " "activeNode is " NODE_FORMAT " and reported state \"%s\"", candidateList.candidateCount, candidateList.missingNodesCount, NODE_FORMAT_ARGS(activeNode), ReplicationStateGetName(activeNode->reportedState)); return false; } /* * So all the expected candidates did report their LSN, no node is missing. * Let's see about selecting a candidate for failover now, when we do have * candidates. * * To start the selection process, we require at least number_sync_standbys * nodes to have reported their LSN and be currently healthy, otherwise we * won't be able to maintain our guarantees: we would end-up with a node in * WAIT_PRIMARY state with all the writes blocked for lack of standby * nodes. 
*/ int minCandidates = formation->number_sync_standbys + 1; /* no candidates is a hard pass */ if (candidateList.candidateCount == 0) { return false; } /* not enough candidates to promote and then accept writes, pass */ else if (candidateList.quorumCandidateCount < minCandidates) { char message[BUFSIZE] = { 0 }; LogAndNotifyMessage( message, BUFSIZE, "Failover still in progress with %d candidates that participate " "in the quorum having reported their LSN: %d nodes are required " "in the quorum to satisfy number_sync_standbys=%d in " "formation \"%s\", activeNode is " NODE_FORMAT " and reported state \"%s\"", candidateList.quorumCandidateCount, minCandidates, formation->number_sync_standbys, formation->formationId, NODE_FORMAT_ARGS(activeNode), ReplicationStateGetName(activeNode->reportedState)); return false; } /* enough candidates to promote and then accept writes, let's do it! */ else { /* build the list of most advanced standby nodes, not ordered */ List *mostAdvancedNodeList = ListMostAdvancedStandbyNodes(nodesGroupList); /* select a node to failover to */ /* * standbyNodesGroupList contains at least 2 nodes: we're in the * process of selecting a candidate for failover. Then * mostAdvancedNodeList is expected to always contain at least one * node, the one with the most advanced reportedLSN, and maybe it * contains more than one node. 
*/ if (list_length(mostAdvancedNodeList) > 0) { AutoFailoverNode *mostAdvancedNode = (AutoFailoverNode *) linitial(mostAdvancedNodeList); char message[BUFSIZE] = { 0 }; candidateList.mostAdvancedNodesGroupList = mostAdvancedNodeList; candidateList.mostAdvancedReportedLSN = mostAdvancedNode->reportedLSN; LogAndNotifyMessage( message, BUFSIZE, "The current most advanced reported LSN is %X/%X, " "as reported by " NODE_FORMAT " and %d other nodes", (uint32) (mostAdvancedNode->reportedLSN >> 32), (uint32) mostAdvancedNode->reportedLSN, NODE_FORMAT_ARGS(mostAdvancedNode), list_length(mostAdvancedNodeList) - 1); } else { ereport(ERROR, (errmsg("BUG: mostAdvancedNodeList is empty"))); } AutoFailoverNode *selectedNode = SelectFailoverCandidateNode(&candidateList, primaryNode); /* we might not have a selected candidate for failover yet */ if (selectedNode == NULL) { /* * Publish more information about the process in the monitor event * table. This is a quite complex mechanism here, and it should be * made as easy as possible to analyze and debug. */ char message[BUFSIZE] = { 0 }; LogAndNotifyMessage( message, BUFSIZE, "Failover still in progress after all %d candidate nodes " "reported their LSN and we failed to select one of them; " "activeNode is " NODE_FORMAT " and reported state \"%s\"", candidateList.candidateCount, NODE_FORMAT_ARGS(activeNode), ReplicationStateGetName(activeNode->reportedState)); return false; } return PromoteSelectedNode(selectedNode, primaryNode, &candidateList); } return false; } /* * BuildCandidateList builds the list of current standby candidates that have * already reported their LSN, and sets nodes that should be reporting to the * REPORT_LSN goal state. 
* * A CandidateList keeps track of the list of candidate nodes, the list of most * advanced nodes (in terms of LSN positions), and two counters, the count of * candidate nodes (that's the length of the first list) and the count of nodes * that are due to report their LSN but didn't yet, named the * missingNodesCount. * * Managing the missingNodesCount allows a better message to be printed by the * monitor and prevents early failover: when missingNodesCount > 0 then the * caller for BuildCandidateList knows to refrain from any decision making. */ static bool BuildCandidateList(List *nodesGroupList, CandidateList *candidateList) { ListCell *nodeCell = NULL; List *candidateNodesGroupList = NIL; List *secondaryStates = list_make2_int(REPLICATION_STATE_SECONDARY, REPLICATION_STATE_CATCHINGUP); foreach(nodeCell, nodesGroupList) { AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell); if (node == NULL) { /* shouldn't happen */ ereport(ERROR, (errmsg("BUG: node is NULL"))); continue; } /* * Skip old and new primary nodes (if a selection has been made). * * When a failover is ongoing, a former primary node that has reached * DRAINING and is reporting should be asked to report their LSN. */ if ((IsInPrimaryState(node) || IsBeingDemotedPrimary(node) || IsDemotedPrimary(node)) && !(IsCurrentState(node, REPLICATION_STATE_DRAINING) || IsCurrentState(node, REPLICATION_STATE_DEMOTED))) { elog(LOG, "Skipping candidate " NODE_FORMAT ", which is a primary (old or new)", NODE_FORMAT_ARGS(node)); continue; } /* * Skip unhealthy nodes to avoid having to wait for them to report, * unless the node is unhealthy because Postgres is down, but * pg_autoctl is still reporting. 
*/ if (IsUnhealthy(node) && !IsReporting(node)) { elog(LOG, "Skipping candidate " NODE_FORMAT ", which is unhealthy", NODE_FORMAT_ARGS(node)); /* * When a secondary node is now down, and had already reported its * LSN, then it's not "missing": we have its LSN and are able to * continue with the election mechanism. * * Otherwise, we didn't get its LSN and this node might be (one of) * the most advanced LSN. Picking it now might lead to loosing * commited data that was reported to the client connection, if * this node is the only one with the most advanted LSN. * * Only the nodes that participate in the quorum are required to * report their LSN, because only those nodes are waited by * Postgres to report a commit to the client connection. */ if (node->replicationQuorum && node->reportedState != REPLICATION_STATE_REPORT_LSN) { ++(candidateList->missingNodesCount); } continue; } /* * Grab healthy standby nodes which have reached REPORT_LSN. */ if (IsCurrentState(node, REPLICATION_STATE_REPORT_LSN)) { candidateNodesGroupList = lappend(candidateNodesGroupList, node); /* when number_sync_standbys is zero, quorum isn't discriminant */ if (node->replicationQuorum || candidateList->numberSyncStandbys == 0) { ++(candidateList->quorumCandidateCount); } continue; } /* if REPORT LSN is assigned and not reached yet, count that */ if (node->goalState == REPLICATION_STATE_REPORT_LSN) { ++(candidateList->missingNodesCount); continue; } /* * Nodes in SECONDARY or CATCHINGUP states are candidates due to report * their LSN. Also old primary nodes in DEMOTED state are due to report * now. And also old primary nodes in DRAINING state, when the drain * timeout is over, are due to report. * * When a node has been asked to re-join the group after a maintenance * period, and been assigned catching-up but failed to connect to the * primary, and a failover now happens, we need that node to join the * REPORT_LSN crew. 
* * Finally, another interesting case for us here would be a node that * has been asked to re-join a newly elected primary, but the newly * elected primary has now failed and we're in the election process to * replace it. Then demoted/catchingup has been assigned, but there is * no primary to catch-up to anymore, join the REPORT_LSN crew. */ if ((IsStateIn(node->reportedState, secondaryStates) && IsStateIn(node->goalState, secondaryStates)) || (node->reportedState == REPLICATION_STATE_MAINTENANCE && node->goalState == REPLICATION_STATE_CATCHINGUP) || ((IsCurrentState(node, REPLICATION_STATE_DRAINING) || IsCurrentState(node, REPLICATION_STATE_DEMOTED) || (node->reportedState == REPLICATION_STATE_DEMOTED && node->goalState == REPLICATION_STATE_CATCHINGUP)))) { char message[BUFSIZE] = { 0 }; ++(candidateList->missingNodesCount); LogAndNotifyMessage( message, BUFSIZE, "Setting goal state of " NODE_FORMAT " to report_lsn to find the failover candidate", NODE_FORMAT_ARGS(node)); AssignGoalState(node, REPLICATION_STATE_REPORT_LSN, message); continue; } } candidateList->candidateNodesGroupList = candidateNodesGroupList; candidateList->candidateCount = list_length(candidateNodesGroupList); return true; } /* * ProceedWithMSFailover drives a failover forward when we already have a * failover candidate. It might be the first time we just found/elected a * candidate, or one subsequent call to node_active() when then failover is * already being orchestrated. * * Here we have choosen a failover candidate, which is either being * promoted to being the new primary (when it already had all the most * recent WAL, or is done fetching them), or is fetching the most recent * WAL it's still missing from another standby node. 
*/ static bool ProceedWithMSFailover(AutoFailoverNode *activeNode, AutoFailoverNode *candidateNode) { Assert(candidateNode != NULL); /* * When the activeNode is "just" another standby which did REPORT LSN, we * stop replication as soon as possible, and later follow the new primary, * as soon as it's ready. */ if (IsCurrentState(activeNode, REPLICATION_STATE_REPORT_LSN) && CandidateNodeIsReadyToStreamWAL(candidateNode)) { char message[BUFSIZE]; LogAndNotifyMessage( message, BUFSIZE, "Setting goal state of " NODE_FORMAT " to join_secondary after " NODE_FORMAT " got selected as the failover candidate.", NODE_FORMAT_ARGS(activeNode), NODE_FORMAT_ARGS(candidateNode)); AssignGoalState(activeNode, REPLICATION_STATE_JOIN_SECONDARY, message); return true; } /* when we have a candidate, we don't go through finding a candidate */ return false; } /* * SelectFailoverCandidateNode returns the candidate to failover to when we * have one already. * * The selection is based on candidatePriority. If the candidate with the * higher priority doesn't have the most recent LSN, we have it fetch the * missing WAL bits from one of the standby which did receive them. * * Before we enter the selection process, we must have collected the last * received LSN from ALL the standby nodes that are considered as a candidate * (thanks to the FSM transition secondary -> report_lsn), and now we need to * select one of the failover candidates. * * As input we get the candidateNodesGroupList, a filtered list of standby that * are known to be a failover candidate from an earlier filtering process. We * also get the mostAdvancedNode and the primaryNode so that we can decide on * the next step (cascade WALs or promote directly). */ static AutoFailoverNode * SelectFailoverCandidateNode(CandidateList *candidateList, AutoFailoverNode *primaryNode) { /* * Build the list of failover candidate nodes, ordered by priority. * Nodes with candidatePriority == 0 are skipped in GroupListCandidates. 
*/ List *sortedCandidateNodesGroupList = GroupListCandidates(candidateList->candidateNodesGroupList); /* it's only one of the most advanced nodes, a reference to compare LSN */ AutoFailoverNode *mostAdvancedNode = (AutoFailoverNode *) linitial(candidateList->mostAdvancedNodesGroupList); /* the goal in this function is to find this one */ AutoFailoverNode *selectedNode = NULL; ListCell *nodeCell = NULL; /* * We refuse to orchestrate a failover that would have us lose more data * than is configured on the monitor. Both when using sync and async * replication we have the same situation that could happen, where the most * advanced standby node in the system is lagging behind the primary and * promoting it would incur data loss. * * In sync replication, that happens when the primary has been waiting for * a large chunk of WAL bytes to be reported. In async, the only difference * is that the primary did not wait. * * In terms of client-side guarantees, it's a big difference. In term of * data durability, it's the same thing. * * For this situation to change, users will have to either re-live the * unhealthy primary or change the * pgautofailover.enable_sync_wal_log_threshold GUC to a larger value and * thus explicitely accept data loss. 
*/ if (primaryNode && !WalDifferenceWithin(mostAdvancedNode, primaryNode, PromoteXlogThreshold)) { char message[BUFSIZE] = { 0 }; LogAndNotifyMessage( message, BUFSIZE, "One of the most advanced standby nodes in the group " "is " NODE_FORMAT "with reported LSN %X/%X, which is more than " "pgautofailover.enable_sync_wal_log_threshold (%d) behind " "the primary " NODE_FORMAT ", which has reported %X/%X", NODE_FORMAT_ARGS(mostAdvancedNode), (uint32) (mostAdvancedNode->reportedLSN >> 32), (uint32) mostAdvancedNode->reportedLSN, PromoteXlogThreshold, NODE_FORMAT_ARGS(primaryNode), (uint32) (primaryNode->reportedLSN >> 32), (uint32) primaryNode->reportedLSN); return NULL; } /* * Select the node to be promoted: we can pick any candidate with the * max priority, so we pick the one with the most advanced LSN among * those having max(candidate priority). */ foreach(nodeCell, sortedCandidateNodesGroupList) { AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell); /* all the candidates are now in the REPORT_LSN state */ if (IsUnhealthy(node)) { char message[BUFSIZE]; LogAndNotifyMessage( message, BUFSIZE, "Not selecting failover candidate " NODE_FORMAT "because it is unhealthy", NODE_FORMAT_ARGS(node)); continue; } else { int cPriority = node->candidatePriority; XLogRecPtr cLSN = node->reportedLSN; if (selectedNode == NULL) { selectedNode = node; } else if (cPriority == selectedNode->candidatePriority && cLSN > selectedNode->reportedLSN) { selectedNode = node; } else if (cPriority < selectedNode->candidatePriority) { /* * Short circuit the loop, as we scan in decreasing * priority order. */ break; } } } /* * Now we may have a selectedNode. We need to check that either it has all * the WAL needed, or that at least one of the nodes with all the WAL * needed is healthy right now. 
*/ if (selectedNode && selectedNode->reportedLSN < candidateList->mostAdvancedReportedLSN) { bool someMostAdvancedStandbysAreHealthy = false; foreach(nodeCell, candidateList->mostAdvancedNodesGroupList) { AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell); if (IsHealthy(node)) { someMostAdvancedStandbysAreHealthy = true; break; } } if (!someMostAdvancedStandbysAreHealthy) { char message[BUFSIZE] = { 0 }; LogAndNotifyMessage( message, BUFSIZE, "The selected candidate " NODE_FORMAT " needs to fetch missing " "WAL to reach LSN %X/%X (from current reported LSN %X/%X) " "and none of the most advanced standby nodes are healthy " "at the moment.", NODE_FORMAT_ARGS(selectedNode), (uint32) (mostAdvancedNode->reportedLSN >> 32), (uint32) mostAdvancedNode->reportedLSN, (uint32) (selectedNode->reportedLSN >> 32), (uint32) selectedNode->reportedLSN); return NULL; } } return selectedNode; } /* * PromoteSelectedNode assigns goal state to the selected node to failover to. */ static bool PromoteSelectedNode(AutoFailoverNode *selectedNode, AutoFailoverNode *primaryNode, CandidateList *candidateList) { /* selectedNode can't be NULL here */ if (selectedNode == NULL) { ereport(ERROR, (errmsg("BUG: selectedNode is NULL in PromoteSelectedNode"))); } /* * Ok so we now may start the failover process, we have selected a * candidate after all nodes reported their LSN. We still have two * possible situations here: * * - if the selected candidate has all the WAL bytes, promote it * already * * - if the selected candidate is lagging, we ask it to connect to a * standby that has not been selected and grab missing WAL bytes from * there * * When the perform_promotion API has been used to promote a specific node * in the system then its candidate priority has been incremented by 100. * Now is the time to reset it. 
*/ if (selectedNode->candidatePriority > MAX_USER_DEFINED_CANDIDATE_PRIORITY) { char message[BUFSIZE] = { 0 }; selectedNode->candidatePriority -= CANDIDATE_PRIORITY_INCREMENT; ReportAutoFailoverNodeReplicationSetting( selectedNode->nodeId, selectedNode->nodeHost, selectedNode->nodePort, selectedNode->candidatePriority, selectedNode->replicationQuorum); LogAndNotifyMessage( message, BUFSIZE, "Updating candidate priority back to %d for " NODE_FORMAT, selectedNode->candidatePriority, NODE_FORMAT_ARGS(selectedNode)); NotifyStateChange(selectedNode, message); } /* * When a failover is performed with all the nodes up and running, we tweak * the priority of the primary in a way that prevents its re-election. Now * that the election is done, it's time to reset the primary priority back * to its former value. * * As the primaryNode parameter might be NULL, we loop over all the * candidates and reset any negative priority found in the list. */ if (candidateList->candidateNodesGroupList != NULL) { ListCell *nodeCell = NULL; foreach(nodeCell, candidateList->candidateNodesGroupList) { AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell); if (node == NULL) { /* shouldn't happen */ ereport(ERROR, (errmsg("BUG: node is NULL"))); continue; } if (node->candidatePriority < 0) { char message[BUFSIZE] = { 0 }; node->candidatePriority += CANDIDATE_PRIORITY_INCREMENT; ReportAutoFailoverNodeReplicationSetting( node->nodeId, node->nodeHost, node->nodePort, node->candidatePriority, node->replicationQuorum); LogAndNotifyMessage( message, BUFSIZE, "Updating candidate priority back to %d for " NODE_FORMAT, node->candidatePriority, NODE_FORMAT_ARGS(node)); NotifyStateChange(node, message); } } } if (selectedNode->reportedLSN == candidateList->mostAdvancedReportedLSN) { char message[BUFSIZE] = { 0 }; if (primaryNode) { LogAndNotifyMessage( message, BUFSIZE, "Setting goal state of " NODE_FORMAT " to prepare_promotion after " NODE_FORMAT " became unhealthy and %d nodes reported their LSN 
position.", NODE_FORMAT_ARGS(selectedNode), NODE_FORMAT_ARGS(primaryNode), candidateList->candidateCount); } else { LogAndNotifyMessage( message, BUFSIZE, "Setting goal state of " NODE_FORMAT " to prepare_promotion and %d nodes reported their LSN position.", NODE_FORMAT_ARGS(selectedNode), candidateList->candidateCount); } AssignGoalState(selectedNode, REPLICATION_STATE_PREPARE_PROMOTION, message); /* leave the other nodes in ReportLSN state for now */ return true; } else { char message[BUFSIZE] = { 0 }; if (primaryNode) { LogAndNotifyMessage( message, BUFSIZE, "Setting goal state of " NODE_FORMAT " to fast_forward after " NODE_FORMAT " became unhealthy and %d nodes reported their LSN position.", NODE_FORMAT_ARGS(selectedNode), NODE_FORMAT_ARGS(primaryNode), candidateList->candidateCount); } else { LogAndNotifyMessage( message, BUFSIZE, "Setting goal state of " NODE_FORMAT " to fast_forward after %d nodes reported their LSN position.", NODE_FORMAT_ARGS(selectedNode), candidateList->candidateCount); } AssignGoalState(selectedNode, REPLICATION_STATE_FAST_FORWARD, message); return true; } } /* * AssignGoalState assigns a new goal state to a AutoFailover node. */ static void AssignGoalState(AutoFailoverNode *pgAutoFailoverNode, ReplicationState state, char *description) { if (pgAutoFailoverNode != NULL) { SetNodeGoalState(pgAutoFailoverNode, state, description); } } /* * WalDifferenceWithin returns whether the most recently reported relative log * position of the given nodes is within the specified bound. Returns false if * neither node has reported a relative xlog position. * * Returns false when the nodes are not on the same reported timeline. 
*/ static bool WalDifferenceWithin(AutoFailoverNode *secondaryNode, AutoFailoverNode *otherNode, int64 delta) { if (secondaryNode == NULL || otherNode == NULL) { return true; } XLogRecPtr secondaryLsn = secondaryNode->reportedLSN; XLogRecPtr otherNodeLsn = otherNode->reportedLSN; if (secondaryLsn == 0 || otherNodeLsn == 0) { /* we don't have any data yet */ return false; } int64 walDifference = Abs(otherNodeLsn - secondaryLsn); return walDifference <= delta; } pg_auto_failover-1.6.3/src/monitor/group_state_machine.h000066400000000000000000000021241414244367200235230ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/group_state_machine.h * * Declarations for public functions and types related to a group state * machine. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * *------------------------------------------------------------------------- */ #pragma once #include "postgres.h" #include "access/xlogdefs.h" #include "node_metadata.h" /* * AutoFailoverNodeState describes the current state of a node in a group. 
*/ typedef struct AutoFailoverNodeState { int64 nodeId; int32 groupId; ReplicationState replicationState; int32 reportedTLI; XLogRecPtr reportedLSN; SyncState pgsrSyncState; bool pgIsRunning; int candidatePriority; bool replicationQuorum; } AutoFailoverNodeState; /* public function declarations */ extern bool ProceedGroupState(AutoFailoverNode *activeNode); /* GUCs */ extern int EnableSyncXlogThreshold; extern int PromoteXlogThreshold; extern int DrainTimeoutMs; extern int UnhealthyTimeoutMs; extern int StartupGracePeriodMs; pg_auto_failover-1.6.3/src/monitor/health_check.h000066400000000000000000000033421414244367200221100ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/health_check_metadata.h * * Declarations for public functions and types related to health check * metadata. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * *------------------------------------------------------------------------- */ #pragma once #include "postgres.h" #include "access/htup.h" #include "access/tupdesc.h" #include "nodes/pg_list.h" /* * NodeHealthState represents the last-known health state of a node after * the last round of health checks. */ typedef enum { NODE_HEALTH_UNKNOWN = -1, NODE_HEALTH_BAD = 0, NODE_HEALTH_GOOD = 1 } NodeHealthState; /* * NodeHealth represents a node that is to be health-checked and its last-known * health state. 
*/ typedef struct NodeHealth { int64 nodeId; char *nodeName; char *nodeHost; int nodePort; NodeHealthState healthState; } NodeHealth; /* GUCs to configure health checks */ extern bool HealthChecksEnabled; extern int HealthCheckPeriod; extern int HealthCheckTimeout; extern int HealthCheckMaxRetries; extern int HealthCheckRetryDelay; extern void InitializeHealthCheckWorker(void); extern void HealthCheckWorkerMain(Datum arg); extern void HealthCheckWorkerLauncherMain(Datum arg); extern List * LoadNodeHealthList(void); extern NodeHealth * TupleToNodeHealth(HeapTuple heapTuple, TupleDesc tupleDescriptor); extern void SetNodeHealthState(int64 nodeId, char *nodeName, char *nodeHost, uint16 nodePort, int previousHealthState, int healthState); extern void StopHealthCheckWorker(Oid databaseId); extern char * NodeHealthToString(NodeHealthState health); pg_auto_failover-1.6.3/src/monitor/health_check_metadata.c000066400000000000000000000163011414244367200237420ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/health_check_metadata.c * * Implementation of functions related to health check metadata. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* *------------------------------------------------------------------------- */ #include "postgres.h" #include "miscadmin.h" #include "health_check.h" #include "metadata.h" #include "notifications.h" #include "access/htup.h" #include "access/tupdesc.h" #include "access/xact.h" #include "commands/extension.h" #include "executor/spi.h" #include "lib/stringinfo.h" #include "nodes/pg_list.h" #include "pgstat.h" #include "utils/builtins.h" #include "utils/memutils.h" #include "utils/snapmgr.h" /* human-readable names for addressing columns of health check queries */ #define TLIST_NUM_NODE_ID 1 #define TLIST_NUM_NODE_NAME 2 #define TLIST_NUM_NODE_HOST 3 #define TLIST_NUM_NODE_PORT 4 #define TLIST_NUM_HEALTH_STATUS 5 /* GUCs */ bool HealthChecksEnabled = true; static bool HaMonitorHasBeenLoaded(void); static void StartSPITransaction(void); static void EndSPITransaction(void); /* * LoadNodeHealthList loads a list of nodes of which to check the health. */ List * LoadNodeHealthList(void) { List *nodeHealthList = NIL; int spiStatus PG_USED_FOR_ASSERTS_ONLY = 0; StringInfoData query; MemoryContext upperContext = CurrentMemoryContext, oldContext = NULL; if (!HealthChecksEnabled) { return NIL; } StartSPITransaction(); if (HaMonitorHasBeenLoaded()) { initStringInfo(&query); appendStringInfo(&query, "SELECT nodeid, nodename, nodehost, nodeport, health " "FROM " AUTO_FAILOVER_NODE_TABLE); pgstat_report_activity(STATE_RUNNING, query.data); spiStatus = SPI_execute(query.data, false, 0); /* * When we start the monitor during an upgrade (from 1.3 to 1.4), the * background worker might be reading the 1.3 pgautofailover catalogs * still, where the "nodehost" column does not exist. 
*/ if (spiStatus != SPI_OK_SELECT) { EndSPITransaction(); return NIL; } oldContext = MemoryContextSwitchTo(upperContext); for (uint64 rowNumber = 0; rowNumber < SPI_processed; rowNumber++) { HeapTuple heapTuple = SPI_tuptable->vals[rowNumber]; NodeHealth *nodeHealth = TupleToNodeHealth(heapTuple, SPI_tuptable->tupdesc); nodeHealthList = lappend(nodeHealthList, nodeHealth); } MemoryContextSwitchTo(oldContext); } EndSPITransaction(); MemoryContextSwitchTo(upperContext); return nodeHealthList; } /* * HaMonitorHasBeenLoaded returns true if the extension has been created * in the current database and the extension script has been executed. Otherwise, * it returns false. The result is cached as this is called very frequently. */ static bool HaMonitorHasBeenLoaded(void) { bool extensionPresent = false; bool extensionScriptExecuted = true; Oid extensionOid = get_extension_oid(AUTO_FAILOVER_EXTENSION_NAME, true); if (extensionOid != InvalidOid) { extensionPresent = true; } if (extensionPresent) { /* check if pg_cron extension objects are still being created */ if (creating_extension && CurrentExtensionObject == extensionOid) { extensionScriptExecuted = false; } else if (IsBinaryUpgrade) { extensionScriptExecuted = false; } } bool extensionLoaded = extensionPresent && extensionScriptExecuted; return extensionLoaded; } /* * TupleToNodeHealth constructs a node health description from a heap tuple obtained * via SPI. 
*/ NodeHealth * TupleToNodeHealth(HeapTuple heapTuple, TupleDesc tupleDescriptor) { bool isNull = false; Datum nodeIdDatum = SPI_getbinval(heapTuple, tupleDescriptor, TLIST_NUM_NODE_ID, &isNull); Datum nodeNameDatum = SPI_getbinval(heapTuple, tupleDescriptor, TLIST_NUM_NODE_NAME, &isNull); Datum nodeHostDatum = SPI_getbinval(heapTuple, tupleDescriptor, TLIST_NUM_NODE_HOST, &isNull); Datum nodePortDatum = SPI_getbinval(heapTuple, tupleDescriptor, TLIST_NUM_NODE_PORT, &isNull); Datum healthStateDatum = SPI_getbinval(heapTuple, tupleDescriptor, TLIST_NUM_HEALTH_STATUS, &isNull); NodeHealth *nodeHealth = palloc0(sizeof(NodeHealth)); nodeHealth->nodeId = DatumGetInt64(nodeIdDatum); nodeHealth->nodeName = TextDatumGetCString(nodeNameDatum); nodeHealth->nodeHost = TextDatumGetCString(nodeHostDatum); nodeHealth->nodePort = DatumGetInt32(nodePortDatum); nodeHealth->healthState = DatumGetInt32(healthStateDatum); return nodeHealth; } /* * SetNodeHealthState updates the health state of a node in the metadata. */ void SetNodeHealthState(int64 nodeId, char *nodeName, char *nodeHost, uint16 nodePort, int previousHealthState, int healthState) { StringInfoData query; int spiStatus PG_USED_FOR_ASSERTS_ONLY = 0; MemoryContext upperContext = CurrentMemoryContext; StartSPITransaction(); if (HaMonitorHasBeenLoaded()) { initStringInfo(&query); appendStringInfo(&query, "UPDATE " AUTO_FAILOVER_NODE_TABLE " SET health = %d, healthchecktime = now() " " WHERE nodeid = %lld " " AND nodehost = %s AND nodeport = %d " " RETURNING node.*", healthState, (long long) nodeId, quote_literal_cstr(nodeHost), nodePort); pgstat_report_activity(STATE_RUNNING, query.data); spiStatus = SPI_execute(query.data, false, 0); Assert(spiStatus == SPI_OK_UPDATE_RETURNING); /* * We should have 0 or 1 row impacted, because of pkey on nodeid. We * might have updated zero rows when a node is concurrently being * DELETEd, because of the default REPETEABLE READ isolation level. 
*/ if (SPI_processed == 1) { if (healthState != previousHealthState) { HeapTuple heapTuple = SPI_tuptable->vals[0]; AutoFailoverNode *pgAutoFailoverNode = TupleToAutoFailoverNode(SPI_tuptable->tupdesc, heapTuple); char message[BUFSIZE] = { 0 }; LogAndNotifyMessage(message, sizeof(message), "Node " NODE_FORMAT " is marked as %s by the monitor", NODE_FORMAT_ARGS(pgAutoFailoverNode), healthState == 0 ? "unhealthy" : "healthy"); NotifyStateChange(pgAutoFailoverNode, message); } } } else { /* extension has been dropped, just skip the update */ } EndSPITransaction(); MemoryContextSwitchTo(upperContext); } /* * StartSPITransaction starts a transaction using SPI. */ static void StartSPITransaction(void) { SetCurrentStatementStartTimestamp(); StartTransactionCommand(); SPI_connect(); PushActiveSnapshot(GetTransactionSnapshot()); } /* * EndSPITransaction finishes a transaction that was started using SPI. */ static void EndSPITransaction(void) { pgstat_report_activity(STATE_IDLE, NULL); SPI_finish(); PopActiveSnapshot(); CommitTransactionCommand(); } /* * NodeHealthToString returns a string representation of the given node health * enum value. */ char * NodeHealthToString(NodeHealthState health) { switch (health) { case NODE_HEALTH_UNKNOWN: { return "unknown"; } case NODE_HEALTH_BAD: { return "bad"; } case NODE_HEALTH_GOOD: { return "good"; } default: { /* shouldn't happen */ ereport(ERROR, (errmsg("BUG: health is %d", health))); return "unknown"; } } } pg_auto_failover-1.6.3/src/monitor/health_check_worker.c000066400000000000000000000736771414244367200235160ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/health_check_worker.c * * Implementation of the health check worker. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* *------------------------------------------------------------------------- */ #include "postgres.h" /* these are internal headers */ #include "health_check.h" #include "metadata.h" #include "version_compat.h" /* these are always necessary for a bgworker */ #include "access/heapam.h" #include "access/htup_details.h" #include "access/xact.h" #include "catalog/pg_database.h" #include "commands/extension.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/bgworker.h" #include "storage/ipc.h" #include "storage/latch.h" #include "storage/lmgr.h" #include "storage/lwlock.h" #include "storage/proc.h" #include "storage/shmem.h" /* these headers are used by this particular worker's code */ #include "fmgr.h" #include "lib/stringinfo.h" #include "libpq-fe.h" #include "libpq-int.h" #include "libpq/pqsignal.h" #include "poll.h" #include "sys/time.h" #include "utils/builtins.h" #include "utils/memutils.h" #include "tcop/utility.h" /* * The healthcheck only checks if it gets a response back from postgres. So * both user and password are actually useless, because we do not need to * authenticate. They are provided though to override any settings set through * PGPASSWORD environment variable or .pgpass file. This way it does not matter * that TLS is not necessarily used, because no secret information is sent. */ #define CONN_INFO_TEMPLATE \ "host=%s port=%u user=pgautofailover_monitor " \ "password=pgautofailover_monitor dbname=postgres " \ "connect_timeout=%u" #define MAX_CONN_INFO_SIZE 1024 #define CANNOT_CONNECT_NOW "57P03" typedef enum { HEALTH_CHECK_INITIAL = 0, HEALTH_CHECK_CONNECTING = 1, HEALTH_CHECK_OK = 2, HEALTH_CHECK_RETRY = 3, HEALTH_CHECK_DEAD = 4 } HealthCheckState; typedef struct HealthCheck { NodeHealth *node; HealthCheckState state; PGconn *connection; bool readyToPoll; PostgresPollingStatusType pollingStatus; int numTries; struct timeval nextEventTime; } HealthCheck; /* * Shared memory data for all maintenance workers. 
*/ typedef struct HealthCheckHelperControlData { /* * Lock protecting the shared memory state. This is to be taken when * looking up (shared mode) or inserting (exclusive mode) per-database * data in HealthCheckWorkerDBHash. */ int trancheId; char *lockTrancheName; LWLock lock; } HealthCheckHelperControlData; /* * Per database worker state. */ typedef struct HealthCheckHelperDatabase { /* hash key: database to run on */ Oid dboid; pid_t workerPid; BackgroundWorkerHandle *handle; } HealthCheckHelperDatabase; typedef struct DatabaseListEntry { Oid dboid; char *dbname; } DatabaseListEntry; /* * Hash-table of workers, one entry for each database with pg_auto_failover * activated, and a lock to protect access to it. */ static HTAB *HealthCheckWorkerDBHash; static HealthCheckHelperControlData *HealthCheckHelperControl = NULL; static shmem_startup_hook_type prev_shmem_startup_hook = NULL; /* private function declarations */ static void pg_auto_failover_monitor_sigterm(SIGNAL_ARGS); static void pg_auto_failover_monitor_sighup(SIGNAL_ARGS); static BackgroundWorkerHandle * RegisterHealthCheckWorker(DatabaseListEntry *db); static List * BuildDatabaseList(void); static bool pgAutoFailoverExtensionExists(void); static List * CreateHealthChecks(List *nodeHealthList); static HealthCheck * CreateHealthCheck(NodeHealth *nodeHealth); static void DoHealthChecks(List *healthCheckList); static void ManageHealthCheck(HealthCheck *healthCheck, struct timeval currentTime); static int WaitForEvent(List *healthCheckList); static int CompareTimes(struct timeval *leftTime, struct timeval *rightTime); static int SubtractTimes(struct timeval base, struct timeval subtract); static struct timeval AddTimeMillis(struct timeval base, uint32 additionalMs); static void LatchWait(long timeoutMs); static size_t HealthCheckWorkerShmemSize(void); static void HealthCheckWorkerShmemInit(void); /* flags set by signal handlers */ static volatile sig_atomic_t got_sighup = false; static volatile sig_atomic_t 
got_sigterm = false; /* GUC variables */ int HealthCheckPeriod = 5 * 1000; int HealthCheckTimeout = 5 * 1000; int HealthCheckMaxRetries = 2; int HealthCheckRetryDelay = 2 * 1000; /* * Signal handler for SIGTERM * Set a flag to let the main loop to terminate, and set our latch to wake * it up. */ static void pg_auto_failover_monitor_sigterm(SIGNAL_ARGS) { int save_errno = errno; got_sigterm = true; SetLatch(MyLatch); errno = save_errno; } /* * Signal handler for SIGHUP * Set a flag to tell the main loop to reread the config file, and set * our latch to wake it up. */ static void pg_auto_failover_monitor_sighup(SIGNAL_ARGS) { int save_errno = errno; got_sighup = true; SetLatch(MyLatch); errno = save_errno; } /* * InitializeHealthCheckWorker, called at server start, is responsible for * requesting shared memory and related infrastructure required by worker * daemons. */ void InitializeHealthCheckWorker(void) { if (!IsUnderPostmaster) { RequestAddinShmemSpace(HealthCheckWorkerShmemSize()); } prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = HealthCheckWorkerShmemInit; } /* * HealthCheckWorkerLauncherMain is the main entry point for the * pg_auto_failover Health Check workers. * * We start a background worker for each database because a single background * worker may only connect to a single database for its whole lifetime. Each * worker checks if the "pgautofailover" extension is installed locally, and * then does the health checks. */ void HealthCheckWorkerLauncherMain(Datum arg) { MemoryContext originalContext = CurrentMemoryContext; /* Establish signal handlers before unblocking signals. */ pqsignal(SIGHUP, pg_auto_failover_monitor_sighup); pqsignal(SIGINT, SIG_IGN); pqsignal(SIGTERM, pg_auto_failover_monitor_sigterm); /* We're now ready to receive signals */ BackgroundWorkerUnblockSignals(); /* * Initialize a connection to shared catalogs only. 
*/ BackgroundWorkerInitializeConnection(NULL, NULL, 0); /* Make background worker recognisable in pg_stat_activity */ pgstat_report_appname("pg_auto_failover monitor launcher"); MemoryContext launcherContext = AllocSetContextCreate(CurrentMemoryContext, "Health Check Launcher Context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); while (!got_sigterm) { List *databaseList; ListCell *databaseListCell; originalContext = MemoryContextSwitchTo(launcherContext); databaseList = BuildDatabaseList(); MemoryContextSwitchTo(originalContext); foreach(databaseListCell, databaseList) { int pid; BackgroundWorkerHandle *handle = NULL; DatabaseListEntry *entry = (DatabaseListEntry *) lfirst(databaseListCell); bool isFound = false; LWLockAcquire(&HealthCheckHelperControl->lock, LW_EXCLUSIVE); HealthCheckHelperDatabase *dbData = hash_search(HealthCheckWorkerDBHash, (void *) &entry->dboid, HASH_ENTER, &isFound); if (isFound) { handle = dbData->handle; LWLockRelease(&HealthCheckHelperControl->lock); /* * This database has already been processed. * * Perform a quick and inexpensive check to verify that it is * actually running. Note that it is not possible to get * BGWH_NOT_YET_STARTED at this point, because this is not first * time we try to register the worker due to the isFound value * above. The HealthCheckWorkerDBHash only maintains verified * started entries. Thus we can only get BGWH_STARTED or * BGWH_STOPPED. */ if (GetBackgroundWorkerPid(handle, &pid) != BGWH_STARTED) { ereport(WARNING, (errmsg( "found stopped worker for pg_auto_failover " "health checks in \"%s\"", entry->dbname))); /* * Now we know that the worker has stopped. We use * StopHealthCheckWorker to remove the entry from the * HealthCheckWorkerDBHash. That will force a retry in the * next scan of the databaselist. 
* * Furthermore, if the status from GetBackgroundWorkerPid * was not the correct one, then StopHealthCheckWorker will * also make certain that the rogue worker will be stopped. * That will leave HealthCheckWorkerDBHash in a consistent * state. */ StopHealthCheckWorker(entry->dboid); } continue; } /* register a worker for the entry database, in the background */ handle = RegisterHealthCheckWorker(entry); if (handle) { /* * Once started, the Health Check process will update its * pid. */ dbData->workerPid = 0; /* * We need to release the lock for the worker to be able to * complete its startup procedure: the per-database worker * takes the control lock in SHARED mode to edit its own PID in * its own entry in HealthCheckWorkerDBHash. */ LWLockRelease(&HealthCheckHelperControl->lock); /* * WaitForBackgroundWorkerStartup will wait for worker to start; * thus, BGWH_NOT_YET_STARTED is never returned. However, if the * postmaster has died, it will give up and return * BGWH_POSTMASTER_DIED. In such a case the process will get * signaled to stop and we will exit further down. For good * measure though, do verify the process did actually start * before marking it as Active. */ if (WaitForBackgroundWorkerStartup(handle, &pid) == BGWH_STARTED) { dbData->handle = handle; ereport(LOG, (errmsg( "started worker for pg_auto_failover " "health checks in \"%s\"", entry->dbname))); continue; } } LWLockRelease(&HealthCheckHelperControl->lock); /* * Similarly to the comment above, we either failed to start * the worker, or we failed to register it. * * NOTE. We use StopHealthCheckWorker to remove the entry * from the HealthCheckWorkerDBHash so that it will be * retried in the next databaselist scan. The call to kill() * the failed worker in StopHealthCheckWorker() will take * place only if a handle was registered. */ ereport(WARNING, (errmsg("failed to %s worker for pg_auto_failover " "health checks in \"%s\"", handle ? 
"start" : "register", entry->dbname))); StopHealthCheckWorker(entry->dboid); } MemoryContextReset(launcherContext); LatchWait(HealthCheckTimeout); if (got_sighup) { got_sighup = false; ProcessConfigFile(PGC_SIGHUP); } } MemoryContextReset(launcherContext); MemoryContextSwitchTo(originalContext); } /* * RegisterHealthCheckWorker registers a background worker in given target * database, and returns the background worker handle so that the caller can * wait until it is started. * * This is necessary because of locking management, we want to release the main * lock from the caller before waiting for the worker's start. */ static BackgroundWorkerHandle * RegisterHealthCheckWorker(DatabaseListEntry *db) { BackgroundWorker worker; BackgroundWorkerHandle *handle; StringInfoData buf; initStringInfo(&buf); memset(&worker, 0, sizeof(worker)); worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; worker.bgw_start_time = BgWorkerStart_RecoveryFinished; worker.bgw_restart_time = BGW_NEVER_RESTART; worker.bgw_main_arg = ObjectIdGetDatum(db->dboid); worker.bgw_notify_pid = MyProcPid; strlcpy(worker.bgw_library_name, "pgautofailover", sizeof(worker.bgw_library_name)); strlcpy(worker.bgw_function_name, "HealthCheckWorkerMain", sizeof(worker.bgw_function_name)); appendStringInfo(&buf, "pg_auto_failover monitor healthcheck worker %s", db->dbname); strlcpy(worker.bgw_name, buf.data, sizeof(worker.bgw_name)); if (!RegisterDynamicBackgroundWorker(&worker, &handle)) { ereport(WARNING, (errmsg( "failed to start worker for pg_auto_failover health checks in \"%s\"", db->dbname), errhint("You might need to increase max_worker_processes."))); return NULL; } return handle; } /* * BuildDatabaseList * Compile a list of all currently available databases in the cluster */ static List * BuildDatabaseList(void) { List *databaseList = NIL; HeapTuple dbTuple; MemoryContext originalContext = CurrentMemoryContext; StartTransactionCommand(); Relation pgDatabaseRelation = 
	heap_open(DatabaseRelationId, AccessShareLock);

	TableScanDesc scan = table_beginscan_catalog(pgDatabaseRelation, 0, NULL);

	while (HeapTupleIsValid(dbTuple = heap_getnext(scan, ForwardScanDirection)))
	{
		MemoryContext oldContext;
		Form_pg_database dbForm = (Form_pg_database) GETSTRUCT(dbTuple);
		DatabaseListEntry *entry;

		/* only consider non-template databases that we can connect to */
		if (!dbForm->datistemplate && dbForm->datallowconn)
		{
			/* allocate entries in the caller's context, not the xact's */
			oldContext = MemoryContextSwitchTo(originalContext);

			entry = (DatabaseListEntry *) palloc(sizeof(DatabaseListEntry));
			entry->dboid = HeapTupleGetOid(dbTuple);
			entry->dbname = pstrdup(NameStr(dbForm->datname));

			databaseList = lappend(databaseList, entry);

			MemoryContextSwitchTo(oldContext);
		}
	}

	heap_endscan(scan);
	heap_close(pgDatabaseRelation, AccessShareLock);

	CommitTransactionCommand();
	MemoryContextSwitchTo(originalContext);

	return databaseList;
}


/*
 * HealthCheckWorkerMain is the main entry-point for the background worker that
 * performs health checks.
 *
 * The worker is registered by HealthCheckWorkerLauncherMain with the target
 * database's OID as argument; it exits quietly when its shared-memory entry
 * is gone (e.g. after a crash-restart cleared shared memory).
 */
void
HealthCheckWorkerMain(Datum arg)
{
	Oid dboid = DatumGetObjectId(arg);
	bool foundPgAutoFailoverExtension = false;

	/*
	 * Look up this worker's configuration.
	 */
	LWLockAcquire(&HealthCheckHelperControl->lock, LW_SHARED);

	HealthCheckHelperDatabase *myDbData = (HealthCheckHelperDatabase *)
		hash_search(HealthCheckWorkerDBHash, (void *) &dboid, HASH_FIND, NULL);

	if (!myDbData)
	{
		/*
		 * When the database crashes, background workers are restarted, but
		 * the state in shared memory is lost. In that case, we exit and
		 * wait for HealthCheckWorkerLauncherMain to restart it.
		 */
		proc_exit(0);
	}

	/* from this point, DROP DATABASE will attempt to kill the worker */
	myDbData->workerPid = MyProcPid;

	/* Establish signal handlers before unblocking signals. */
	pqsignal(SIGHUP, pg_auto_failover_monitor_sighup);
	pqsignal(SIGINT, SIG_IGN);
	pqsignal(SIGTERM, pg_auto_failover_monitor_sigterm);

	/* We're now ready to receive signals */
	BackgroundWorkerUnblockSignals();

	/* we're also done editing our own hash table entry */
	LWLockRelease(&HealthCheckHelperControl->lock);

	/* Connect to our database */
	BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid, 0);

	/* Make background worker recognisable in pg_stat_activity */
	pgstat_report_appname("pg_auto_failover health check worker");

	/*
	 * Only process given database when the extension has been loaded.
	 * Otherwise, happily quit.
	 */
	MemoryContext healthCheckContext =
		AllocSetContextCreate(CurrentMemoryContext,
							  "Health check context",
							  ALLOCSET_DEFAULT_MINSIZE,
							  ALLOCSET_DEFAULT_INITSIZE,
							  ALLOCSET_DEFAULT_MAXSIZE);
	MemoryContextSwitchTo(healthCheckContext);

	/*
	 * Main loop: do this until the SIGTERM handler tells us to terminate
	 */
	while (!got_sigterm)
	{
		struct timeval currentTime = { 0, 0 };
		struct timeval roundEndTime = { 0, 0 };

		gettimeofday(&currentTime, NULL);

		/* pace the loop: one round of checks per HealthCheckPeriod */
		roundEndTime = AddTimeMillis(currentTime, HealthCheckPeriod);

		if (!foundPgAutoFailoverExtension)
		{
			/* keep looking until CREATE EXTENSION has been run here */
			if (pgAutoFailoverExtensionExists())
			{
				foundPgAutoFailoverExtension = true;

				elog(LOG,
					 "pg_auto_failover extension found in database %d, "
					 "starting Health Checks.", dboid);
			}
		}

		if (foundPgAutoFailoverExtension)
		{
			List *nodeHealthList = LoadNodeHealthList();

			if (nodeHealthList != NIL)
			{
				List *healthCheckList = CreateHealthChecks(nodeHealthList);

				DoHealthChecks(healthCheckList);
			}

			/* free everything palloc'd during this round of checks */
			MemoryContextReset(healthCheckContext);
		}

		gettimeofday(&currentTime, NULL);

		/* sleep out the remainder of the round, if any */
		int timeout = SubtractTimes(roundEndTime, currentTime);
		if (timeout >= 0)
		{
			LatchWait(timeout);
		}

		if (got_sighup)
		{
			got_sighup = false;
			ProcessConfigFile(PGC_SIGHUP);
		}
	}

	elog(LOG, "pg_auto_failover monitor exiting for database %d", dboid);

	proc_exit(0);
}


/*
 * pgAutoFailoverExtensionExists returns true when we can find the
 * "pgautofailover" extension in the pg_extension catalogs.
 Caller must have
 * already connected to a database before calling this function.
 */
static bool
pgAutoFailoverExtensionExists(void)
{
	MemoryContext originalContext = CurrentMemoryContext;

	/* get_extension_oid() requires catalog access, hence the transaction */
	StartTransactionCommand();

	/* missing_ok = true: returns InvalidOid instead of raising an error */
	Oid extensionOid = get_extension_oid(AUTO_FAILOVER_EXTENSION_NAME, true);

	CommitTransactionCommand();

	/* CommitTransactionCommand resets the memory context to TopMemoryContext */
	MemoryContextSwitchTo(originalContext);

	return (extensionOid != InvalidOid);
}


/*
 * CreateHealthChecks creates a list of health checks from a list of node health
 * descriptions.
 */
static List *
CreateHealthChecks(List *nodeHealthList)
{
	List *healthCheckList = NIL;
	ListCell *nodeHealthCell = NULL;

	foreach(nodeHealthCell, nodeHealthList)
	{
		NodeHealth *nodeHealth = (NodeHealth *) lfirst(nodeHealthCell);
		HealthCheck *healthCheck = CreateHealthCheck(nodeHealth);
		healthCheckList = lappend(healthCheckList, healthCheck);
	}

	return healthCheckList;
}


/*
 * CreateHealthCheck creates a health check from a health check description.
 *
 * The check starts in HEALTH_CHECK_INITIAL state with no connection; the
 * zeroed nextEventTime is the "no timeout scheduled" sentinel used by
 * WaitForEvent().
 */
static HealthCheck *
CreateHealthCheck(NodeHealth *nodeHealth)
{
	struct timeval invalidTime = { 0, 0 };

	HealthCheck *healthCheck = palloc0(sizeof(HealthCheck));
	healthCheck->node = nodeHealth;
	healthCheck->state = HEALTH_CHECK_INITIAL;
	healthCheck->connection = NULL;
	healthCheck->numTries = 0;
	healthCheck->nextEventTime = invalidTime;

	return healthCheck;
}


/*
 * DoHealthChecks performs the given health checks.
 */
static void
DoHealthChecks(List *healthCheckList)
{
	/* drive every check's state machine until each is either OK or DEAD */
	while (!got_sigterm)
	{
		int pendingCheckCount = 0;
		struct timeval currentTime = { 0, 0 };
		ListCell *healthCheckCell = NULL;

		gettimeofday(&currentTime, NULL);

		foreach(healthCheckCell, healthCheckList)
		{
			HealthCheck *healthCheck = (HealthCheck *) lfirst(healthCheckCell);

			ManageHealthCheck(healthCheck, currentTime);

			if (healthCheck->state != HEALTH_CHECK_OK &&
				healthCheck->state != HEALTH_CHECK_DEAD)
			{
				pendingCheckCount++;
			}
		}

		if (pendingCheckCount == 0)
		{
			/* all checks have reached a terminal state */
			break;
		}

		WaitForEvent(healthCheckList);
	}
}


/*
 * WaitForEvent sleeps until a time-based or I/O event occurs in any of the health
 * checks.
 */
static int
WaitForEvent(List *healthCheckList)
{
	ListCell *healthCheckCell = NULL;
	int healthCheckCount = list_length(healthCheckList);
	struct timeval currentTime = { 0, 0 };
	struct timeval nextEventTime = { 0, 0 };
	int healthCheckIndex = 0;

	/* one pollfd per check; slots left at fd = -1 are ignored by poll() */
	struct pollfd *pollFDs =
		(struct pollfd *) palloc0(healthCheckCount * sizeof(struct pollfd));

	gettimeofday(&currentTime, NULL);

	foreach(healthCheckCell, healthCheckList)
	{
		HealthCheck *healthCheck = (HealthCheck *) lfirst(healthCheckCell);
		struct pollfd *pollFileDescriptor = &pollFDs[healthCheckIndex];

		pollFileDescriptor->fd = -1;
		pollFileDescriptor->events = 0;
		pollFileDescriptor->revents = 0;

		if (healthCheck->state == HEALTH_CHECK_CONNECTING ||
			healthCheck->state == HEALTH_CHECK_RETRY)
		{
			/* tv_sec == 0 is the "no timeout scheduled" sentinel */
			bool hasTimeout = healthCheck->nextEventTime.tv_sec != 0;

			if (hasTimeout &&
				(nextEventTime.tv_sec == 0 ||
				 CompareTimes(&healthCheck->nextEventTime, &nextEventTime) < 0))
			{
				/* track the earliest scheduled event across all checks */
				nextEventTime = healthCheck->nextEventTime;
			}
		}

		if (healthCheck->state == HEALTH_CHECK_CONNECTING)
		{
			PGconn *connection = healthCheck->connection;
			int pollEventMask = 0;

			/* wait for the direction PQconnectPoll last asked for */
			if (healthCheck->pollingStatus == PGRES_POLLING_READING)
			{
				pollEventMask = POLLERR | POLLIN;
			}
			else if (healthCheck->pollingStatus == PGRES_POLLING_WRITING)
			{
				pollEventMask = POLLERR | POLLOUT;
			}

			pollFileDescriptor->fd = PQsocket(connection);
			pollFileDescriptor->events = pollEventMask;
		}

		healthCheckIndex++;
	}

	/* clamp the poll timeout to the [0, HealthCheckRetryDelay] range */
	int pollTimeout = SubtractTimes(nextEventTime, currentTime);
	if (pollTimeout < 0)
	{
		pollTimeout = 0;
	}
	else if (pollTimeout > HealthCheckRetryDelay)
	{
		pollTimeout = HealthCheckRetryDelay;
	}

	int pollResult = poll(pollFDs, healthCheckCount, pollTimeout);
	if (pollResult < 0)
	{
		return STATUS_ERROR;
	}

	healthCheckIndex = 0;
	foreach(healthCheckCell, healthCheckList)
	{
		HealthCheck *healthCheck = (HealthCheck *) lfirst(healthCheckCell);
		struct pollfd *pollFileDescriptor = &pollFDs[healthCheckIndex];

		/* non-zero whenever one of the requested events fired on the socket */
		healthCheck->readyToPoll =
			pollFileDescriptor->revents & pollFileDescriptor->events;

		healthCheckIndex++;
	}

	return 0;
}


/*
 * LatchWait sleeps on the process latch until a timeout occurs.
 */
static void
LatchWait(long timeoutMs)
{
	int waitResult = 0;

	/*
	 * Background workers mustn't call usleep() or any direct equivalent:
	 * instead, they may wait on their process latch, which sleeps as
	 * necessary, but is awakened if postmaster dies. That way the
	 * background process goes away immediately in an emergency.
	 */
#if (PG_VERSION_NUM >= 100000)
	waitResult = WaitLatch(MyLatch,
						   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
						   timeoutMs, WAIT_EVENT_CLIENT_READ);
#else
	waitResult = WaitLatch(MyLatch,
						   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
						   timeoutMs);
#endif

	ResetLatch(MyLatch);

	/* emergency bailout if postmaster has died */
	if (waitResult & WL_POSTMASTER_DEATH)
	{
		elog(LOG, "pg_auto_failover monitor exiting");
		proc_exit(1);
	}
}


/*
 * ManageHealthCheck proceeds the health check state machine.
 */
static void
ManageHealthCheck(HealthCheck *healthCheck, struct timeval currentTime)
{
	HealthCheckState checkState = healthCheck->state;
	NodeHealth *nodeHealth = healthCheck->node;

	switch (checkState)
	{
		case HEALTH_CHECK_RETRY:
		{
			/* out of retries: mark the node unhealthy and stop trying */
			if (healthCheck->numTries >= HealthCheckMaxRetries + 1)
			{
				SetNodeHealthState(healthCheck->node->nodeId,
								   healthCheck->node->nodeName,
								   healthCheck->node->nodeHost,
								   healthCheck->node->nodePort,
								   nodeHealth->healthState,
								   NODE_HEALTH_BAD);

				healthCheck->state = HEALTH_CHECK_DEAD;
				break;
			}

			if (CompareTimes(&healthCheck->nextEventTime, &currentTime) > 0)
			{
				/* Retry time lies in the future */
				break;
			}

			/* Fall through to re-connect */
		}

		/* fallthrough */
		case HEALTH_CHECK_INITIAL:
		{
			StringInfo connInfoString = makeStringInfo();
			appendStringInfo(connInfoString, CONN_INFO_TEMPLATE,
							 nodeHealth->nodeHost, nodeHealth->nodePort,
							 HealthCheckTimeout);

			/* non-blocking connection attempt, polled via WaitForEvent() */
			PGconn *connection = PQconnectStart(connInfoString->data);
			PQsetnonblocking(connection, true);

			ConnStatusType connStatus = PQstatus(connection);
			if (connStatus == CONNECTION_BAD)
			{
				/* could not even start connecting: schedule a retry */
				struct timeval nextTryTime = { 0, 0 };

				PQfinish(connection);

				nextTryTime = AddTimeMillis(currentTime, HealthCheckRetryDelay);

				healthCheck->nextEventTime = nextTryTime;
				healthCheck->connection = NULL;
				healthCheck->pollingStatus = PGRES_POLLING_FAILED;
				healthCheck->state = HEALTH_CHECK_RETRY;
			}
			else
			{
				/* nextEventTime doubles as the connection timeout deadline */
				struct timeval timeoutTime = { 0, 0 };

				timeoutTime = AddTimeMillis(currentTime, HealthCheckTimeout);

				healthCheck->nextEventTime = timeoutTime;
				healthCheck->connection = connection;
				healthCheck->pollingStatus = PGRES_POLLING_WRITING;
				healthCheck->state = HEALTH_CHECK_CONNECTING;
			}

			healthCheck->numTries++;

			pfree(connInfoString->data);
			pfree(connInfoString);

			break;
		}

		case HEALTH_CHECK_CONNECTING:
		{
			PGconn *connection = healthCheck->connection;
			PostgresPollingStatusType pollingStatus = PGRES_POLLING_FAILED;

			/* connection timeout expired: drop it and schedule a retry */
			if (CompareTimes(&healthCheck->nextEventTime, &currentTime) < 0)
			{
				struct timeval nextTryTime = { 0, 0 };

				PQfinish(connection);

				nextTryTime = AddTimeMillis(currentTime, HealthCheckRetryDelay);

				healthCheck->nextEventTime = nextTryTime;
				healthCheck->connection = NULL;
				healthCheck->pollingStatus = pollingStatus;
				healthCheck->state = HEALTH_CHECK_RETRY;
				break;
			}

			/* no event fired on the socket yet: nothing to do this round */
			if (!healthCheck->readyToPoll)
			{
				break;
			}

			/* This logic is taken from libpq's internal_ping (fe-connect.c) */
			pollingStatus = PQconnectPoll(connection);

			/* last_sqlstate is libpq-internal state, hence libpq-int.h */
			char *sqlstate = connection->last_sqlstate;
			bool receivedSqlstate = (sqlstate != NULL && strlen(sqlstate) == 5);
			bool cannotConnectNowSqlstate =
				(receivedSqlstate &&
				 strcmp(sqlstate, CANNOT_CONNECT_NOW) == 0);

			if (pollingStatus == PGRES_POLLING_OK ||

				/* an auth request means pg is running */
				connection->auth_req_received ||

				/* any error but CANNOT_CONNECT means the db is accepting connections */
				(receivedSqlstate && !cannotConnectNowSqlstate))
			{
				PQfinish(connection);

				SetNodeHealthState(healthCheck->node->nodeId,
								   healthCheck->node->nodeName,
								   healthCheck->node->nodeHost,
								   healthCheck->node->nodePort,
								   nodeHealth->healthState,
								   NODE_HEALTH_GOOD);

				healthCheck->connection = NULL;
				healthCheck->numTries = 0;
				healthCheck->state = HEALTH_CHECK_OK;
			}
			else if (pollingStatus == PGRES_POLLING_FAILED)
			{
				struct timeval nextTryTime = { 0, 0 };

				PQfinish(connection);

				nextTryTime = AddTimeMillis(currentTime, HealthCheckRetryDelay);

				healthCheck->nextEventTime = nextTryTime;
				healthCheck->connection = NULL;
				healthCheck->state = HEALTH_CHECK_RETRY;
			}
			else
			{
				/* Health check is still connecting */
			}

			healthCheck->pollingStatus = pollingStatus;

			break;
		}

		case HEALTH_CHECK_DEAD:
		case HEALTH_CHECK_OK:
		default:
		{
			/* Health check is done */
		}
	}
}


/*
 * CompareTimes compares two timeval structs.
* * If leftTime < rightTime, return -1 * If leftTime > rightTime, return 1 * else, return 0 */ static int CompareTimes(struct timeval *leftTime, struct timeval *rightTime) { int compareResult = 0; if (leftTime->tv_sec < rightTime->tv_sec) { compareResult = -1; } else if (leftTime->tv_sec > rightTime->tv_sec) { compareResult = 1; } else if (leftTime->tv_usec < rightTime->tv_usec) { compareResult = -1; } else if (leftTime->tv_usec > rightTime->tv_usec) { compareResult = 1; } else { compareResult = 0; } return compareResult; } /* * SubtractTimes subtract the ‘struct timeval’ values y from x, * returning the result. * * From: * http://www.gnu.org/software/libc/manual/html_node/Elapsed-Time.html */ static int SubtractTimes(struct timeval x, struct timeval y) { int differenceMs = 0; /* Perform the carry for the later subtraction by updating y. */ if (x.tv_usec < y.tv_usec) { int nsec = (y.tv_usec - x.tv_usec) / 1000000 + 1; y.tv_usec -= 1000000 * nsec; y.tv_sec += nsec; } if (x.tv_usec - y.tv_usec > 1000000) { int nsec = (x.tv_usec - y.tv_usec) / 1000000; y.tv_usec += 1000000 * nsec; y.tv_sec -= nsec; } differenceMs += 1000 * (x.tv_sec - y.tv_sec); differenceMs += (x.tv_usec - y.tv_usec) / 1000; return differenceMs; } /* * AddTimeMillis adds additionalMs milliseconds to a timeval. */ static struct timeval AddTimeMillis(struct timeval base, uint32 additionalMs) { struct timeval result = { 0, 0 }; result.tv_sec = base.tv_sec + additionalMs / 1000; result.tv_usec = base.tv_usec + (additionalMs % 1000) * 1000; return result; } /* * HealthCheckWorkerShmemSize computes how much shared memory is required. */ static size_t HealthCheckWorkerShmemSize(void) { Size size = 0; size = add_size(size, sizeof(HealthCheckHelperDatabase)); /* * We request enough shared memory to have one hash-table entry for each * worker process. We couldn't start more anyway, so there's little point * in allocating more. 
 */
	Size hashSize = hash_estimate_size(max_worker_processes,
									   sizeof(HealthCheckHelperDatabase));
	size = add_size(size, hashSize);

	return size;
}


/*
 * HealthCheckWorkerShmemInit initializes the requested shared memory for the
 * maintenance daemon.
 */
static void
HealthCheckWorkerShmemInit(void)
{
	bool alreadyInitialized = false;
	HASHCTL hashInfo;

	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);

	HealthCheckHelperControl = (HealthCheckHelperControlData *)
		ShmemInitStruct("pg_auto_failover Health Check Helper Daemon",
						HealthCheckWorkerShmemSize(),
						&alreadyInitialized);

	/*
	 * Might already be initialized on EXEC_BACKEND type platforms that call
	 * shared library initialization functions in every backend.
	 */
	if (!alreadyInitialized)
	{
		/* register a named tranche so the lock shows up in monitoring */
		HealthCheckHelperControl->trancheId = LWLockNewTrancheId();
		HealthCheckHelperControl->lockTrancheName =
			"pg_auto_failover Health Check Daemon";
		LWLockRegisterTranche(HealthCheckHelperControl->trancheId,
							  HealthCheckHelperControl->lockTrancheName);

		LWLockInitialize(&HealthCheckHelperControl->lock,
						 HealthCheckHelperControl->trancheId);
	}

	/* per-database worker hash, keyed by database OID */
	memset(&hashInfo, 0, sizeof(hashInfo));
	hashInfo.keysize = sizeof(Oid);
	hashInfo.entrysize = sizeof(HealthCheckHelperDatabase);
	hashInfo.hash = tag_hash;
	int hashFlags = (HASH_ELEM | HASH_FUNCTION);

	HealthCheckWorkerDBHash = ShmemInitHash("pg_auto_failover Database Hash",
											max_worker_processes,
											max_worker_processes,
											&hashInfo, hashFlags);

	LWLockRelease(AddinShmemInitLock);

	/* chain to any previously installed shmem startup hook */
	if (prev_shmem_startup_hook != NULL)
	{
		prev_shmem_startup_hook();
	}
}


/*
 * StopHealthCheckWorker stops the maintenance daemon for the given database
 * and removes it from the Health Check Launcher control hash.
*/ void StopHealthCheckWorker(Oid databaseId) { bool found = false; pid_t workerPid = 0; LWLockAcquire(&HealthCheckHelperControl->lock, LW_EXCLUSIVE); HealthCheckHelperDatabase *dbData = (HealthCheckHelperDatabase *) hash_search(HealthCheckWorkerDBHash, &databaseId, HASH_REMOVE, &found); if (found) { workerPid = dbData->workerPid; } LWLockRelease(&HealthCheckHelperControl->lock); if (workerPid > 0) { kill(workerPid, SIGTERM); } } pg_auto_failover-1.6.3/src/monitor/metadata.c000066400000000000000000000163251414244367200212660ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/metadata.c * * Implementation of functions related to pg_auto_failover metadata. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * *------------------------------------------------------------------------- */ #include "postgres.h" #include "miscadmin.h" #include "fmgr.h" #include "metadata.h" #include "version_compat.h" #include "access/genam.h" #include "access/heapam.h" #include "access/htup.h" #include "access/htup_details.h" #include "access/tupdesc.h" #include "access/xact.h" #include "catalog/indexing.h" #include "catalog/namespace.h" #include "catalog/pg_extension.h" #include "catalog/pg_type.h" #include "commands/sequence.h" #include "executor/spi.h" #include "lib/stringinfo.h" #include "nodes/pg_list.h" #include "utils/builtins.h" #include "utils/fmgroids.h" #include "utils/lsyscache.h" #include "utils/hsearch.h" #include "utils/rel.h" #include "utils/relcache.h" bool EnableVersionChecks = true; /* version checks are enabled */ /* * pgAutoFailoverRelationId returns the OID of a given relation in the * pgautofailover schema. 
 */
Oid
pgAutoFailoverRelationId(const char *relname)
{
	Oid namespaceId = pgAutoFailoverSchemaId();

	Oid relationId = get_relname_relid(relname, namespaceId);
	if (relationId == InvalidOid)
	{
		ereport(ERROR, (errmsg("%s does not exist", relname)));
	}

	return relationId;
}


/*
 * pgAutoFailoverSchemaId returns the OID of the schema in which metadata is
 * stored, erroring out when the schema does not exist.
 */
Oid
pgAutoFailoverSchemaId(void)
{
	/* missing_ok = true so that we can raise our own, more helpful, error */
	Oid namespaceId = get_namespace_oid(AUTO_FAILOVER_SCHEMA_NAME, true);
	if (namespaceId == InvalidOid)
	{
		ereport(ERROR, (errmsg("%s schema does not exist",
							   AUTO_FAILOVER_SCHEMA_NAME),
						errhint("Run: CREATE EXTENSION %s",
								AUTO_FAILOVER_EXTENSION_NAME)));
	}

	return namespaceId;
}


/*
 * pgAutoFailoverExtensionOwner gets the owner of the extension and verifies
 * that this is the superuser.
 */
Oid
pgAutoFailoverExtensionOwner(void)
{
	ScanKeyData scanKey[1];
	bool indexOK = true;
	Form_pg_extension extensionForm = NULL;
	Oid extensionOwner = InvalidOid;

	/* scan pg_extension by extension name, using the name index */
	Relation pgExtension = heap_open(ExtensionRelationId, AccessShareLock);

	ScanKeyInit(&scanKey[0], Anum_pg_extension_extname, BTEqualStrategyNumber,
				F_NAMEEQ, CStringGetDatum(AUTO_FAILOVER_EXTENSION_NAME));

	SysScanDesc scanDescriptor = systable_beginscan(pgExtension,
													ExtensionNameIndexId,
													indexOK,
													NULL, 1, scanKey);

	HeapTuple extensionTuple = systable_getnext(scanDescriptor);
	if (HeapTupleIsValid(extensionTuple))
	{
		extensionForm = (Form_pg_extension) GETSTRUCT(extensionTuple);

		if (!superuser_arg(extensionForm->extowner))
		{
			ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							errmsg("extension needs to be owned by superuser")));
		}

		extensionOwner = extensionForm->extowner;
		Assert(OidIsValid(extensionOwner));
	}
	else
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("extension not loaded"),
						errhint("Run: CREATE EXTENSION %s",
								AUTO_FAILOVER_EXTENSION_NAME)));
	}

	systable_endscan(scanDescriptor);
	heap_close(pgExtension, AccessShareLock);

	return extensionOwner;
}


/*
 * LockFormation takes a lock on a formation to
 prevent concurrent
 * membership changes.
 */
void
LockFormation(char *formationId, LOCKMODE lockMode)
{
	LOCKTAG tag;
	const bool sessionLock = false;  /* transaction-scoped lock */
	const bool dontWait = false;     /* block until the lock is granted */

	/* hash the formation name into the advisory-lock key space */
	uint32 formationIdHash = string_hash(formationId, NAMEDATALEN);

	SET_LOCKTAG_ADVISORY(tag, MyDatabaseId, 0, formationIdHash,
						 ADV_LOCKTAG_CLASS_AUTO_FAILOVER_FORMATION);

	(void) LockAcquire(&tag, lockMode, sessionLock, dontWait);
}


/*
 * LockNodeGroup takes a lock on a particular group in a formation to
 * prevent concurrent state changes.
 */
void
LockNodeGroup(char *formationId, int groupId, LOCKMODE lockMode)
{
	LOCKTAG tag;
	const bool sessionLock = false;  /* transaction-scoped lock */
	const bool dontWait = false;     /* block until the lock is granted */

	uint32 formationIdHash = string_hash(formationId, NAMEDATALEN);

	/* key on both the formation hash and the group id */
	SET_LOCKTAG_ADVISORY(tag, MyDatabaseId, formationIdHash, (uint32) groupId,
						 ADV_LOCKTAG_CLASS_AUTO_FAILOVER_NODE_GROUP);

	(void) LockAcquire(&tag, lockMode, sessionLock, dontWait);
}


/*
 * checkPgAutoFailoverVersion checks whether there is a version mismatch
 * between the available version and the loaded version or between the
 * installed version and the loaded version. Returns true if compatible, false
 * otherwise.
 *
 * We need to be careful that the pgautofailover.so that is currently loaded in
 * the Postgres backend is intended to work with the current extension version
 * definition (schema and SQL definitions of C coded functions).
*/ bool checkPgAutoFailoverVersion() { char *installedVersion = NULL; char *availableVersion = NULL; const int argCount = 1; Oid argTypes[] = { TEXTOID }; Datum argValues[] = { CStringGetTextDatum(AUTO_FAILOVER_EXTENSION_NAME) }; MemoryContext callerContext = CurrentMemoryContext; char *selectQuery = "SELECT default_version, installed_version " "FROM pg_catalog.pg_available_extensions WHERE name = $1;"; if (!EnableVersionChecks) { return true; } SPI_connect(); int spiStatus = SPI_execute_with_args(selectQuery, argCount, argTypes, argValues, NULL, false, 1); if (spiStatus != SPI_OK_SELECT) { elog(ERROR, "could not select from pg_catalog.pg_available_extensions"); } if (SPI_processed != 1) { elog(ERROR, "expected a single entry for extension \"%s\"", AUTO_FAILOVER_EXTENSION_NAME); } else { TupleDesc tupleDescriptor = SPI_tuptable->tupdesc; HeapTuple heapTuple = SPI_tuptable->vals[0]; bool defaultIsNull = false, installedIsNull = false; MemoryContext spiContext = MemoryContextSwitchTo(callerContext); Datum defaultVersionDatum = heap_getattr(heapTuple, 1, tupleDescriptor, &defaultIsNull); Datum installedVersionDatum = heap_getattr(heapTuple, 2, tupleDescriptor, &installedIsNull); if (!defaultIsNull) { availableVersion = TextDatumGetCString(defaultVersionDatum); } if (!installedIsNull) { installedVersion = TextDatumGetCString(installedVersionDatum); } MemoryContextSwitchTo(spiContext); } SPI_finish(); if (strcmp(AUTO_FAILOVER_EXTENSION_VERSION, availableVersion) != 0) { ereport(ERROR, (errmsg("loaded \"%s\" library version differs from latest " "available extension version", AUTO_FAILOVER_EXTENSION_NAME), errdetail("Loaded library requires %s, but the latest control " "file specifies %s.", AUTO_FAILOVER_EXTENSION_VERSION, availableVersion), errhint("Restart the database to load the latest version " "of the \"%s\" library.", AUTO_FAILOVER_EXTENSION_NAME))); return false; } if (strcmp(AUTO_FAILOVER_EXTENSION_VERSION, installedVersion) != 0) { ereport(ERROR, 
(errmsg("loaded \"%s\" library version differs from installed " "extension version", AUTO_FAILOVER_EXTENSION_NAME), errdetail("Loaded library requires %s, but the installed " "extension version is %s.", AUTO_FAILOVER_EXTENSION_VERSION, installedVersion), errhint("Run ALTER EXTENSION %s UPDATE and try again.", AUTO_FAILOVER_EXTENSION_NAME))); return false; } return true; } pg_auto_failover-1.6.3/src/monitor/metadata.h000066400000000000000000000031741414244367200212710ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/metadata.h * * Declarations for public functions and types related to pg_auto_failover * metadata. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * *------------------------------------------------------------------------- */ #pragma once #include "storage/lockdefs.h" #define AUTO_FAILOVER_EXTENSION_VERSION "1.6" #define AUTO_FAILOVER_EXTENSION_NAME "pgautofailover" #define AUTO_FAILOVER_SCHEMA_NAME "pgautofailover" #define AUTO_FAILOVER_FORMATION_TABLE "pgautofailover.formation" #define AUTO_FAILOVER_NODE_TABLE "pgautofailover.node" #define AUTO_FAILOVER_EVENT_TABLE "pgautofailover.event" #define REPLICATION_STATE_TYPE_NAME "replication_state" /* * Postgres' advisory locks use 'field4' to discern between different kind of * advisory locks. It only uses values 1 and 2, whereas Citus uses values 4, 5 * 6. We start counting at 10 to avoid conflict. 
*/ typedef enum AutoFailoverHALocktagClass { ADV_LOCKTAG_CLASS_AUTO_FAILOVER_FORMATION = 10, ADV_LOCKTAG_CLASS_AUTO_FAILOVER_NODE_GROUP = 11 } AutoFailoverHALocktagClass; /* GUC variable for version checks, true by default */ extern bool EnableVersionChecks; /* public function declarations */ extern Oid pgAutoFailoverRelationId(const char *relname); extern Oid pgAutoFailoverSchemaId(void); extern Oid pgAutoFailoverExtensionOwner(void); extern void LockFormation(char *formationId, LOCKMODE lockMode); extern void LockNodeGroup(char *formationId, int groupId, LOCKMODE lockMode); extern bool checkPgAutoFailoverVersion(void); pg_auto_failover-1.6.3/src/monitor/node_active_protocol.c000066400000000000000000002172231414244367200237070ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/node_active_protocol.c * * Implementation of the functions used to communicate with PostgreSQL * nodes. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* *------------------------------------------------------------------------- */ #include "postgres.h" #include "fmgr.h" #include "funcapi.h" #include "miscadmin.h" #include "access/xact.h" #include "formation_metadata.h" #include "group_state_machine.h" #include "metadata.h" #include "node_metadata.h" #include "notifications.h" #include "replication_state.h" #include "access/htup_details.h" #include "access/xlogdefs.h" #include "catalog/pg_enum.h" #include "nodes/makefuncs.h" #include "nodes/parsenodes.h" #include "parser/parse_type.h" #include "storage/lockdefs.h" #include "utils/builtins.h" #include "utils/pg_lsn.h" #include "utils/syscache.h" /* private function forward declarations */ static AutoFailoverNodeState * NodeActive(char *formationId, AutoFailoverNodeState *currentNodeState); static void JoinAutoFailoverFormation(AutoFailoverFormation *formation, char *nodeName, char *nodeHost, int nodePort, uint64 sysIdentifier, char *nodeCluster, AutoFailoverNodeState *currentNodeState); static int AssignGroupId(AutoFailoverFormation *formation, char *nodeHost, int nodePort, ReplicationState *initialState); static bool RemoveNode(AutoFailoverNode *currentNode, bool force); /* SQL-callable function declarations */ PG_FUNCTION_INFO_V1(register_node); PG_FUNCTION_INFO_V1(node_active); PG_FUNCTION_INFO_V1(update_node_metadata); PG_FUNCTION_INFO_V1(get_nodes); PG_FUNCTION_INFO_V1(get_primary); PG_FUNCTION_INFO_V1(get_other_node); PG_FUNCTION_INFO_V1(get_other_nodes); PG_FUNCTION_INFO_V1(remove_node); PG_FUNCTION_INFO_V1(remove_node_by_nodeid); PG_FUNCTION_INFO_V1(remove_node_by_host); PG_FUNCTION_INFO_V1(perform_failover); PG_FUNCTION_INFO_V1(perform_promotion); PG_FUNCTION_INFO_V1(start_maintenance); PG_FUNCTION_INFO_V1(stop_maintenance); PG_FUNCTION_INFO_V1(set_node_candidate_priority); PG_FUNCTION_INFO_V1(set_node_replication_quorum); PG_FUNCTION_INFO_V1(synchronous_standby_names); /* * register_node adds a node to a given formation * * At register time the monitor 
connects to the node to check that nodehost and * nodeport are valid, and it does a SELECT pg_is_in_recovery() to help decide * what initial role to attribute the entering node. */ Datum register_node(PG_FUNCTION_ARGS) { checkPgAutoFailoverVersion(); text *formationIdText = PG_GETARG_TEXT_P(0); char *formationId = text_to_cstring(formationIdText); text *nodeHostText = PG_GETARG_TEXT_P(1); char *nodeHost = text_to_cstring(nodeHostText); int32 nodePort = PG_GETARG_INT32(2); Name dbnameName = PG_GETARG_NAME(3); const char *expectedDBName = NameStr(*dbnameName); text *nodeNameText = PG_GETARG_TEXT_P(4); char *nodeName = text_to_cstring(nodeNameText); uint64 sysIdentifier = PG_GETARG_INT64(5); int64 currentNodeId = PG_GETARG_INT64(6); int32 currentGroupId = PG_GETARG_INT32(7); Oid currentReplicationStateOid = PG_GETARG_OID(8); text *nodeKindText = PG_GETARG_TEXT_P(9); char *nodeKind = text_to_cstring(nodeKindText); FormationKind expectedFormationKind = FormationKindFromNodeKindString(nodeKind); int candidatePriority = PG_GETARG_INT32(10); bool replicationQuorum = PG_GETARG_BOOL(11); text *nodeClusterText = PG_GETARG_TEXT_P(12); char *nodeCluster = text_to_cstring(nodeClusterText); AutoFailoverNodeState currentNodeState = { 0 }; currentNodeState.nodeId = currentNodeId; currentNodeState.groupId = currentGroupId; currentNodeState.replicationState = EnumGetReplicationState(currentReplicationStateOid); currentNodeState.reportedLSN = 0; currentNodeState.candidatePriority = candidatePriority; currentNodeState.replicationQuorum = replicationQuorum; LockFormation(formationId, ExclusiveLock); AutoFailoverFormation *formation = GetFormation(formationId); /* * The default formationId is "default" and of kind FORMATION_KIND_PGSQL. * It might get used to manage a formation though. Check about that here, * and when the first node registered is a Citus node, update the target * formation to be of kind Citus, actually. 
*/ if (formation == NULL) { ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("formation \"%s\" does not exist", formationId), errhint("Use `pg_autoctl create formation` " "to create the target formation first"))); } if (formation->kind != expectedFormationKind) { List *allNodes = AllAutoFailoverNodes(formationId); if (list_length(allNodes) == 0) { /* first node in the list, let's switch to citus */ SetFormationKind(formationId, expectedFormationKind); formation->kind = expectedFormationKind; } else { ereport(ERROR, (errmsg("node %s:%d of kind \"%s\" can not be registered in " "formation \"%s\" of kind \"%s\"", nodeHost, nodePort, nodeKind, formationId, FormationKindToString(formation->kind)))); } } if (strncmp(formation->dbname, expectedDBName, NAMEDATALEN) != 0) { List *allNodes = AllAutoFailoverNodes(formationId); if (list_length(allNodes) == 0) { /* first node in the list, rename database and update formation */ SetFormationDBName(formationId, expectedDBName); strlcpy(formation->dbname, expectedDBName, NAMEDATALEN); } else { ereport(ERROR, (errmsg("node %s:%d with dbname \"%s\" can not be " "registered in formation \"%s\" " "which expects dbname \"%s\"", nodeHost, nodePort, expectedDBName, formationId, formation->dbname))); } } /* * The register_node() function is STRICT but users may have skipped the * --name option on the create command line. We still want to avoid having * to scan all the 10 parameters for ISNULL tests, so instead our client * sends an empty string for the nodename. */ JoinAutoFailoverFormation(formation, strcmp(nodeName, "") == 0 ? 
NULL : nodeName, nodeHost, nodePort, sysIdentifier, nodeCluster, ¤tNodeState); LockNodeGroup(formationId, currentNodeState.groupId, ExclusiveLock); AutoFailoverNode *pgAutoFailoverNode = GetAutoFailoverNode(nodeHost, nodePort); if (pgAutoFailoverNode == NULL) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("node %s:%d with dbname \"%s\" could not be registered in " "formation \"%s\", could not get information for node that was inserted", nodeHost, nodePort, expectedDBName, formationId))); } else { char message[BUFSIZE] = { 0 }; LogAndNotifyMessage( message, BUFSIZE, "Registering " NODE_FORMAT " to formation \"%s\" " "with replication quorum %s and candidate priority %d [%d]", NODE_FORMAT_ARGS(pgAutoFailoverNode), pgAutoFailoverNode->formationId, pgAutoFailoverNode->replicationQuorum ? "true" : "false", pgAutoFailoverNode->candidatePriority, currentNodeState.candidatePriority); } /* * When adding a second sync node to a formation that has * number_sync_standbys set to zero (the default value for single node and * single standby formations), we switch the default value to 1 * automatically. */ if (pgAutoFailoverNode->goalState == REPLICATION_STATE_WAIT_STANDBY && formation->number_sync_standbys == 0) { AutoFailoverNode *primaryNode = GetPrimaryNodeInGroup(formationId, currentNodeState.groupId); List *standbyNodesList = AutoFailoverOtherNodesList(primaryNode); int syncStandbyNodeCount = CountSyncStandbys(standbyNodesList); /* * number_sync_standbys = 0 is a special case in our FSM, because we * have special handling of a missing standby then, switching to * wait_primary to disable synchronous replication when the standby is * not available. * * For other values (N) of number_sync_standbys, we require N+1 known * sync standby nodes, so that you can lose a standby at any point in * time and still accept writes. * * The default value for number_sync_standbys with two standby nodes is * 1. 
Because it was set to zero when adding the first standby, we need * to increment the value when adding a second standby node that * participates in the replication quorum (a "sync standby" node). */ if (syncStandbyNodeCount == 2) { char message[BUFSIZE] = { 0 }; formation->number_sync_standbys = 1; if (!SetFormationNumberSyncStandbys(formationId, 1)) { ereport(ERROR, (errmsg("couldn't set the formation \"%s\" " "number_sync_standbys to 1 now that a third " "node has been added", formationId))); } LogAndNotifyMessage( message, BUFSIZE, "Setting number_sync_standbys to %d for formation %s " "now that we have %d/%d standby nodes set with replication-quorum.", formation->number_sync_standbys, formation->formationId, syncStandbyNodeCount, list_length(standbyNodesList)); } } AutoFailoverNodeState *assignedNodeState = (AutoFailoverNodeState *) palloc0(sizeof(AutoFailoverNodeState)); assignedNodeState->nodeId = pgAutoFailoverNode->nodeId; assignedNodeState->groupId = pgAutoFailoverNode->groupId; assignedNodeState->replicationState = pgAutoFailoverNode->goalState; assignedNodeState->candidatePriority = pgAutoFailoverNode->candidatePriority; assignedNodeState->replicationQuorum = pgAutoFailoverNode->replicationQuorum; /* * Check that the state selected by the monitor matches the state required * by the keeper, if any. REPLICATION_STATE_INITIAL means the monitor can * pick whatever is needed now, depending on the groupId. * * The keeper might be confronted to an already existing Postgres instance * that is running as a primary (not in recovery), and so asking to * register as a SINGLE. Better error out than ask the keeper to remove * some unknown data. 
*/ if (currentNodeState.replicationState != REPLICATION_STATE_INITIAL) { if (currentNodeState.replicationState != pgAutoFailoverNode->goalState) { const char *currentState = ReplicationStateGetName(currentNodeState.replicationState); const char *goalState = ReplicationStateGetName(pgAutoFailoverNode->goalState); ereport(ERROR, (errmsg("node %s:%d can not be registered in state %s, " "it should be in state %s", nodeHost, nodePort, currentState, goalState))); } } ProceedGroupState(pgAutoFailoverNode); TupleDesc resultDescriptor = NULL; Datum values[6]; bool isNulls[6]; memset(values, 0, sizeof(values)); memset(isNulls, false, sizeof(isNulls)); values[0] = Int64GetDatum(assignedNodeState->nodeId); values[1] = Int32GetDatum(assignedNodeState->groupId); values[2] = ObjectIdGetDatum( ReplicationStateGetEnum(pgAutoFailoverNode->goalState)); values[3] = Int32GetDatum(assignedNodeState->candidatePriority); values[4] = BoolGetDatum(assignedNodeState->replicationQuorum); values[5] = CStringGetTextDatum(pgAutoFailoverNode->nodeName); TypeFuncClass resultTypeClass = get_call_result_type(fcinfo, NULL, &resultDescriptor); if (resultTypeClass != TYPEFUNC_COMPOSITE) { ereport(ERROR, (errmsg("return type must be a row type"))); } HeapTuple resultTuple = heap_form_tuple(resultDescriptor, values, isNulls); Datum resultDatum = HeapTupleGetDatum(resultTuple); PG_RETURN_DATUM(resultDatum); } /* * node_active is the main entry-point for the HA state machine. Nodes * periodically call this function from the moment they start to communicate * their state to the monitor to obtain their assigned state. 
*/ Datum node_active(PG_FUNCTION_ARGS) { checkPgAutoFailoverVersion(); text *formationIdText = PG_GETARG_TEXT_P(0); char *formationId = text_to_cstring(formationIdText); int64 currentNodeId = PG_GETARG_INT64(1); int32 currentGroupId = PG_GETARG_INT32(2); Oid currentReplicationStateOid = PG_GETARG_OID(3); bool currentPgIsRunning = PG_GETARG_BOOL(4); int32 currentTLI = PG_GETARG_INT32(5); XLogRecPtr currentLSN = PG_GETARG_LSN(6); text *currentPgsrSyncStateText = PG_GETARG_TEXT_P(7); char *currentPgsrSyncState = text_to_cstring(currentPgsrSyncStateText); AutoFailoverNodeState currentNodeState = { 0 }; currentNodeState.nodeId = currentNodeId; currentNodeState.groupId = currentGroupId; currentNodeState.replicationState = EnumGetReplicationState(currentReplicationStateOid); currentNodeState.reportedTLI = currentTLI; currentNodeState.reportedLSN = currentLSN; currentNodeState.pgsrSyncState = SyncStateFromString(currentPgsrSyncState); currentNodeState.pgIsRunning = currentPgIsRunning; AutoFailoverNodeState *assignedNodeState = NodeActive(formationId, ¤tNodeState); Oid newReplicationStateOid = ReplicationStateGetEnum(assignedNodeState->replicationState); TupleDesc resultDescriptor = NULL; Datum values[5]; bool isNulls[5]; memset(values, 0, sizeof(values)); memset(isNulls, false, sizeof(isNulls)); values[0] = Int64GetDatum(assignedNodeState->nodeId); values[1] = Int32GetDatum(assignedNodeState->groupId); values[2] = ObjectIdGetDatum(newReplicationStateOid); values[3] = Int32GetDatum(assignedNodeState->candidatePriority); values[4] = BoolGetDatum(assignedNodeState->replicationQuorum); TypeFuncClass resultTypeClass = get_call_result_type(fcinfo, NULL, &resultDescriptor); if (resultTypeClass != TYPEFUNC_COMPOSITE) { ereport(ERROR, (errmsg("return type must be a row type"))); } HeapTuple resultTuple = heap_form_tuple(resultDescriptor, values, isNulls); Datum resultDatum = HeapTupleGetDatum(resultTuple); PG_RETURN_DATUM(resultDatum); } /* * NodeActive reports the current state of 
a node and returns the assigned state. */ static AutoFailoverNodeState * NodeActive(char *formationId, AutoFailoverNodeState *currentNodeState) { AutoFailoverNode *pgAutoFailoverNode = GetAutoFailoverNodeById( currentNodeState->nodeId); if (pgAutoFailoverNode == NULL) { ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("couldn't find node with nodeid %lld", (long long) currentNodeState->nodeId))); } else if (strcmp(pgAutoFailoverNode->formationId, formationId) != 0) { ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("node %lld does not belong to formation %s", (long long) currentNodeState->nodeId, formationId))); } else { LockFormation(formationId, ShareLock); if (pgAutoFailoverNode->reportedState != currentNodeState->replicationState) { /* * The keeper is reporting that it achieved the assigned goal * state, supposedly. Log the new reported state as an event, and * notify it. */ char message[BUFSIZE] = { 0 }; if (pgAutoFailoverNode->goalState == REPLICATION_STATE_REPORT_LSN) { LogAndNotifyMessage( message, BUFSIZE, "New state is reported by " NODE_FORMAT " with LSN %X/%X: %s", NODE_FORMAT_ARGS(pgAutoFailoverNode), (uint32) (pgAutoFailoverNode->reportedLSN >> 32), (uint32) pgAutoFailoverNode->reportedLSN, ReplicationStateGetName(currentNodeState->replicationState)); } else { LogAndNotifyMessage( message, BUFSIZE, "New state is reported by " NODE_FORMAT ": \"%s\"", NODE_FORMAT_ARGS(pgAutoFailoverNode), ReplicationStateGetName(currentNodeState->replicationState)); } pgAutoFailoverNode->reportedState = currentNodeState->replicationState; pgAutoFailoverNode->pgsrSyncState = currentNodeState->pgsrSyncState; pgAutoFailoverNode->reportedLSN = currentNodeState->reportedLSN; NotifyStateChange(pgAutoFailoverNode, message); } /* * Report the current state. The state might not have changed, but in * that case we still update the last report time. 
*/ ReportAutoFailoverNodeState(pgAutoFailoverNode->nodeHost, pgAutoFailoverNode->nodePort, currentNodeState->replicationState, currentNodeState->pgIsRunning, currentNodeState->pgsrSyncState, currentNodeState->reportedTLI, currentNodeState->reportedLSN); } LockNodeGroup(formationId, currentNodeState->groupId, ExclusiveLock); ProceedGroupState(pgAutoFailoverNode); AutoFailoverNodeState *assignedNodeState = (AutoFailoverNodeState *) palloc0(sizeof(AutoFailoverNodeState)); assignedNodeState->nodeId = pgAutoFailoverNode->nodeId; assignedNodeState->groupId = pgAutoFailoverNode->groupId; assignedNodeState->replicationState = pgAutoFailoverNode->goalState; assignedNodeState->candidatePriority = pgAutoFailoverNode->candidatePriority; assignedNodeState->replicationQuorum = pgAutoFailoverNode->replicationQuorum; return assignedNodeState; } /* * JoinAutoFailoverFormation adds a new node to a AutoFailover formation. */ static void JoinAutoFailoverFormation(AutoFailoverFormation *formation, char *nodeName, char *nodeHost, int nodePort, uint64 sysIdentifier, char *nodeCluster, AutoFailoverNodeState *currentNodeState) { int groupId = -1; ReplicationState initialState = REPLICATION_STATE_UNKNOWN; /* in a Postgres formation, we have a single groupId, and it's groupId 0 */ if (formation->kind == FORMATION_KIND_PGSQL) { /* * Register with groupId -1 to get one assigned by the monitor, or with * the groupId you know you want to join. In a Postgres (pgsql) * formation it's all down to groupId 0 anyway. 
*/ if (currentNodeState->groupId > 0) { ereport(ERROR, (errmsg("node %s:%d can not be registered in group %d " "in formation \"%s\" of type pgsql", nodeHost, nodePort, currentNodeState->groupId, formation->formationId), errdetail("in a pgsql formation, there can be only one " "group, with groupId 0"))); } groupId = currentNodeState->groupId = 0; } /* a group number was asked for in the registration call */ if (currentNodeState->groupId >= 0) { /* the node prefers a particular group */ groupId = currentNodeState->groupId; List *groupNodeList = AutoFailoverNodeGroup(formation->formationId, groupId); /* * Target group is empty: to make it simple to reason about the roles * in a group, we only ever accept a primary node first. Then, any * other node in the same group should be a standby. That's easy. */ if (list_length(groupNodeList) == 0) { initialState = REPLICATION_STATE_SINGLE; } /* target group already has a primary, any other node is a standby */ else if (formation->opt_secondary) { initialState = REPLICATION_STATE_WAIT_STANDBY; /* if we have a primary node, pg_basebackup from it */ AutoFailoverNode *primaryNode = GetPrimaryNodeInGroup( formation->formationId, currentNodeState->groupId); /* we might be in the middle of a failover */ List *nodesGroupList = AutoFailoverNodeGroup( formation->formationId, currentNodeState->groupId); /* if we don't have a primary, look for a node being promoted */ AutoFailoverNode *nodeBeingPromoted = NULL; /* we might have an upstream node that's not a failover candidate */ bool foundUpstreamNode = false; if (primaryNode == NULL) { nodeBeingPromoted = FindCandidateNodeBeingPromoted( nodesGroupList); } /* * If we don't have a primary node and we also don't have a node * being promoted, it might be that all we have is a list of * nodes with candidatePriority zero. * * When that happens, those nodes are assigned REPORT_LSN, in case * a candidate could be promoted (and maybe fast-forwarded). 
* * If we find even a single node in REPORT_LSN and with candidate * priority zero, we have an upstream node for creating a new * node, that can then be promoted as the new primary. */ if (primaryNode == NULL && nodeBeingPromoted == NULL) { ListCell *nodeCell = NULL; foreach(nodeCell, nodesGroupList) { AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell); if (node->candidatePriority == 0 && IsCurrentState(node, REPLICATION_STATE_REPORT_LSN)) { foundUpstreamNode = true; break; } } if (foundUpstreamNode) { initialState = REPLICATION_STATE_REPORT_LSN; } } /* * If we can't figure it out, have the client handle the situation. */ if (!(primaryNode || nodeBeingPromoted || foundUpstreamNode)) { ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), errmsg("JoinAutoFailoverFormation couldn't find the " " primary node in formation \"%s\", group %d", formation->formationId, currentNodeState->groupId), errhint("Retry registering in a moment"))); } } /* formation->opt_secondary is false */ else { ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("Formation \"%s\" does not allow secondary nodes", formation->formationId), errhint("use pg_autoctl enable secondary"))); } } else { /* * In a Citus formation, the register policy is to build a set of * workers with each a primary and a secondary, including the * coordinator. * * That's the policy implemented in AssignGroupId. */ groupId = AssignGroupId(formation, nodeHost, nodePort, &initialState); } AddAutoFailoverNode(formation->formationId, formation->kind, currentNodeState->nodeId, groupId, nodeName, nodeHost, nodePort, sysIdentifier, initialState, currentNodeState->replicationState, currentNodeState->candidatePriority, currentNodeState->replicationQuorum, nodeCluster); currentNodeState->groupId = groupId; } /* * AssignGroupId assigns a group ID to a new node and returns it. 
*/ static int AssignGroupId(AutoFailoverFormation *formation, char *nodeHost, int nodePort, ReplicationState *initialState) { int groupId = -1; int candidateGroupId = /* * a Citus formation's coordinator always asks for groupId 0, and the * workers are not allowed to ask for groupId 0. So here, when the * formation is a citus formation, then candidateGroupId begins at 1. */ formation->kind == FORMATION_KIND_CITUS ? 1 : 0; do { List *groupNodeList = AutoFailoverNodeGroup(formation->formationId, candidateGroupId); if (list_length(groupNodeList) == 0) { groupId = candidateGroupId; *initialState = REPLICATION_STATE_SINGLE; } else if (formation->opt_secondary && list_length(groupNodeList) == 1) { groupId = candidateGroupId; *initialState = REPLICATION_STATE_WAIT_STANDBY; } else { candidateGroupId++; } } while (groupId == -1); return groupId; } /* * get_primary returns the node in a group which currently takes writes. */ Datum get_primary(PG_FUNCTION_ARGS) { checkPgAutoFailoverVersion(); text *formationIdText = PG_GETARG_TEXT_P(0); char *formationId = text_to_cstring(formationIdText); int32 groupId = PG_GETARG_INT32(1); TupleDesc resultDescriptor = NULL; Datum values[4]; bool isNulls[4]; AutoFailoverNode *primaryNode = GetPrimaryOrDemotedNodeInGroup(formationId, groupId); if (primaryNode == NULL) { ereport(ERROR, (errmsg("group has no writable node right now"))); } memset(values, 0, sizeof(values)); memset(isNulls, false, sizeof(isNulls)); values[0] = Int64GetDatum(primaryNode->nodeId); values[1] = CStringGetTextDatum(primaryNode->nodeName); values[2] = CStringGetTextDatum(primaryNode->nodeHost); values[3] = Int32GetDatum(primaryNode->nodePort); TypeFuncClass resultTypeClass = get_call_result_type(fcinfo, NULL, &resultDescriptor); if (resultTypeClass != TYPEFUNC_COMPOSITE) { ereport(ERROR, (errmsg("return type must be a row type"))); } HeapTuple resultTuple = heap_form_tuple(resultDescriptor, values, isNulls); Datum resultDatum = HeapTupleGetDatum(resultTuple); 
PG_RETURN_DATUM(resultDatum); } typedef struct get_nodes_fctx { List *nodesList; } get_nodes_fctx; /* * get_nodes returns all the node in a group, if any. */ Datum get_nodes(PG_FUNCTION_ARGS) { checkPgAutoFailoverVersion(); FuncCallContext *funcctx; get_nodes_fctx *fctx; MemoryContext oldcontext; /* stuff done only on the first call of the function */ if (SRF_IS_FIRSTCALL()) { text *formationIdText = PG_GETARG_TEXT_P(0); char *formationId = text_to_cstring(formationIdText); if (PG_ARGISNULL(0)) { ereport(ERROR, (errmsg("formation_id must not be null"))); } checkPgAutoFailoverVersion(); /* create a function context for cross-call persistence */ funcctx = SRF_FIRSTCALL_INIT(); /* * switch to memory context appropriate for multiple function calls */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* allocate memory for user context */ fctx = (get_nodes_fctx *) palloc(sizeof(get_nodes_fctx)); /* * Use fctx to keep state from call to call. Seed current with the * original start value */ if (PG_ARGISNULL(1)) { fctx->nodesList = AllAutoFailoverNodes(formationId); } else { int32 groupId = PG_GETARG_INT32(1); fctx->nodesList = AutoFailoverAllNodesInGroup(formationId, groupId); } funcctx->user_fctx = fctx; MemoryContextSwitchTo(oldcontext); } /* stuff done on every call of the function */ funcctx = SRF_PERCALL_SETUP(); /* * get the saved state and use current as the result for this iteration */ fctx = funcctx->user_fctx; if (fctx->nodesList != NIL) { TupleDesc resultDescriptor = NULL; Datum values[6]; bool isNulls[6]; AutoFailoverNode *node = (AutoFailoverNode *) linitial(fctx->nodesList); memset(values, 0, sizeof(values)); memset(isNulls, false, sizeof(isNulls)); values[0] = Int64GetDatum(node->nodeId); values[1] = CStringGetTextDatum(node->nodeName); values[2] = CStringGetTextDatum(node->nodeHost); values[3] = Int32GetDatum(node->nodePort); values[4] = LSNGetDatum(node->reportedLSN); values[5] = BoolGetDatum(CanTakeWritesInState(node->reportedState)); 
TypeFuncClass resultTypeClass = get_call_result_type(fcinfo, NULL, &resultDescriptor); if (resultTypeClass != TYPEFUNC_COMPOSITE) { ereport(ERROR, (errmsg("return type must be a row type"))); } HeapTuple resultTuple = heap_form_tuple(resultDescriptor, values, isNulls); Datum resultDatum = HeapTupleGetDatum(resultTuple); /* prepare next SRF call */ fctx->nodesList = list_delete_first(fctx->nodesList); SRF_RETURN_NEXT(funcctx, PointerGetDatum(resultDatum)); } SRF_RETURN_DONE(funcctx); } /* * get_other_node is not supported anymore, but we might want to be able to * have the pgautofailover.so for 1.1 co-exists with the SQL definitions for * 1.0 at least during an upgrade, or to test upgrades. */ Datum get_other_node(PG_FUNCTION_ARGS) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("pgautofailover.get_other_node is no longer supported"))); } /* * get_other_nodes returns the other node in a group, if any. */ Datum get_other_nodes(PG_FUNCTION_ARGS) { checkPgAutoFailoverVersion(); FuncCallContext *funcctx; get_nodes_fctx *fctx; MemoryContext oldcontext; /* stuff done only on the first call of the function */ if (SRF_IS_FIRSTCALL()) { int64 nodeId = PG_GETARG_INT64(0); checkPgAutoFailoverVersion(); /* create a function context for cross-call persistence */ funcctx = SRF_FIRSTCALL_INIT(); /* * switch to memory context appropriate for multiple function calls */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* allocate memory for user context */ fctx = (get_nodes_fctx *) palloc(sizeof(get_nodes_fctx)); /* * Use fctx to keep state from call to call. 
Seed current with the * original start value */ AutoFailoverNode *activeNode = GetAutoFailoverNodeById(nodeId); if (activeNode == NULL) { ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("node %lld is not registered", (long long) nodeId))); } if (PG_NARGS() == 1) { fctx->nodesList = AutoFailoverOtherNodesList(activeNode); } else if (PG_NARGS() == 2) { Oid currentReplicationStateOid = PG_GETARG_OID(1); ReplicationState currentState = EnumGetReplicationState(currentReplicationStateOid); fctx->nodesList = AutoFailoverOtherNodesListInState(activeNode, currentState); } else { /* that's a bug in the SQL exposure of that function */ ereport(ERROR, (errmsg("unsupported number of arguments (%d)", PG_NARGS()))); } funcctx->user_fctx = fctx; MemoryContextSwitchTo(oldcontext); } /* stuff done on every call of the function */ funcctx = SRF_PERCALL_SETUP(); /* * get the saved state and use current as the result for this iteration */ fctx = funcctx->user_fctx; if (fctx->nodesList != NIL) { TupleDesc resultDescriptor = NULL; Datum values[6]; bool isNulls[6]; AutoFailoverNode *node = (AutoFailoverNode *) linitial(fctx->nodesList); memset(values, 0, sizeof(values)); memset(isNulls, false, sizeof(isNulls)); values[0] = Int64GetDatum(node->nodeId); values[1] = CStringGetTextDatum(node->nodeName); values[2] = CStringGetTextDatum(node->nodeHost); values[3] = Int32GetDatum(node->nodePort); values[4] = LSNGetDatum(node->reportedLSN); values[5] = BoolGetDatum(CanTakeWritesInState(node->reportedState)); TypeFuncClass resultTypeClass = get_call_result_type(fcinfo, NULL, &resultDescriptor); if (resultTypeClass != TYPEFUNC_COMPOSITE) { ereport(ERROR, (errmsg("return type must be a row type"))); } HeapTuple resultTuple = heap_form_tuple(resultDescriptor, values, isNulls); Datum resultDatum = HeapTupleGetDatum(resultTuple); /* prepare next SRF call */ fctx->nodesList = list_delete_first(fctx->nodesList); SRF_RETURN_NEXT(funcctx, PointerGetDatum(resultDatum)); } SRF_RETURN_DONE(funcctx); 
}


/*
 * remove_node is not supported anymore, but we might want to be able to have
 * the pgautofailover.so for 1.1 co-exists with the SQL definitions for 1.0 at
 * least during an upgrade, or to test upgrades.
 */
Datum
remove_node(PG_FUNCTION_ARGS)
{
	/* always error out: this entry point only remains for upgrade paths */
	ereport(ERROR,
			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
			 errmsg("pgautofailover.remove_node is no longer supported")));
}


/*
 * remove_node_by_nodeid removes the node found by its node id from the
 * monitor. The heavy lifting is delegated to RemoveNode().
 */
Datum
remove_node_by_nodeid(PG_FUNCTION_ARGS)
{
	checkPgAutoFailoverVersion();

	int64 nodeId = PG_GETARG_INT64(0);
	bool force = PG_GETARG_BOOL(1);

	AutoFailoverNode *currentNode = GetAutoFailoverNodeById(nodeId);

	if (currentNode == NULL)
	{
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_OBJECT),
				 errmsg("couldn't find node with nodeid %lld",
						(long long) nodeId)));
	}

	PG_RETURN_BOOL(RemoveNode(currentNode, force));
}


/*
 * remove_node_by_host removes the node found by its hostname and port from
 * the monitor. The heavy lifting is delegated to RemoveNode().
 */
Datum
remove_node_by_host(PG_FUNCTION_ARGS)
{
	checkPgAutoFailoverVersion();

	text *nodeHostText = PG_GETARG_TEXT_P(0);
	char *nodeHost = text_to_cstring(nodeHostText);
	int32 nodePort = PG_GETARG_INT32(1);
	bool force = PG_GETARG_BOOL(2);

	AutoFailoverNode *currentNode = GetAutoFailoverNode(nodeHost, nodePort);

	if (currentNode == NULL)
	{
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_OBJECT),
				 errmsg("couldn't find node with "
						"hostname \"%s\" and port %d",
						nodeHost, nodePort)));
	}

	PG_RETURN_BOOL(RemoveNode(currentNode, force));
}


/* RemoveNode removes the given node from the monitor.
 */
static bool
RemoveNode(AutoFailoverNode *currentNode, bool force)
{
	ListCell *nodeCell = NULL;
	char message[BUFSIZE] = { 0 };

	if (currentNode == NULL)
	{
		return false;
	}

	/* serialize concurrent node removals in the same formation */
	LockFormation(currentNode->formationId, ExclusiveLock);

	AutoFailoverFormation *formation = GetFormation(currentNode->formationId);

	/* when removing the primary, initiate a failover */
	bool currentNodeIsPrimary = CanTakeWritesInState(currentNode->goalState);

	/* get the list of the other nodes */
	List *otherNodesGroupList = AutoFailoverOtherNodesList(currentNode);

	/* and the first other node to trigger our first FSM transition */
	AutoFailoverNode *firstStandbyNode =
		otherNodesGroupList == NIL ? NULL : linitial(otherNodesGroupList);

	/*
	 * To remove a node is a 2-step process.
	 *
	 * 1. pgautofailover.remove_node() sets the goal state to DROPPED
	 * 2. pgautofailover.node_active() reports that the goal state is reached
	 *
	 * From the client side though, if a crash happens after having called the
	 * node_active() function but before having stored the state, it might be
	 * useful to call pgautofailover.remove_node() again.
	 *
	 * When pgautofailover.remove_node() is called on a node that has already
	 * reached the DROPPED state, we proceed to remove it.
	 */
	if (IsCurrentState(currentNode, REPLICATION_STATE_DROPPED) || force)
	{
		/* time to actually remove the current node */
		RemoveAutoFailoverNode(currentNode);

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Removing " NODE_FORMAT
			" from formation \"%s\" and group %d",
			NODE_FORMAT_ARGS(currentNode),
			currentNode->formationId,
			currentNode->groupId);

		return true;
	}

	/* if the removal is already in progress, politely ignore the request */
	if (currentNode->goalState == REPLICATION_STATE_DROPPED)
	{
		return true;
	}

	/* review the FSM for every other node, when removing the primary */
	if (currentNodeIsPrimary)
	{
		foreach(nodeCell, otherNodesGroupList)
		{
			char message[BUFSIZE] = { 0 };
			AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell);

			if (node == NULL)
			{
				/* shouldn't happen */
				ereport(ERROR, (errmsg("BUG: node is NULL")));

				/* ereport(ERROR) does not return; this appeases analyzers */
				continue;
			}

			/* skip nodes that are currently in maintenance */
			if (IsInMaintenance(node))
			{
				continue;
			}

			LogAndNotifyMessage(
				message, BUFSIZE,
				"Setting goal state of " NODE_FORMAT
				" to report_lsn after primary node removal.",
				NODE_FORMAT_ARGS(node));

			SetNodeGoalState(node, REPLICATION_STATE_REPORT_LSN, message);
		}
	}

	/*
	 * Mark the node as being dropped, so that the pg_autoctl node-active
	 * process can implement further actions at drop time.
	 */
	LogAndNotifyMessage(
		message, BUFSIZE,
		"Setting goal state of " NODE_FORMAT
		" from formation \"%s\" and group %d to \"dropped\""
		" to implement node removal.",
		NODE_FORMAT_ARGS(currentNode),
		currentNode->formationId,
		currentNode->groupId);

	SetNodeGoalState(currentNode, REPLICATION_STATE_DROPPED, message);

	/*
	 * Adjust number-sync-standbys if necessary.
	 *
	 * otherNodesGroupList is the list of all the remaining nodes, and that
	 * includes the current primary, which might be setup with replication
	 * quorum set to true (and probably is).
	 */

	/*
	 * NOTE(review): the "- 1" discounts one node from the count of remaining
	 * sync standbys (presumably the primary mentioned above) — confirm
	 * against CountSyncStandbys() semantics.
	 */
	int countSyncStandbys = CountSyncStandbys(otherNodesGroupList) - 1;

	if (countSyncStandbys < (formation->number_sync_standbys + 1))
	{
		formation->number_sync_standbys = countSyncStandbys - 1;

		if (formation->number_sync_standbys < 0)
		{
			formation->number_sync_standbys = 0;
		}

		if (!SetFormationNumberSyncStandbys(formation->formationId,
											formation->number_sync_standbys))
		{
			ereport(ERROR,
					(errmsg("couldn't set the formation \"%s\" "
							"number_sync_standbys to %d now that a "
							"standby node has been removed",
							currentNode->formationId,
							formation->number_sync_standbys)));
		}

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Setting number_sync_standbys to %d for formation \"%s\" "
			"now that we have %d standby nodes set with replication-quorum.",
			formation->number_sync_standbys,
			formation->formationId,
			countSyncStandbys);
	}

	/* now proceed with the failover, starting with the first standby */
	if (currentNodeIsPrimary)
	{
		/* if we have at least one other node in the group, proceed */
		if (firstStandbyNode)
		{
			(void) ProceedGroupState(firstStandbyNode);
		}
	}
	else
	{
		/* find the primary, if any, and have it realize a node has left */
		AutoFailoverNode *primaryNode =
			GetPrimaryNodeInGroup(currentNode->formationId,
								  currentNode->groupId);

		if (primaryNode)
		{
			ReplicationState goalState = primaryNode->goalState;

			(void) ProceedGroupState(primaryNode);

			/*
			 * When the removal of a secondary node has no impact on the
			 * primary node state, we still need to change the replication
			 * settings to adjust to the possibly new
			 * synchronous_standby_names, so we force APPLY_SETTINGS in that
			 * case.
			 */
			if (primaryNode->goalState == goalState &&
				goalState != REPLICATION_STATE_APPLY_SETTINGS)
			{
				LogAndNotifyMessage(
					message, BUFSIZE,
					"Setting goal state of " NODE_FORMAT
					" to apply_settings after removing standby " NODE_FORMAT
					" from formation %s.",
					NODE_FORMAT_ARGS(primaryNode),
					NODE_FORMAT_ARGS(currentNode),
					formation->formationId);

				SetNodeGoalState(primaryNode,
								 REPLICATION_STATE_APPLY_SETTINGS, message);
			}
		}
	}

	return true;
}


/*
 * perform_failover promotes the secondary in the given group
 */
Datum
perform_failover(PG_FUNCTION_ARGS)
{
	checkPgAutoFailoverVersion();

	text *formationIdText = PG_GETARG_TEXT_P(0);
	char *formationId = text_to_cstring(formationIdText);
	int32 groupId = PG_GETARG_INT32(1);

	/* failover changes node states: serialize access to the group */
	LockFormation(formationId, ShareLock);
	LockNodeGroup(formationId, groupId, ExclusiveLock);

	List *groupNodeList = AutoFailoverNodeGroup(formationId, groupId);
	if (list_length(groupNodeList) < 2)
	{
		ereport(ERROR,
				(errmsg("cannot fail over: group %d in formation %s "
						"currently has %d node registered",
						groupId, formationId, list_length(groupNodeList)),
				 errdetail("At least 2 nodes are required "
						   "to implement a failover")));
	}

	/* get a current primary node that we can failover from (accepts writes) */
	AutoFailoverNode *primaryNode =
		GetNodeToFailoverFromInGroup(formationId, groupId);

	if (primaryNode == NULL)
	{
		ereport(ERROR,
				(errmsg("couldn't find the primary node in formation \"%s\", "
						"group %d", formationId, groupId)));
	}

	/*
	 * When we have only two nodes, we can failover directly to the secondary
	 * node, provided its current state allows for that.
	 *
	 * When we have more than two nodes, then we need to check that we have at
	 * least one candidate for failover and initiate the REPORT_LSN dance to
	 * make the failover happen.
	 */
	if (list_length(groupNodeList) == 2)
	{
		List *standbyNodesGroupList = AutoFailoverOtherNodesList(primaryNode);

		/* with 2 nodes registered, exactly one standby must remain */
		if (list_length(standbyNodesGroupList) != 1)
		{
			ereport(ERROR,
					(errmsg("couldn't find the standby node in "
							"formation \"%s\", group %d with primary node "
							NODE_FORMAT,
							formationId, groupId,
							NODE_FORMAT_ARGS(primaryNode))));
		}

		AutoFailoverNode *secondaryNode = linitial(standbyNodesGroupList);

		/* the only standby must already be assigned the secondary state */
		if (secondaryNode->goalState != REPLICATION_STATE_SECONDARY)
		{
			const char *secondaryState =
				ReplicationStateGetName(secondaryNode->goalState);

			ereport(ERROR,
					(errmsg(
						 "standby " NODE_FORMAT " is in state \"%s\", "
						 "which prevents the node for being a failover candidate",
						 NODE_FORMAT_ARGS(secondaryNode),
						 secondaryState)));
		}

		/*
		 * In order to safely proceed we need to ensure that the primary node
		 * has reached the primary state fully already. In the transition to
		 * PRIMARY we actually wait until the current LSN observed on the
		 * primary has made it to the secondary, which is a needed guarantee
		 * for avoiding data loss.
*/ if (!IsCurrentState(primaryNode, REPLICATION_STATE_PRIMARY) || !IsCurrentState(secondaryNode, REPLICATION_STATE_SECONDARY)) { ereport(ERROR, (errmsg( "cannot fail over: primary node is not in a stable state"), errdetail(NODE_FORMAT " has reported state \"%s\" and" " is assigned state \"%s\"," " and " NODE_FORMAT " has reported state \"%s\"" " and is assigned state \"%s\"", NODE_FORMAT_ARGS(primaryNode), ReplicationStateGetName(primaryNode->reportedState), ReplicationStateGetName(primaryNode->goalState), NODE_FORMAT_ARGS(secondaryNode), ReplicationStateGetName(secondaryNode->reportedState), ReplicationStateGetName(secondaryNode->goalState)), errhint("a stable state must be observed to " "perform a manual failover"))); } char message[BUFSIZE] = { 0 }; LogAndNotifyMessage( message, BUFSIZE, "Setting goal state of " NODE_FORMAT " to draining and " NODE_FORMAT " to prepare_promotion after a user-initiated failover.", NODE_FORMAT_ARGS(primaryNode), NODE_FORMAT_ARGS(secondaryNode)); SetNodeGoalState(primaryNode, REPLICATION_STATE_DRAINING, message); SetNodeGoalState(secondaryNode, REPLICATION_STATE_PREPARE_PROMOTION, message); } else { List *standbyNodesGroupList = AutoFailoverOtherNodesList(primaryNode); AutoFailoverNode *firstStandbyNode = linitial(standbyNodesGroupList); char message[BUFSIZE] = { 0 }; /* so we have at least one candidate, let's get started */ LogAndNotifyMessage( message, BUFSIZE, "Setting goal state of " NODE_FORMAT "at LSN %X/%X to draining after a user-initiated failover.", NODE_FORMAT_ARGS(primaryNode), (uint32) (primaryNode->reportedLSN >> 32), (uint32) primaryNode->reportedLSN); SetNodeGoalState(primaryNode, REPLICATION_STATE_DRAINING, message); /* * When a failover is performed with all the nodes up and running, the * old primary is often in the best situation to win the election. In * that case, we trick the candidate priority in a way that makes the * node lose the election. * * We undo this change in priority once the election completes. 
		 */
		if (primaryNode)
		{
			char message[BUFSIZE] = { 0 };

			/* handicap the old primary so it loses the upcoming election */
			primaryNode->candidatePriority -= CANDIDATE_PRIORITY_INCREMENT;

			ReportAutoFailoverNodeReplicationSetting(
				primaryNode->nodeId,
				primaryNode->nodeHost,
				primaryNode->nodePort,
				primaryNode->candidatePriority,
				primaryNode->replicationQuorum);

			LogAndNotifyMessage(
				message, BUFSIZE,
				"Updating candidate priority to %d for " NODE_FORMAT,
				primaryNode->candidatePriority,
				NODE_FORMAT_ARGS(primaryNode));

			NotifyStateChange(primaryNode, message);
		}

		/* now proceed with the failover, starting with the first standby */
		(void) ProceedGroupState(firstStandbyNode);
	}

	PG_RETURN_VOID();
}


/*
 * promote promotes a given target node in a group.
 */
Datum
perform_promotion(PG_FUNCTION_ARGS)
{
	checkPgAutoFailoverVersion();

	text *formationIdText = PG_GETARG_TEXT_P(0);
	char *formationId = text_to_cstring(formationIdText);

	text *nodeNameText = PG_GETARG_TEXT_P(1);
	char *nodeName = text_to_cstring(nodeNameText);

	AutoFailoverNode *currentNode =
		GetAutoFailoverNodeByName(formationId, nodeName);

	if (currentNode == NULL)
	{
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_OBJECT),
				 errmsg("node \"%s\" is not registered in formation \"%s\"",
						nodeName, formationId)));
	}

	/* promotion changes node states: serialize access to the group */
	LockFormation(formationId, ShareLock);
	LockNodeGroup(formationId, currentNode->groupId, ExclusiveLock);

	/*
	 * If the current node is the primary, that's done.
	 */
	if (IsCurrentState(currentNode, REPLICATION_STATE_SINGLE) ||
		IsCurrentState(currentNode, REPLICATION_STATE_PRIMARY))
	{
		/* return false: no promotion is happening */
		ereport(NOTICE,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("cannot perform promotion: node %s in formation %s "
						"is already a primary.",
						nodeName, formationId)));
		PG_RETURN_BOOL(false);
	}

	/*
	 * If the node is not a primary, it needs to be in the SECONDARY state or
	 * in the REPORT_LSN state.
	 In the case where none of the nodes are a
	 * candidate for failover, and the primary has been lost, all the remaining
	 * nodes are assigned REPORT_LSN, and we make it possible to then manually
	 * promote one of them.
	 *
	 * When we call perform_failover() to implement the actual failover
	 * orchestration, this condition is going to be checked again, but in a
	 * different way.
	 *
	 * For instance, the target could be in MAINTENANCE and perform_failover
	 * would still be able to implement a failover given another secondary node
	 * being around.
	 */
	if (!IsCurrentState(currentNode, REPLICATION_STATE_SECONDARY) &&
		!IsCurrentState(currentNode, REPLICATION_STATE_REPORT_LSN))
	{
		/* return false: no promotion is happening */
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg(
					 "cannot perform promotion: node %s in formation %s "
					 "has reported state \"%s\" and is assigned state \"%s\", "
					 "promotion can only be performed when "
					 "in state \"secondary\".",
					 nodeName, formationId,
					 ReplicationStateGetName(currentNode->reportedState),
					 ReplicationStateGetName(currentNode->goalState))));
	}

	/*
	 * If we have only two nodes in the group, then perform a failover.
	 */
	List *groupNodesList =
		AutoFailoverNodeGroup(currentNode->formationId, currentNode->groupId);
	int totalNodesCount = list_length(groupNodesList);

	if (totalNodesCount <= 2)
	{
		/* re-enter the SQL-callable failover function in-process */
		DirectFunctionCall2(perform_failover,
							CStringGetTextDatum(formationId),
							Int32GetDatum(currentNode->groupId));

		/* if we reach this point, then a failover is in progress */
		PG_RETURN_BOOL(true);
	}
	else
	{
		char message[BUFSIZE] = { 0 };

		/*
		 * In the general case, we perform a little trick:
		 *
		 * - first increment the node's candidate-priority by 100,
		 *
		 * - then call perform_failover,
		 *
		 * - when the node reaches WAIT_PRIMARY again, after promotion, reset
		 *   its candidate priority.
		 */
		currentNode->candidatePriority += CANDIDATE_PRIORITY_INCREMENT;

		ReportAutoFailoverNodeReplicationSetting(
			currentNode->nodeId,
			currentNode->nodeHost,
			currentNode->nodePort,
			currentNode->candidatePriority,
			currentNode->replicationQuorum);

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Updating candidate priority to %d for " NODE_FORMAT,
			currentNode->candidatePriority,
			NODE_FORMAT_ARGS(currentNode));

		NotifyStateChange(currentNode, message);

		/*
		 * In case of errors in the perform_failover function, we ereport an
		 * ERROR and that causes the transaction to fail (ROLLBACK). In that
		 * case, the UPDATE of the candidate priority in the
		 * pgautofailover.node table is also cancelled, and the notification
		 * above is not sent either.
		 */
		DirectFunctionCall2(perform_failover,
							CStringGetTextDatum(formationId),
							Int32GetDatum(currentNode->groupId));

		/* if we reach this point, then a failover is in progress */
		PG_RETURN_BOOL(true);
	}

	/* can't happen, keep compiler happy */
	PG_RETURN_BOOL(false);
}


/*
 * start_maintenance sets the given node in maintenance state.
 *
 * This operation is only allowed on a secondary node. To do so on a primary
 * node, first failover so that it's now a secondary.
 */
Datum
start_maintenance(PG_FUNCTION_ARGS)
{
	checkPgAutoFailoverVersion();

	int64 nodeId = PG_GETARG_INT64(0);

	AutoFailoverNode *primaryNode = NULL;

	/* reported states from which a standby may enter maintenance */
	List *secondaryStates = list_make2_int(REPLICATION_STATE_SECONDARY,
										   REPLICATION_STATE_CATCHINGUP);

	char message[BUFSIZE];

	AutoFailoverNode *currentNode = GetAutoFailoverNodeById(nodeId);

	if (currentNode == NULL)
	{
		/* unknown node id: report false rather than erroring out */
		PG_RETURN_BOOL(false);
	}

	/* maintenance changes node states: serialize access to the group */
	LockFormation(currentNode->formationId, ShareLock);
	LockNodeGroup(currentNode->formationId, currentNode->groupId,
				  ExclusiveLock);

	AutoFailoverFormation *formation = GetFormation(currentNode->formationId);

	List *groupNodesList =
		AutoFailoverNodeGroup(currentNode->formationId, currentNode->groupId);
	int totalNodesCount = list_length(groupNodesList);

	/* check pre-conditions for the current node (secondary) */
	if (currentNode->reportedState == REPLICATION_STATE_MAINTENANCE ||
		currentNode->goalState == REPLICATION_STATE_MAINTENANCE)
	{
		/* if we're already in maintenance, we're good */
		PG_RETURN_BOOL(true);
	}

	/*
	 * We allow to go to maintenance in the following cases only:
	 *
	 * - current node is a primary, and we then promote the secondary
	 * - current node is a secondary or is catching up
	 */
	if (!(IsCurrentState(currentNode, REPLICATION_STATE_PRIMARY) ||
		  (IsStateIn(currentNode->reportedState, secondaryStates))))
	{
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("cannot start maintenance: node %s:%d has reported state "
						"\"%s\" and is assigned state \"%s\", "
						"expected either \"primary\", "
						"\"secondary\" or \"catchingup\"",
						currentNode->nodeHost, currentNode->nodePort,
						ReplicationStateGetName(currentNode->reportedState),
						ReplicationStateGetName(currentNode->goalState))));
	}

	/*
	 * We now need to have the primary node identified, and the list of the
	 * secondary nodes (not including those already in maintenance), to decide
	 * if we can proceed.
	 */
	if (IsCurrentState(currentNode, REPLICATION_STATE_PRIMARY))
	{
		primaryNode = currentNode;
	}
	else
	{
		primaryNode = GetPrimaryNodeInGroup(currentNode->formationId,
											currentNode->groupId);

		if (primaryNode == NULL)
		{
			ereport(ERROR,
					(errmsg("couldn't find the primary node in formation \"%s\", "
							"group %d",
							currentNode->formationId, currentNode->groupId)));
		}
	}

	/*
	 * We need to always have at least formation->number_sync_standbys nodes in
	 * the SECONDARY state participating in the quorum, otherwise writes may be
	 * blocked on the primary. In case when we know we will have to block
	 * writes, warn our user.
	 *
	 * As they might still need to operate this maintenance operation, we won't
	 * forbid it by erroring out, though.
	 */
	List *secondaryNodesList =
		AutoFailoverOtherNodesListInState(primaryNode,
										  REPLICATION_STATE_SECONDARY);

	int candidatesCount = CountHealthyCandidates(secondaryNodesList);
	int secondaryNodesCount = CountHealthySyncStandbys(secondaryNodesList);

	if (formation->number_sync_standbys > 0 &&
		secondaryNodesCount <= formation->number_sync_standbys &&
		IsHealthySyncStandby(currentNode))
	{
		/* warn only: the user may need this maintenance anyway */
		ereport(WARNING,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("Starting maintenance on " NODE_FORMAT
						" will block writes on the primary " NODE_FORMAT,
						NODE_FORMAT_ARGS(currentNode),
						NODE_FORMAT_ARGS(primaryNode)),
				 errdetail("we now have %d "
						   "healthy node(s) left in the \"secondary\" state "
						   "and formation \"%s\" number-sync-standbys requires "
						   "%d sync standbys",
						   secondaryNodesCount - 1,
						   formation->formationId,
						   formation->number_sync_standbys)));
	}

	/*
	 * Now that we cleared that adding another node in MAINTENANCE is
	 * compatible with our service expectations from
	 * formation->number_sync_standbys, we may proceed.
	 *
	 * We proceed in different ways when asked to put a primary or a secondary
	 * to maintenance: in the case of a primary, we must failover.
	 */
	if (IsCurrentState(currentNode, REPLICATION_STATE_PRIMARY))
	{
		List *standbyNodesGroupList = AutoFailoverOtherNodesList(currentNode);
		AutoFailoverNode *firstStandbyNode = linitial(standbyNodesGroupList);
		char message[BUFSIZE] = { 0 };

		/*
		 * We need at least one candidate node to initiate a failover and allow
		 * the primary to reach maintenance.
		 */
		if (candidatesCount < 1)
		{
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("Starting maintenance on " NODE_FORMAT
							" in state \"%s\" is not currently possible",
							NODE_FORMAT_ARGS(currentNode),
							ReplicationStateGetName(currentNode->reportedState)),
					 errdetail("there is currently %d candidate nodes available",
							   candidatesCount)));
		}

		if (totalNodesCount == 2)
		{
			/*
			 * Set the primary to prepare_maintenance now, and if we have a
			 * single secondary we assign it prepare_promotion, otherwise we
			 * need to elect a secondary, same as in perform_failover.
			 */
			LogAndNotifyMessage(
				message, BUFSIZE,
				"Setting goal state of " NODE_FORMAT " to prepare_maintenance "
				"after a user-initiated start_maintenance call.",
				NODE_FORMAT_ARGS(currentNode));

			SetNodeGoalState(currentNode,
							 REPLICATION_STATE_PREPARE_MAINTENANCE, message);

			AutoFailoverNode *otherNode = firstStandbyNode;

			/*
			 * We put the only secondary node straight to prepare_promotion.
*/ LogAndNotifyMessage( message, BUFSIZE, "Setting goal state of " NODE_FORMAT " to prepare_maintenance and " NODE_FORMAT " to prepare_promotion " "after a user-initiated start_maintenance call.", NODE_FORMAT_ARGS(currentNode), NODE_FORMAT_ARGS(otherNode)); SetNodeGoalState(otherNode, REPLICATION_STATE_PREPARE_PROMOTION, message); } else { /* put the primary directly to maintenance */ LogAndNotifyMessage( message, BUFSIZE, "Setting goal state of " NODE_FORMAT " to maintenance " "after a user-initiated start_maintenance call.", NODE_FORMAT_ARGS(currentNode)); SetNodeGoalState(currentNode, REPLICATION_STATE_PREPARE_MAINTENANCE, message); /* now proceed with the failover, starting with the first standby */ (void) ProceedGroupState(firstStandbyNode); } PG_RETURN_BOOL(true); } /* * Only allow a secondary to get to MAINTENANCE when the primary is in the * PRIMARY state. */ else if (IsStateIn(currentNode->reportedState, secondaryStates) && IsCurrentState(primaryNode, REPLICATION_STATE_PRIMARY)) { /* * In most cases we can simply put a secondary directly into * maintenance mode. However, when putting the last secondary node * that's part of the replication quorum to maintenance, we disable * sync rep on the primary by switching it to wait_primary. Otherwise * the primary won't be able to accept writes until the monitor assigns * it wait_primary. This way we're nice about it and don't bring the * secondary down before that happens. Because we didn't change the * state of any standby node yet, we get there when the count is one * (not zero). 
		 */
		if (formation->number_sync_standbys == 0 &&
			secondaryNodesCount == 1 &&
			IsHealthySyncStandby(currentNode))
		{
			/* last quorum standby: release sync rep on the primary first */
			LogAndNotifyMessage(
				message, BUFSIZE,
				"Setting goal state of " NODE_FORMAT " to wait_primary and "
				NODE_FORMAT " to wait_maintenance "
				"after a user-initiated start_maintenance call.",
				NODE_FORMAT_ARGS(primaryNode),
				NODE_FORMAT_ARGS(currentNode));

			SetNodeGoalState(primaryNode,
							 REPLICATION_STATE_WAIT_PRIMARY, message);

			SetNodeGoalState(currentNode,
							 REPLICATION_STATE_WAIT_MAINTENANCE, message);
		}
		else
		{
			/* plenty of other standbys remain: go straight to maintenance */
			LogAndNotifyMessage(
				message, BUFSIZE,
				"Setting goal state of " NODE_FORMAT " to maintenance "
				"after a user-initiated start_maintenance call.",
				NODE_FORMAT_ARGS(currentNode));

			SetNodeGoalState(currentNode,
							 REPLICATION_STATE_MAINTENANCE, message);
		}
	}
	else
	{
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("cannot start maintenance: current state for "
						NODE_FORMAT
						" is \"%s\", expected \"secondary\" or \"catchingup\", "
						"and current state for primary " NODE_FORMAT
						" is \"%s\" ➜ \"%s\" ",
						NODE_FORMAT_ARGS(currentNode),
						ReplicationStateGetName(currentNode->reportedState),
						NODE_FORMAT_ARGS(primaryNode),
						ReplicationStateGetName(primaryNode->reportedState),
						ReplicationStateGetName(primaryNode->goalState))));
	}

	PG_RETURN_BOOL(true);
}


/*
 * stop_maintenance brings a node back from maintenance to a participating
 * member of the formation. Depending on the state of the formation it's either
 * assigned catchingup or report_lsn.
 *
 * This operation is only allowed on a node that's in the maintenance state.
 */
Datum
stop_maintenance(PG_FUNCTION_ARGS)
{
	checkPgAutoFailoverVersion();

	int64 nodeId = PG_GETARG_INT64(0);

	char message[BUFSIZE] = { 0 };

	AutoFailoverNode *currentNode = GetAutoFailoverNodeById(nodeId);

	if (currentNode == NULL)
	{
		/* unknown node id: report false rather than erroring out */
		PG_RETURN_BOOL(false);
	}

	/* leaving maintenance changes node states: serialize group access */
	LockFormation(currentNode->formationId, ShareLock);
	LockNodeGroup(currentNode->formationId, currentNode->groupId,
				  ExclusiveLock);

	List *groupNodesList =
		AutoFailoverNodeGroup(currentNode->formationId, currentNode->groupId);
	int totalNodesCount = list_length(groupNodesList);

	if (!IsCurrentState(currentNode, REPLICATION_STATE_MAINTENANCE) &&
		!(totalNodesCount > 2 &&
		  IsCurrentState(currentNode, REPLICATION_STATE_PREPARE_MAINTENANCE)))
	{
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("cannot stop maintenance when current state for "
						NODE_FORMAT " is not \"maintenance\"",
						NODE_FORMAT_ARGS(currentNode)),
				 errdetail("Current reported state is \"%s\" and "
						   "assigned state is \"%s\"",
						   ReplicationStateGetName(currentNode->reportedState),
						   ReplicationStateGetName(currentNode->goalState))));
	}

	/*
	 * We need to find the primary node even if we are in the middle of a
	 * failover, and it's already set to draining. That way we may rejoin the
	 * cluster, report our LSN, and help proceed to reach a consistent state.
	 */
	AutoFailoverNode *primaryNode =
		GetPrimaryOrDemotedNodeInGroup(currentNode->formationId,
									   currentNode->groupId);

	/*
	 * When there is no primary, we might be in trouble, we just want to join
	 * the possibly ongoing election.
*/ if (totalNodesCount == 1) { (void) ProceedGroupState(currentNode); PG_RETURN_BOOL(true); } else if (primaryNode == NULL && totalNodesCount == 2) { ereport(ERROR, (errmsg("couldn't find the primary node in formation \"%s\", " "group %d", currentNode->formationId, currentNode->groupId))); } else if (primaryNode == NULL && totalNodesCount > 2) { LogAndNotifyMessage( message, BUFSIZE, "Setting goal state of " NODE_FORMAT " to report_lsn after a user-initiated stop_maintenance call.", NODE_FORMAT_ARGS(currentNode)); SetNodeGoalState(currentNode, REPLICATION_STATE_REPORT_LSN, message); PG_RETURN_BOOL(true); } /* * When a failover is in progress and stop_maintenance() is called (by * means of pg_autoctl disable maintenance or otherwise), then we should * join the crew on REPORT_LSN: the last known primary can be presumed * down. */ if (IsFailoverInProgress(groupNodesList)) { LogAndNotifyMessage( message, BUFSIZE, "Setting goal state of " NODE_FORMAT " to catchingup after a user-initiated stop_maintenance call.", NODE_FORMAT_ARGS(currentNode)); SetNodeGoalState(currentNode, REPLICATION_STATE_REPORT_LSN, message); } else { LogAndNotifyMessage( message, BUFSIZE, "Setting goal state of " NODE_FORMAT " to catchingup after a user-initiated stop_maintenance call.", NODE_FORMAT_ARGS(currentNode)); SetNodeGoalState(currentNode, REPLICATION_STATE_CATCHINGUP, message); } PG_RETURN_BOOL(true); } /* * set_node_candidate_priority sets node candidate priority property */ Datum set_node_candidate_priority(PG_FUNCTION_ARGS) { checkPgAutoFailoverVersion(); text *formationIdText = PG_GETARG_TEXT_P(0); char *formationId = text_to_cstring(formationIdText); text *nodeNameText = PG_GETARG_TEXT_P(1); char *nodeName = text_to_cstring(nodeNameText); int candidatePriority = PG_GETARG_INT32(2); ListCell *nodeCell = NULL; int nonZeroCandidatePriorityNodeCount = 0; AutoFailoverNode *currentNode = GetAutoFailoverNodeByName(formationId, nodeName); if (currentNode == NULL) { ereport(ERROR, 
				(errcode(ERRCODE_UNDEFINED_OBJECT),
				 errmsg("node \"%s\" is not registered in formation \"%s\"",
						nodeName, formationId)));
	}

	/* replication settings changes: serialize access to the group */
	LockFormation(currentNode->formationId, ShareLock);
	LockNodeGroup(currentNode->formationId, currentNode->groupId,
				  ExclusiveLock);

	List *nodesGroupList =
		AutoFailoverNodeGroup(currentNode->formationId, currentNode->groupId);
	int nodesCount = list_length(nodesGroupList);

	/* reject priorities outside of the user-settable range */
	if (candidatePriority < 0 ||
		candidatePriority > MAX_USER_DEFINED_CANDIDATE_PRIORITY)
	{
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid value for candidate_priority \"%d\" "
						"expected an integer value between 0 and %d",
						candidatePriority,
						MAX_USER_DEFINED_CANDIDATE_PRIORITY)));
	}

	/* nodes outside the "default" cluster must keep priority zero */
	if (strcmp(currentNode->nodeCluster, "default") != 0 &&
		candidatePriority != 0)
	{
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid value for candidate_priority: "
						"read-replica nodes in a citus cluster must always "
						"have candidate priority set to zero")));
	}

	if (candidatePriority == 0 && currentNode->candidatePriority != 0)
	{
		/*
		 * We need to ensure we have at least two nodes with a non-zero
		 * candidate priority, otherwise we can't failover. Those two nodes
		 * include the current primary.
		 */
		foreach(nodeCell, nodesGroupList)
		{
			AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell);

			if (node->candidatePriority > 0)
			{
				nonZeroCandidatePriorityNodeCount++;
			}
		}

		/* account for the change we're asked to implement */
		nonZeroCandidatePriorityNodeCount -= 1;

		if (nonZeroCandidatePriorityNodeCount < 2)
		{
			/* warn only (NOTICE): the user may still want this setting */
			ereport(NOTICE,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("setting candidate priority to zero, preventing "
							"automated failover"),
					 errdetail("Group %d in formation \"%s\" have no "
							   "failover candidate.",
							   currentNode->groupId, formationId)));
		}
	}

	currentNode->candidatePriority = candidatePriority;

	ReportAutoFailoverNodeReplicationSetting(
		currentNode->nodeId,
		currentNode->nodeHost,
		currentNode->nodePort,
		currentNode->candidatePriority,
		currentNode->replicationQuorum);

	if (nodesCount == 1)
	{
		/* single node: nothing to re-apply, just notify the change */
		char message[BUFSIZE];

		LogAndNotifyMessage(
			message, BUFSIZE,
			"Updating candidate priority to %d for " NODE_FORMAT,
			currentNode->candidatePriority,
			NODE_FORMAT_ARGS(currentNode));

		NotifyStateChange(currentNode, message);
	}
	else
	{
		char message[BUFSIZE];

		AutoFailoverNode *primaryNode =
			GetPrimaryNodeInGroup(currentNode->formationId,
								  currentNode->groupId);

		/*
		 * If we allow setting changes during APPLY_SETTINGS we open the door
		 * for race conditions where we can't be sure that the latest changes
		 * have been applied.
		 *
		 * If we don't currently have a primary node anyway, we can just
		 * proceed with the change.
		 */
		if (primaryNode &&
			!IsCurrentState(primaryNode, REPLICATION_STATE_APPLY_SETTINGS))
		{
			/* have the primary re-apply the replication settings */
			LogAndNotifyMessage(
				message, BUFSIZE,
				"Setting goal state of " NODE_FORMAT
				" to apply_settings after updating " NODE_FORMAT
				" candidate priority to %d.",
				NODE_FORMAT_ARGS(primaryNode),
				NODE_FORMAT_ARGS(currentNode),
				currentNode->candidatePriority);

			SetNodeGoalState(primaryNode,
							 REPLICATION_STATE_APPLY_SETTINGS, message);
		}
		/* if primaryNode is not NULL, then current state is APPLY_SETTINGS */
		else if (primaryNode)
		{
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("cannot set candidate priority when current state "
							"for primary " NODE_FORMAT " is \"%s\"",
							NODE_FORMAT_ARGS(primaryNode),
							ReplicationStateGetName(primaryNode->reportedState))));
		}

		/* other case is that we failed to find a primary node, proceed */
	}

	PG_RETURN_BOOL(true);
}


/*
 * set_node_replication_quorum sets node replication quorum property
 */
Datum
set_node_replication_quorum(PG_FUNCTION_ARGS)
{
	checkPgAutoFailoverVersion();

	text *formationIdText = PG_GETARG_TEXT_P(0);
	char *formationId = text_to_cstring(formationIdText);

	text *nodeNameText = PG_GETARG_TEXT_P(1);
	char *nodeName = text_to_cstring(nodeNameText);

	bool replicationQuorum = PG_GETARG_BOOL(2);

	AutoFailoverNode *currentNode =
		GetAutoFailoverNodeByName(formationId, nodeName);

	if (currentNode == NULL)
	{
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_OBJECT),
				 errmsg("node \"%s\" is not registered in formation \"%s\"",
						nodeName, formationId)));
	}

	/* replication settings changes: serialize access to the group */
	LockFormation(currentNode->formationId, ShareLock);
	LockNodeGroup(currentNode->formationId, currentNode->groupId,
				  ExclusiveLock);

	List *nodesGroupList =
		AutoFailoverNodeGroup(currentNode->formationId, currentNode->groupId);
	int nodesCount = list_length(nodesGroupList);

	currentNode->replicationQuorum = replicationQuorum;

	ReportAutoFailoverNodeReplicationSetting(currentNode->nodeId,
											 currentNode->nodeHost,
											 currentNode->nodePort,
											 currentNode->candidatePriority,
											 currentNode->replicationQuorum);

	/* we
need to see the result of that operation in the next query */ CommandCounterIncrement(); /* it's not always possible to opt-out from replication-quorum */ if (!currentNode->replicationQuorum) { AutoFailoverFormation *formation = GetFormation(currentNode->formationId); AutoFailoverNode *primaryNode = GetPrimaryNodeInGroup(formation->formationId, currentNode->groupId); int standbyCount = 0; if (primaryNode == NULL) { /* maybe we could use an Assert() instead? */ ereport(ERROR, (errmsg("Couldn't find the primary node in " "formation \"%s\", group %d", formation->formationId, currentNode->groupId))); } if (!FormationNumSyncStandbyIsValid(formation, primaryNode, currentNode->groupId, &standbyCount)) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("can't set replication quorum to false"), errdetail("At least %d standby nodes are required " "in formation %s with number_sync_standbys = %d, " "and only %d would be participating in " "the replication quorum", formation->number_sync_standbys + 1, formation->formationId, formation->number_sync_standbys, standbyCount))); } } if (nodesCount == 1) { char message[BUFSIZE]; LogAndNotifyMessage( message, BUFSIZE, "Updating replicationQuorum to %s for " NODE_FORMAT, currentNode->replicationQuorum ? "true" : "false", NODE_FORMAT_ARGS(currentNode)); NotifyStateChange(currentNode, message); } else { char message[BUFSIZE]; AutoFailoverNode *primaryNode = GetPrimaryNodeInGroup(currentNode->formationId, currentNode->groupId); /* * If we allow setting changes during APPLY_SETTINGS we open the door * for race conditions where we can't be sure that the latest changes * have been applied. * * If we don't currently have a primary node anyway, we can just * proceed with the change. 
*/ if (primaryNode && !IsCurrentState(primaryNode, REPLICATION_STATE_APPLY_SETTINGS)) { LogAndNotifyMessage( message, BUFSIZE, "Setting goal state of " NODE_FORMAT " to apply_settings after updating " NODE_FORMAT " replication quorum to %s.", NODE_FORMAT_ARGS(primaryNode), NODE_FORMAT_ARGS(currentNode), currentNode->replicationQuorum ? "true" : "false"); SetNodeGoalState(primaryNode, REPLICATION_STATE_APPLY_SETTINGS, message); } /* if primaryNode is not NULL, then current state is APPLY_SETTINGS */ else if (primaryNode) { ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cannot set replication quorum when current state " "for primary " NODE_FORMAT " is \"%s\"", NODE_FORMAT_ARGS(primaryNode), ReplicationStateGetName(primaryNode->reportedState)))); } /* other case is that we failed to find a primary node, proceed */ } PG_RETURN_BOOL(true); } /* * update_node_metadata allows to update a node's nodename, hostname, and port. * * The pg_autoctl client fetches the list of "other" nodes on each iteration * and will take it from there that they need to update their HBA rules when * the hostname has changed. */ Datum update_node_metadata(PG_FUNCTION_ARGS) { checkPgAutoFailoverVersion(); int64 nodeid = 0; char *nodeName = NULL; char *nodeHost = NULL; int32 nodePort = 0; if (PG_ARGISNULL(0)) { ereport(ERROR, (errmsg("udpate_node_metadata requires a non-null nodeid"))); } else { nodeid = PG_GETARG_INT64(0); } AutoFailoverNode *currentNode = GetAutoFailoverNodeById(nodeid); if (currentNode == NULL) { ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("node %lld is not registered", (long long) nodeid))); } LockFormation(currentNode->formationId, ShareLock); LockNodeGroup(currentNode->formationId, currentNode->groupId, ExclusiveLock); /* * When arguments are NULL, replace them with the current value of the node * metadata, so that the UPDATE statement then is a noop on that field. 
*/ if (PG_ARGISNULL(1)) { nodeName = currentNode->nodeName; } else { text *nodeNameText = PG_GETARG_TEXT_P(1); nodeName = text_to_cstring(nodeNameText); } if (PG_ARGISNULL(2)) { nodeHost = currentNode->nodeHost; } else { text *nodeHostText = PG_GETARG_TEXT_P(2); nodeHost = text_to_cstring(nodeHostText); } if (PG_ARGISNULL(3)) { nodePort = currentNode->nodePort; } else { nodePort = PG_GETARG_INT32(3); } UpdateAutoFailoverNodeMetadata(currentNode->nodeId, nodeName, nodeHost, nodePort); PG_RETURN_BOOL(true); } /* * synchronous_standby_names returns the synchronous_standby_names parameter * value for a given Postgres service group in a given formation. */ Datum synchronous_standby_names(PG_FUNCTION_ARGS) { checkPgAutoFailoverVersion(); text *formationIdText = PG_GETARG_TEXT_P(0); char *formationId = text_to_cstring(formationIdText); int32 groupId = PG_GETARG_INT32(1); AutoFailoverFormation *formation = GetFormation(formationId); List *nodesGroupList = AutoFailoverNodeGroup(formationId, groupId); int nodesCount = list_length(nodesGroupList); /* * When there's no nodes registered yet, there's no pg_autoctl process that * needs the information anyway. Return NULL. 
*/ if (nodesCount == 0) { ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("no nodes found in group %d of formation \"%s\"", groupId, formationId))); } /* when we have a SINGLE node we disable synchronous replication */ if (nodesCount == 1) { PG_RETURN_TEXT_P(cstring_to_text("")); } /* when we have more than one node, fetch the primary */ AutoFailoverNode *primaryNode = GetPrimaryNodeInGroup(formationId, groupId); List *standbyNodesGroupList = AutoFailoverOtherNodesList(primaryNode); /* * Single standby case, we assume formation->number_sync_standbys == 0 */ if (nodesCount == 2) { AutoFailoverNode *secondaryNode = linitial(standbyNodesGroupList); if (secondaryNode != NULL && secondaryNode->replicationQuorum && secondaryNode->goalState == REPLICATION_STATE_SECONDARY) { /* enable synchronous replication */ StringInfo sbnames = makeStringInfo(); appendStringInfo(sbnames, "ANY 1 (pgautofailover_standby_%lld)", (long long) secondaryNode->nodeId); PG_RETURN_TEXT_P(cstring_to_text(sbnames->data)); } else { /* disable synchronous replication */ PG_RETURN_TEXT_P(cstring_to_text("")); } } /* * General case now, we have multiple standbys each with a candidate * priority, and with replicationQuorum (bool: true or false). * * - syncStandbyNodesGroupList contains only nodes that participates in * the replication quorum * * - then we build synchronous_standby_names with the following model: * * ANY 1 (pgautofailover_standby_2, pgautofailover_standby_3) * * The num_sync number is the formation number_sync_standbys property. */ { List *syncStandbyNodesGroupList = GroupListSyncStandbys(standbyNodesGroupList); int count = list_length(syncStandbyNodesGroupList); if (count == 0 || IsCurrentState(primaryNode, REPLICATION_STATE_WAIT_PRIMARY)) { /* * If no standby participates in the replication Quorum, we * disable synchronous replication. 
*/ PG_RETURN_TEXT_P(cstring_to_text("")); } else { /* * We accept number_sync_standbys to be set to zero to enable our * failover trade-off, but won't send a synchronous_standby_names * setting with ANY 0 () or FIRST 0 (), that would not make sense. */ int number_sync_standbys = formation->number_sync_standbys == 0 ? 1 : formation->number_sync_standbys; StringInfo sbnames = makeStringInfo(); ListCell *nodeCell = NULL; bool firstNode = true; appendStringInfo(sbnames, "ANY %d (", number_sync_standbys); foreach(nodeCell, syncStandbyNodesGroupList) { AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell); appendStringInfo(sbnames, "%spgautofailover_standby_%lld", firstNode ? "" : ", ", (long long) node->nodeId); if (firstNode) { firstNode = false; } } appendStringInfoString(sbnames, ")"); PG_RETURN_TEXT_P(cstring_to_text(sbnames->data)); } } } pg_auto_failover-1.6.3/src/monitor/node_metadata.c000066400000000000000000001514621414244367200222750ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/node_metadata.c * * Implementation of functions related to health check metadata. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. 
* *------------------------------------------------------------------------- */ #include "postgres.h" #include "fmgr.h" #include "miscadmin.h" #include "nodes/pg_list.h" /* list_qsort is only in Postgres 11 and 12 */ #include "version_compat.h" #include "health_check.h" #include "metadata.h" #include "node_metadata.h" #include "notifications.h" #include "access/genam.h" #include "access/heapam.h" #include "access/htup.h" #include "access/htup_details.h" #include "access/tupdesc.h" #include "access/xact.h" #include "access/xlogdefs.h" #include "catalog/indexing.h" #include "catalog/namespace.h" #include "catalog/pg_extension.h" #include "catalog/pg_type.h" #include "commands/sequence.h" #include "executor/spi.h" #include "lib/stringinfo.h" #include "nodes/pg_list.h" #include "utils/builtins.h" #include "utils/fmgroids.h" #include "utils/lsyscache.h" #include "utils/pg_lsn.h" #include "utils/rel.h" #include "utils/relcache.h" #include "utils/syscache.h" /* GUC variables */ int DrainTimeoutMs = 30 * 1000; int UnhealthyTimeoutMs = 20 * 1000; int StartupGracePeriodMs = 10 * 1000; /* * AllAutoFailoverNodes returns all AutoFailover nodes in a formation as a * list. 
*/ List * AllAutoFailoverNodes(char *formationId) { List *nodeList = NIL; MemoryContext callerContext = CurrentMemoryContext; Oid argTypes[] = { TEXTOID /* formationid */ }; Datum argValues[] = { CStringGetTextDatum(formationId) /* formationid */ }; const int argCount = sizeof(argValues) / sizeof(argValues[0]); uint64 rowNumber = 0; const char *selectQuery = SELECT_ALL_FROM_AUTO_FAILOVER_NODE_TABLE " WHERE formationid = $1 "; SPI_connect(); int spiStatus = SPI_execute_with_args(selectQuery, argCount, argTypes, argValues, NULL, false, 0); if (spiStatus != SPI_OK_SELECT) { elog(ERROR, "could not select from " AUTO_FAILOVER_NODE_TABLE); } MemoryContext spiContext = MemoryContextSwitchTo(callerContext); for (rowNumber = 0; rowNumber < SPI_processed; rowNumber++) { HeapTuple heapTuple = SPI_tuptable->vals[rowNumber]; AutoFailoverNode *pgAutoFailoverNode = TupleToAutoFailoverNode(SPI_tuptable->tupdesc, heapTuple); nodeList = lappend(nodeList, pgAutoFailoverNode); } MemoryContextSwitchTo(spiContext); SPI_finish(); return nodeList; } /* * TupleToAutoFailoverNode constructs a AutoFailoverNode from a heap tuple. 
*/ AutoFailoverNode * TupleToAutoFailoverNode(TupleDesc tupleDescriptor, HeapTuple heapTuple) { bool isNull = false; bool sysIdentifierIsNull = false; Datum formationId = heap_getattr(heapTuple, Anum_pgautofailover_node_formationid, tupleDescriptor, &isNull); Datum nodeId = heap_getattr(heapTuple, Anum_pgautofailover_node_nodeid, tupleDescriptor, &isNull); Datum groupId = heap_getattr(heapTuple, Anum_pgautofailover_node_groupid, tupleDescriptor, &isNull); Datum nodeName = heap_getattr(heapTuple, Anum_pgautofailover_node_nodename, tupleDescriptor, &isNull); Datum nodeHost = heap_getattr(heapTuple, Anum_pgautofailover_node_nodehost, tupleDescriptor, &isNull); Datum nodePort = heap_getattr(heapTuple, Anum_pgautofailover_node_nodeport, tupleDescriptor, &isNull); Datum sysIdentifier = heap_getattr(heapTuple, Anum_pgautofailover_node_sysidentifier, tupleDescriptor, &sysIdentifierIsNull); Datum goalState = heap_getattr(heapTuple, Anum_pgautofailover_node_goalstate, tupleDescriptor, &isNull); Datum reportedState = heap_getattr(heapTuple, Anum_pgautofailover_node_reportedstate, tupleDescriptor, &isNull); Datum pgIsRunning = heap_getattr(heapTuple, Anum_pgautofailover_node_reportedpgisrunning, tupleDescriptor, &isNull); Datum pgsrSyncState = heap_getattr(heapTuple, Anum_pgautofailover_node_reportedrepstate, tupleDescriptor, &isNull); Datum reportTime = heap_getattr(heapTuple, Anum_pgautofailover_node_reporttime, tupleDescriptor, &isNull); Datum walReportTime = heap_getattr(heapTuple, Anum_pgautofailover_node_walreporttime, tupleDescriptor, &isNull); Datum health = heap_getattr(heapTuple, Anum_pgautofailover_node_health, tupleDescriptor, &isNull); Datum healthCheckTime = heap_getattr(heapTuple, Anum_pgautofailover_node_healthchecktime, tupleDescriptor, &isNull); Datum stateChangeTime = heap_getattr(heapTuple, Anum_pgautofailover_node_statechangetime, tupleDescriptor, &isNull); Datum reportedTLI = heap_getattr(heapTuple, Anum_pgautofailover_node_reportedTLI, tupleDescriptor, 
&isNull); Datum reportedLSN = heap_getattr(heapTuple, Anum_pgautofailover_node_reportedLSN, tupleDescriptor, &isNull); Datum candidatePriority = heap_getattr(heapTuple, Anum_pgautofailover_node_candidate_priority, tupleDescriptor, &isNull); Datum replicationQuorum = heap_getattr(heapTuple, Anum_pgautofailover_node_replication_quorum, tupleDescriptor, &isNull); Datum nodeCluster = heap_getattr(heapTuple, Anum_pgautofailover_node_nodecluster, tupleDescriptor, &isNull); Oid goalStateOid = DatumGetObjectId(goalState); Oid reportedStateOid = DatumGetObjectId(reportedState); AutoFailoverNode *pgAutoFailoverNode = (AutoFailoverNode *) palloc0( sizeof(AutoFailoverNode)); pgAutoFailoverNode->formationId = TextDatumGetCString(formationId); pgAutoFailoverNode->nodeId = DatumGetInt64(nodeId); pgAutoFailoverNode->groupId = DatumGetInt32(groupId); pgAutoFailoverNode->nodeName = TextDatumGetCString(nodeName); pgAutoFailoverNode->nodeHost = TextDatumGetCString(nodeHost); pgAutoFailoverNode->nodePort = DatumGetInt32(nodePort); pgAutoFailoverNode->sysIdentifier = sysIdentifierIsNull ? 
0 : DatumGetInt64(sysIdentifier); pgAutoFailoverNode->goalState = EnumGetReplicationState(goalStateOid); pgAutoFailoverNode->reportedState = EnumGetReplicationState(reportedStateOid); pgAutoFailoverNode->pgIsRunning = DatumGetBool(pgIsRunning); pgAutoFailoverNode->pgsrSyncState = SyncStateFromString(TextDatumGetCString(pgsrSyncState)); pgAutoFailoverNode->reportTime = DatumGetTimestampTz(reportTime); pgAutoFailoverNode->walReportTime = DatumGetTimestampTz(walReportTime); pgAutoFailoverNode->health = DatumGetInt32(health); pgAutoFailoverNode->healthCheckTime = DatumGetTimestampTz(healthCheckTime); pgAutoFailoverNode->stateChangeTime = DatumGetTimestampTz(stateChangeTime); pgAutoFailoverNode->reportedTLI = DatumGetInt32(reportedTLI); pgAutoFailoverNode->reportedLSN = DatumGetLSN(reportedLSN); pgAutoFailoverNode->candidatePriority = DatumGetInt32(candidatePriority); pgAutoFailoverNode->replicationQuorum = DatumGetBool(replicationQuorum); pgAutoFailoverNode->nodeCluster = TextDatumGetCString(nodeCluster); return pgAutoFailoverNode; } /* * AutoFailoverNodeGroup returns all nodes in the given formation and * group as a list. 
*/ List * AutoFailoverNodeGroup(char *formationId, int groupId) { List *nodeList = NIL; MemoryContext callerContext = CurrentMemoryContext; Oid argTypes[] = { TEXTOID, /* formationid */ INT4OID /* groupid */ }; Datum argValues[] = { CStringGetTextDatum(formationId), /* formationid */ Int32GetDatum(groupId) /* groupid */ }; const int argCount = sizeof(argValues) / sizeof(argValues[0]); uint64 rowNumber = 0; const char *selectQuery = SELECT_ALL_FROM_AUTO_FAILOVER_NODE_TABLE " WHERE formationid = $1 AND groupid = $2" " AND goalstate <> 'dropped'" " ORDER BY nodeid"; SPI_connect(); int spiStatus = SPI_execute_with_args(selectQuery, argCount, argTypes, argValues, NULL, false, 0); if (spiStatus != SPI_OK_SELECT) { elog(ERROR, "could not select from " AUTO_FAILOVER_NODE_TABLE); } MemoryContext spiContext = MemoryContextSwitchTo(callerContext); for (rowNumber = 0; rowNumber < SPI_processed; rowNumber++) { HeapTuple heapTuple = SPI_tuptable->vals[rowNumber]; AutoFailoverNode *pgAutoFailoverNode = TupleToAutoFailoverNode(SPI_tuptable->tupdesc, heapTuple); nodeList = lappend(nodeList, pgAutoFailoverNode); } MemoryContextSwitchTo(spiContext); SPI_finish(); return nodeList; } /* * AutoFailoverAllNodesInGroup returns all nodes in the given formation and * group as a list, and includes nodes that are currently being dropped. 
 */
List *
AutoFailoverAllNodesInGroup(char *formationId, int groupId)
{
	List *nodeList = NIL;
	MemoryContext callerContext = CurrentMemoryContext;

	Oid argTypes[] = {
		TEXTOID, /* formationid */
		INT4OID  /* groupid */
	};

	Datum argValues[] = {
		CStringGetTextDatum(formationId), /* formationid */
		Int32GetDatum(groupId)            /* groupid */
	};
	const int argCount = sizeof(argValues) / sizeof(argValues[0]);
	uint64 rowNumber = 0;

	/*
	 * Unlike AutoFailoverNodeGroup, this query does not filter out nodes
	 * whose goalstate is 'dropped': nodes being dropped are included.
	 */
	const char *selectQuery =
		SELECT_ALL_FROM_AUTO_FAILOVER_NODE_TABLE
		" WHERE formationid = $1 AND groupid = $2"
		" ORDER BY nodeid";

	SPI_connect();

	int spiStatus = SPI_execute_with_args(selectQuery, argCount, argTypes,
										  argValues, NULL, false, 0);
	if (spiStatus != SPI_OK_SELECT)
	{
		elog(ERROR, "could not select from " AUTO_FAILOVER_NODE_TABLE);
	}

	/* allocate result nodes in the caller's context, not SPI's */
	MemoryContext spiContext = MemoryContextSwitchTo(callerContext);

	for (rowNumber = 0; rowNumber < SPI_processed; rowNumber++)
	{
		HeapTuple heapTuple = SPI_tuptable->vals[rowNumber];
		AutoFailoverNode *pgAutoFailoverNode =
			TupleToAutoFailoverNode(SPI_tuptable->tupdesc, heapTuple);

		nodeList = lappend(nodeList, pgAutoFailoverNode);
	}

	MemoryContextSwitchTo(spiContext);

	SPI_finish();

	return nodeList;
}


/*
 * AutoFailoverOtherNodesList returns a list of all the other nodes in the same
 * formation and group as the given one.
 */
List *
AutoFailoverOtherNodesList(AutoFailoverNode *pgAutoFailoverNode)
{
	ListCell *nodeCell = NULL;
	List *otherNodesList = NIL;

	if (pgAutoFailoverNode == NULL)
	{
		return NIL;
	}

	List *groupNodeList = AutoFailoverNodeGroup(pgAutoFailoverNode->formationId,
												pgAutoFailoverNode->groupId);

	foreach(nodeCell, groupNodeList)
	{
		AutoFailoverNode *otherNode = (AutoFailoverNode *) lfirst(nodeCell);

		if (otherNode != NULL &&
			otherNode->nodeId != pgAutoFailoverNode->nodeId)
		{
			otherNodesList = lappend(otherNodesList, otherNode);
		}
	}

	return otherNodesList;
}


/*
 * AutoFailoverOtherNodesListInState returns a list of all the other nodes in
 * the same formation and group as the given one, keeping only nodes whose
 * goalState equals the given currentState.
*/ List * AutoFailoverOtherNodesListInState(AutoFailoverNode *pgAutoFailoverNode, ReplicationState currentState) { ListCell *nodeCell = NULL; List *otherNodesList = NIL; if (pgAutoFailoverNode == NULL) { return NIL; } List *groupNodeList = AutoFailoverNodeGroup(pgAutoFailoverNode->formationId, pgAutoFailoverNode->groupId); foreach(nodeCell, groupNodeList) { AutoFailoverNode *otherNode = (AutoFailoverNode *) lfirst(nodeCell); if (otherNode != NULL && otherNode->nodeId != pgAutoFailoverNode->nodeId && otherNode->goalState == currentState) { otherNodesList = lappend(otherNodesList, otherNode); } } return otherNodesList; } /* * AutoFailoverCandidateNodesList returns a list of all the other nodes in the * same formation and group as the given one, with candidate priority > 0. */ List * AutoFailoverCandidateNodesListInState(AutoFailoverNode *pgAutoFailoverNode, ReplicationState currentState) { ListCell *nodeCell = NULL; List *otherNodesList = NIL; if (pgAutoFailoverNode == NULL) { return NIL; } List *groupNodeList = AutoFailoverNodeGroup(pgAutoFailoverNode->formationId, pgAutoFailoverNode->groupId); foreach(nodeCell, groupNodeList) { AutoFailoverNode *otherNode = (AutoFailoverNode *) lfirst(nodeCell); if (otherNode != NULL && otherNode->nodeId != pgAutoFailoverNode->nodeId && otherNode->candidatePriority > 0 && otherNode->goalState == currentState) { otherNodesList = lappend(otherNodesList, otherNode); } } return otherNodesList; } /* * GetPrimaryNodeInGroup returns the writable node in the specified group, if * any. 
 */
AutoFailoverNode *
GetPrimaryNodeInGroup(char *formationId, int32 groupId)
{
	AutoFailoverNode *writableNode = NULL;
	ListCell *nodeCell = NULL;

	List *groupNodeList = AutoFailoverNodeGroup(formationId, groupId);

	foreach(nodeCell, groupNodeList)
	{
		/* the primary is the node whose goal state allows taking writes */
		AutoFailoverNode *currentNode = (AutoFailoverNode *) lfirst(nodeCell);

		if (CanTakeWritesInState(currentNode->goalState))
		{
			writableNode = currentNode;
			break;
		}
	}

	return writableNode;
}


/*
 * GetNodeToFailoverFromInGroup returns the node to fail over from in the
 * specified group, if any: the first node whose goal state allows initiating
 * a failover and that has already reached that goal state.
 */
AutoFailoverNode *
GetNodeToFailoverFromInGroup(char *formationId, int32 groupId)
{
	AutoFailoverNode *failoverNode = NULL;
	ListCell *nodeCell = NULL;

	List *groupNodeList = AutoFailoverNodeGroup(formationId, groupId);

	foreach(nodeCell, groupNodeList)
	{
		AutoFailoverNode *currentNode = (AutoFailoverNode *) lfirst(nodeCell);

		/* only consider nodes that have caught up with their goal state */
		if (CanInitiateFailover(currentNode->goalState) &&
			currentNode->reportedState == currentNode->goalState)
		{
			failoverNode = currentNode;
			break;
		}
	}

	return failoverNode;
}


/*
 * GetPrimaryOrDemotedNodeInGroup returns the node in the group with a role
 * that only a primary can have.
 *
 * When handling multiple standbys, it could be that the primary node gets
 * demoted, triggering a failover with the other standby node(s). Then the
 * demoted node connects back to the monitor, and should be processed as a
 * standby that re-joins the group, not as a primary being demoted.
*/ AutoFailoverNode * GetPrimaryOrDemotedNodeInGroup(char *formationId, int32 groupId) { AutoFailoverNode *primaryNode = NULL; ListCell *nodeCell = NULL; List *groupNodeList = AutoFailoverNodeGroup(formationId, groupId); /* first find a node that is writable */ foreach(nodeCell, groupNodeList) { AutoFailoverNode *currentNode = (AutoFailoverNode *) lfirst(nodeCell); if (CanTakeWritesInState(currentNode->goalState)) { primaryNode = currentNode; break; } } /* if we found a writable node, we're done */ if (primaryNode != NULL) { return primaryNode; } /* * Maybe we have a primary that is draining or has been demoted? * In case there are more than one of those, choose the one that is * currently being demoted. */ foreach(nodeCell, groupNodeList) { AutoFailoverNode *currentNode = (AutoFailoverNode *) lfirst(nodeCell); if (StateBelongsToPrimary(currentNode->reportedState) && (!IsBeingDemotedPrimary(primaryNode) || !IsDemotedPrimary(currentNode))) { primaryNode = currentNode; } } return primaryNode; } /* * FindFailoverNewStandbyNode returns the first node found in given list that * is a new standby, so that we can process each standby one after the other. */ AutoFailoverNode * FindFailoverNewStandbyNode(List *groupNodeList) { ListCell *nodeCell = NULL; AutoFailoverNode *standbyNode = NULL; /* find the standby for errdetail */ foreach(nodeCell, groupNodeList) { AutoFailoverNode *otherNode = (AutoFailoverNode *) lfirst(nodeCell); if (IsCurrentState(otherNode, REPLICATION_STATE_WAIT_STANDBY) || IsCurrentState(otherNode, REPLICATION_STATE_CATCHINGUP)) { standbyNode = otherNode; } } return standbyNode; } /* * FindMostAdvancedStandby returns the node in groupNodeList that has the most * advanced LSN. 
 */
AutoFailoverNode *
FindMostAdvancedStandby(List *groupNodeList)
{
	ListCell *nodeCell = NULL;
	AutoFailoverNode *mostAdvancedNode = NULL;

	/* keep the node with the highest reportedLSN seen so far */
	foreach(nodeCell, groupNodeList)
	{
		AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell);

		if (mostAdvancedNode == NULL ||
			mostAdvancedNode->reportedLSN < node->reportedLSN)
		{
			mostAdvancedNode = node;
		}
	}

	return mostAdvancedNode;
}


/*
 * IsFailoverInProgress returns true when at least one node in the given
 * groupNodeList is currently participating in a promotion.
 */
bool
IsFailoverInProgress(List *groupNodeList)
{
	ListCell *nodeCell = NULL;

	foreach(nodeCell, groupNodeList)
	{
		AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell);

		if (node == NULL)
		{
			/* shouldn't happen */
			ereport(ERROR, (errmsg("BUG: node is NULL")));
		}

		/*
		 * A single node participating in a promotion allows to answer already.
		 */
		if (IsParticipatingInPromotion(node))
		{
			return true;
		}

		/* no conclusions to be drawn from nodes in maintenance */
		if (IsInMaintenance(node))
		{
			continue;
		}
	}

	/*
	 * If no node is participating in a promotion, then no failover is in
	 * progress.
	 */
	return false;
}


/*
 * FindCandidateNodeBeingPromoted scans through the given groupNodeList and
 * returns the first node found that IsBeingPromoted().
*/ AutoFailoverNode * FindCandidateNodeBeingPromoted(List *groupNodeList) { ListCell *nodeCell = NULL; foreach(nodeCell, groupNodeList) { AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell); if (node == NULL) { /* shouldn't happen */ ereport(ERROR, (errmsg("BUG: node is NULL"))); } /* we might have a failover ongoing already */ if (IsBeingPromoted(node)) { return node; } } return NULL; } /* * pgautofailover_node_candidate_priority_compare * qsort comparator for sorting node lists by candidate priority */ #if (PG_VERSION_NUM >= 130000) static int pgautofailover_node_candidate_priority_compare(const union ListCell *a, const union ListCell *b) { AutoFailoverNode *node1 = (AutoFailoverNode *) lfirst(a); AutoFailoverNode *node2 = (AutoFailoverNode *) lfirst(b); #else static int pgautofailover_node_candidate_priority_compare(const void *a, const void *b) { AutoFailoverNode *node1 = (AutoFailoverNode *) lfirst(*(ListCell **) a); AutoFailoverNode *node2 = (AutoFailoverNode *) lfirst(*(ListCell **) b); #endif if (node1->candidatePriority > node2->candidatePriority) { return -1; } if (node1->candidatePriority < node2->candidatePriority) { return 1; } return 0; } /* * GroupListCandidates returns a list of nodes in groupNodeList that are all * candidates for failover (those with AutoFailoverNode.candidatePriority > 0), * sorted by candidatePriority. 
*/ List * GroupListCandidates(List *groupNodeList) { ListCell *nodeCell = NULL; List *candidateNodesList = NIL; List *sortedNodeList = list_copy(groupNodeList); #if (PG_VERSION_NUM >= 130000) list_sort(sortedNodeList, pgautofailover_node_candidate_priority_compare); #else sortedNodeList = list_qsort(sortedNodeList, pgautofailover_node_candidate_priority_compare); #endif foreach(nodeCell, sortedNodeList) { AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell); if (node->candidatePriority > 0) { candidateNodesList = lappend(candidateNodesList, node); } } list_free(sortedNodeList); return candidateNodesList; } /* * pgautofailover_node_reportedlsn_compare * qsort comparator for sorting node lists by reported lsn, descending */ #if (PG_VERSION_NUM >= 130000) static int pgautofailover_node_reportedlsn_compare(const union ListCell *a, const union ListCell *b) { AutoFailoverNode *node1 = (AutoFailoverNode *) lfirst(a); AutoFailoverNode *node2 = (AutoFailoverNode *) lfirst(b); #else static int pgautofailover_node_reportedlsn_compare(const void *a, const void *b) { AutoFailoverNode *node1 = (AutoFailoverNode *) lfirst(*(ListCell **) a); AutoFailoverNode *node2 = (AutoFailoverNode *) lfirst(*(ListCell **) b); #endif if (node1->reportedTLI > node2->reportedTLI || (node1->reportedTLI == node2->reportedTLI && node1->reportedLSN > node2->reportedLSN)) { return -1; } if (node1->reportedTLI < node2->reportedTLI || (node1->reportedTLI == node2->reportedTLI && node1->reportedLSN < node2->reportedLSN)) { return 1; } return 0; } /* * ListMostAdvancedStandbyNodes returns the nodes in groupNodeList that have * the most advanced LSN. 
*/ List * ListMostAdvancedStandbyNodes(List *groupNodeList) { ListCell *nodeCell = NULL; List *mostAdvancedNodeList = NIL; XLogRecPtr mostAdvancedLSN = 0; #if (PG_VERSION_NUM >= 130000) List *sortedNodeList = list_copy(groupNodeList); list_sort(sortedNodeList, pgautofailover_node_reportedlsn_compare); #else List *sortedNodeList = list_qsort(groupNodeList, pgautofailover_node_reportedlsn_compare); #endif foreach(nodeCell, sortedNodeList) { AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell); /* skip old primary */ if (StateBelongsToPrimary(node->reportedState)) { continue; } if (mostAdvancedLSN == 0) { mostAdvancedLSN = node->reportedLSN; } if (node->reportedLSN == mostAdvancedLSN) { mostAdvancedNodeList = lappend(mostAdvancedNodeList, node); } } return mostAdvancedNodeList; } /* * GroupListSyncStandbys returns a list of nodes in groupNodeList that are all * candidates for failover (those with AutoFailoverNode.replicationQuorum set * to true), sorted by candidatePriority. */ List * GroupListSyncStandbys(List *groupNodeList) { ListCell *nodeCell = NULL; List *syncStandbyNodesList = NIL; if (groupNodeList == NIL) { return NIL; } #if (PG_VERSION_NUM >= 130000) List *sortedNodeList = list_copy(groupNodeList); list_sort(sortedNodeList, pgautofailover_node_candidate_priority_compare); #else List *sortedNodeList = list_qsort(groupNodeList, pgautofailover_node_candidate_priority_compare); #endif foreach(nodeCell, sortedNodeList) { AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell); if (node->replicationQuorum) { syncStandbyNodesList = lappend(syncStandbyNodesList, node); } } list_free(sortedNodeList); return syncStandbyNodesList; } /* * CountSyncStandbys returns how many standby nodes have their * replicationQuorum property set to true in the given groupNodeList. 
*/ int CountSyncStandbys(List *groupNodeList) { int count = 0; ListCell *nodeCell = NULL; foreach(nodeCell, groupNodeList) { AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell); if (node->replicationQuorum) { ++count; } } return count; } /* * IsHealthySyncStandby returns true if the node its replicationQuorum property * set to true in the given groupNodeList, but only if only if that node is * currently currently in REPLICATION_STATE_SECONDARY and known healthy. */ bool IsHealthySyncStandby(AutoFailoverNode *node) { return node->replicationQuorum && IsCurrentState(node, REPLICATION_STATE_SECONDARY) && IsHealthy(node); } /* * CountHealthySyncStandbys returns how many standby nodes have their * replicationQuorum property set to true in the given groupNodeList, counting * only nodes that are currently in REPLICATION_STATE_SECONDARY and known * healthy. */ int CountHealthySyncStandbys(List *groupNodeList) { int count = 0; ListCell *nodeCell = NULL; foreach(nodeCell, groupNodeList) { AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell); if (IsHealthySyncStandby(node)) { ++count; } } return count; } /* * CountHealthyCandidates returns how many standby nodes have their * candidatePriority > 0 in the given groupNodeList, counting only nodes that * are currently in REPLICATION_STATE_SECONDARY and known healthy. */ int CountHealthyCandidates(List *groupNodeList) { int count = 0; ListCell *nodeCell = NULL; foreach(nodeCell, groupNodeList) { AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell); if (node->candidatePriority > 0 && IsCurrentState(node, REPLICATION_STATE_SECONDARY) && IsHealthy(node)) { ++count; } } return count; } /* * GetAutoFailoverNode returns a single AutoFailover node by hostname and port. 
 */
AutoFailoverNode *
GetAutoFailoverNode(char *nodeHost, int nodePort)
{
	AutoFailoverNode *pgAutoFailoverNode = NULL;
	MemoryContext callerContext = CurrentMemoryContext;

	Oid argTypes[] = {
		TEXTOID, /* nodehost */
		INT4OID  /* nodeport */
	};

	Datum argValues[] = {
		CStringGetTextDatum(nodeHost), /* nodehost */
		Int32GetDatum(nodePort)        /* nodeport */
	};
	const int argCount = sizeof(argValues) / sizeof(argValues[0]);

	const char *selectQuery =
		SELECT_ALL_FROM_AUTO_FAILOVER_NODE_TABLE
		" WHERE nodehost = $1 AND nodeport = $2";

	SPI_connect();

	/* tcount = 1: we only need the first matching row */
	int spiStatus = SPI_execute_with_args(selectQuery, argCount, argTypes,
										  argValues, NULL, false, 1);
	if (spiStatus != SPI_OK_SELECT)
	{
		elog(ERROR, "could not select from " AUTO_FAILOVER_NODE_TABLE);
	}

	if (SPI_processed > 0)
	{
		/* build the result in the caller's memory context, not SPI's */
		MemoryContext spiContext = MemoryContextSwitchTo(callerContext);
		pgAutoFailoverNode = TupleToAutoFailoverNode(SPI_tuptable->tupdesc,
													 SPI_tuptable->vals[0]);
		MemoryContextSwitchTo(spiContext);
	}
	else
	{
		pgAutoFailoverNode = NULL;
	}

	SPI_finish();

	return pgAutoFailoverNode;
}


/*
 * GetAutoFailoverNodeById returns a single AutoFailover node identified by
 * its node id.
 *
 * This function returns NULL, when the node could not be found.
 */
AutoFailoverNode *
GetAutoFailoverNodeById(int64 nodeId)
{
	AutoFailoverNode *pgAutoFailoverNode = NULL;
	MemoryContext callerContext = CurrentMemoryContext;

	Oid argTypes[] = {
		INT8OID /* nodeId */
	};

	Datum argValues[] = {
		Int64GetDatum(nodeId) /* nodeId */
	};
	const int argCount = sizeof(argValues) / sizeof(argValues[0]);

	const char *selectQuery =
		SELECT_ALL_FROM_AUTO_FAILOVER_NODE_TABLE " WHERE nodeid = $1";

	SPI_connect();

	/* tcount = 1: we only need the first matching row */
	int spiStatus = SPI_execute_with_args(selectQuery, argCount, argTypes,
										  argValues, NULL, false, 1);
	if (spiStatus != SPI_OK_SELECT)
	{
		elog(ERROR, "could not select from " AUTO_FAILOVER_NODE_TABLE);
	}

	if (SPI_processed > 0)
	{
		/* build the result in the caller's memory context, not SPI's */
		MemoryContext spiContext = MemoryContextSwitchTo(callerContext);
		pgAutoFailoverNode = TupleToAutoFailoverNode(SPI_tuptable->tupdesc,
													 SPI_tuptable->vals[0]);
		MemoryContextSwitchTo(spiContext);
	}
	else
	{
		pgAutoFailoverNode = NULL;
	}

	SPI_finish();

	return pgAutoFailoverNode;
}


/*
 * GetAutoFailoverNodeByName returns a single AutoFailover node identified by
 * formation id and node name.
 *
 * This function returns NULL, when the node could not be found.
*/
AutoFailoverNode *
GetAutoFailoverNodeByName(char *formationId, char *nodeName)
{
	AutoFailoverNode *pgAutoFailoverNode = NULL;

	/* remember the caller's context: SPI_connect switches to SPI memory */
	MemoryContext callerContext = CurrentMemoryContext;

	Oid argTypes[] = {
		TEXTOID, /* formationId */
		TEXTOID  /* nodename */
	};

	Datum argValues[] = {
		CStringGetTextDatum(formationId), /* formationId */
		CStringGetTextDatum(nodeName)     /* nodename */
	};
	const int argCount = sizeof(argValues) / sizeof(argValues[0]);

	const char *selectQuery =
		SELECT_ALL_FROM_AUTO_FAILOVER_NODE_TABLE
		" WHERE formationid = $1 and nodename = $2";

	SPI_connect();

	int spiStatus = SPI_execute_with_args(selectQuery, argCount, argTypes,
										  argValues, NULL, false, 1);

	if (spiStatus != SPI_OK_SELECT)
	{
		elog(ERROR, "could not select from " AUTO_FAILOVER_NODE_TABLE);
	}

	if (SPI_processed > 0)
	{
		/* allocate the result in the caller's context: it must survive SPI_finish */
		MemoryContext spiContext = MemoryContextSwitchTo(callerContext);
		pgAutoFailoverNode = TupleToAutoFailoverNode(SPI_tuptable->tupdesc,
													 SPI_tuptable->vals[0]);
		MemoryContextSwitchTo(spiContext);
	}
	else
	{
		pgAutoFailoverNode = NULL;
	}

	SPI_finish();

	return pgAutoFailoverNode;
}


/*
 * AddAutoFailoverNode adds a new AutoFailoverNode to pgautofailover.node with
 * the given properties.
 *
 * We use an SPI INSERT query here, so that sequence defaults and triggers on
 * the node table apply as usual.
 *
 * NOTE(review): nodeid is a bigint (int64) both in the table and in the
 * insertedNodeId local below, but this function returns int — confirm node
 * ids cannot exceed the int range, or widen the return type.
 */
int
AddAutoFailoverNode(char *formationId,
					FormationKind formationKind,
					int64 nodeId,
					int groupId,
					char *nodeName,
					char *nodeHost,
					int nodePort,
					uint64 sysIdentifier,
					ReplicationState goalState,
					ReplicationState reportedState,
					int candidatePriority,
					bool replicationQuorum,
					char *nodeCluster)
{
	Oid goalStateOid = ReplicationStateGetEnum(goalState);
	Oid reportedStateOid = ReplicationStateGetEnum(reportedState);
	Oid replicationStateTypeOid = ReplicationStateTypeOid();

	/* default node name prefix: citus nodes are named after their role */
	const char *prefix =
		formationKind == FORMATION_KIND_CITUS
		? (groupId == 0 ? "coordinator" : "worker")
		: "node";

	Oid argTypes[] = {
		TEXTOID,                 /* formationid */
		INT8OID,                 /* nodeid */
		INT4OID,                 /* groupid */
		TEXTOID,                 /* nodename */
		TEXTOID,                 /* nodehost */
		INT4OID,                 /* nodeport */
		INT8OID,                 /* sysidentifier */
		replicationStateTypeOid, /* goalstate */
		replicationStateTypeOid, /* reportedstate */
		INT4OID,                 /* candidate_priority */
		BOOLOID,                 /* replication_quorum */
		TEXTOID,                 /* node name prefix */
		TEXTOID                  /* nodecluster */
	};

	Datum argValues[] = {
		CStringGetTextDatum(formationId),   /* formationid */
		Int64GetDatum(nodeId),              /* nodeid */
		Int32GetDatum(groupId),             /* groupid */
		nodeName == NULL
		? (Datum) 0
		: CStringGetTextDatum(nodeName),    /* nodename */
		CStringGetTextDatum(nodeHost),      /* nodehost */
		Int32GetDatum(nodePort),            /* nodeport */
		Int64GetDatum(sysIdentifier),       /* sysidentifier */
		ObjectIdGetDatum(goalStateOid),     /* goalstate */
		ObjectIdGetDatum(reportedStateOid), /* reportedstate */
		Int32GetDatum(candidatePriority),   /* candidate_priority */
		BoolGetDatum(replicationQuorum),    /* replication_quorum */
		CStringGetTextDatum(prefix),        /* prefix */
		CStringGetTextDatum(nodeCluster)    /* nodecluster */
	};

	/*
	 * Rather than turning the register_node function as non STRICT, we accept
	 * the default system identifier to be zero and then insert NULL here
	 * instead.
	 *
	 * The alternative would imply testing the 10 args of the function against
	 * the possibility of them being NULL. Also, on the client side, when
	 * PGDATA does not exist our pg_control_data.system_identifier internal
	 * structure is initialized with a zero value.
	 */
	const char argNulls[] = {
		' ',                            /* formationid */
		' ',                            /* nodeid */
		' ',                            /* groupid */
		nodeName == NULL ? 'n' : ' ',   /* nodename */
		' ',                            /* nodehost */
		' ',                            /* nodeport */
		sysIdentifier == 0 ? 'n' : ' ', /* sysidentifier */
		' ',                            /* goalstate */
		' ',                            /* reportedstate */
		' ',                            /* candidate_priority */
		' ',                            /* replication_quorum */
		' ',                            /* prefix */
		' '                             /* nodecluster */
	};
	const int argCount = sizeof(argValues) / sizeof(argValues[0]);

	int64 insertedNodeId = 0;

	/*
	 * The node name can be specified by the user as the --name argument at
	 * node registration time, in which case that's what we use of course.
	 *
	 * That said, when the user is not using --name, we still want the node
	 * name NOT NULL and default to 'node_%d' using the nodeid. We can't use
	 * another column in a DEFAULT value though, so we implement this default
	 * in a CASE expression in the INSERT query.
	 *
	 * In a citus formation kind, we want to name the node with the convention
	 * 'coordinator_%d' for the coordinator nodes, and 'worker%d' for the
	 * worker nodes.
	 */
	const char *insertQuery =
		"WITH seq(nodeid) AS "
		"(SELECT case when $2 = -1 "
		" then nextval('pgautofailover.node_nodeid_seq'::regclass) "
		" else $2 end) "
		"INSERT INTO " AUTO_FAILOVER_NODE_TABLE
		" (formationid, nodeid, groupid, nodename, nodehost, nodeport, "
		" sysidentifier, goalstate, reportedstate, "
		" candidatepriority, replicationquorum, nodecluster)"
		" SELECT $1, seq.nodeid, $3, "
		" case when $4 is null then format('%s_%s', $12, seq.nodeid) else $4 end, "
		" $5, $6, $7, $8, $9, $10, $11, $13 "
		" FROM seq "
		"RETURNING nodeid";

	SPI_connect();

	int spiStatus = SPI_execute_with_args(insertQuery, argCount,
										  argTypes, argValues, argNulls,
										  false, 0);

	if (spiStatus == SPI_OK_INSERT_RETURNING && SPI_processed > 0)
	{
		bool isNull = false;

		Datum nodeIdDatum = SPI_getbinval(SPI_tuptable->vals[0],
										  SPI_tuptable->tupdesc,
										  1,
										  &isNull);

		insertedNodeId = DatumGetInt64(nodeIdDatum);
	}
	else
	{
		elog(ERROR, "could not insert into " AUTO_FAILOVER_NODE_TABLE);
	}

	/* when a desired_node_id has been given, maintain the nodeid sequence */
	if (nodeId != -1)
	{
		const char *setValQuery =
			"SELECT setval('pgautofailover.node_nodeid_seq'::regclass, "
			" max(nodeid)+1) "
			" FROM " AUTO_FAILOVER_NODE_TABLE;

		int spiStatus = SPI_execute_with_args(setValQuery, 0,
											  NULL, NULL, NULL,
											  false, 0);

		if (spiStatus != SPI_OK_SELECT)
		{
			elog(ERROR,
				 "could not setval('pgautofailover.node_nodeid_seq'::regclass)");
		}
	}

	SPI_finish();

	return insertedNodeId;
}


/*
 * SetNodeGoalState updates the goal state of a node both on-disk and
 * in-memory, and notifies the state change.
 */
void
SetNodeGoalState(AutoFailoverNode *pgAutoFailoverNode,
				 ReplicationState goalState,
				 const char *message)
{
	Oid goalStateOid = ReplicationStateGetEnum(goalState);
	Oid replicationStateTypeOid = ReplicationStateTypeOid();

	Oid argTypes[] = {
		replicationStateTypeOid, /* goalstate */
		INT8OID                  /* nodeid */
	};

	Datum argValues[] = {
		ObjectIdGetDatum(goalStateOid),           /* goalstate */
		Int64GetDatum(pgAutoFailoverNode->nodeId) /* nodeid */
	};
	const int argCount = sizeof(argValues) / sizeof(argValues[0]);

	const char *updateQuery =
		"UPDATE " AUTO_FAILOVER_NODE_TABLE
		" SET goalstate = $1, statechangetime = now() "
		"WHERE nodeid = $2";

	SPI_connect();

	int spiStatus = SPI_execute_with_args(updateQuery,
										  argCount, argTypes, argValues,
										  NULL, false, 0);

	if (spiStatus != SPI_OK_UPDATE)
	{
		elog(ERROR, "could not update " AUTO_FAILOVER_NODE_TABLE);
	}

	SPI_finish();

	/*
	 * Now that the UPDATE went through, update the pgAutoFailoverNode struct
	 * with the new goal State and notify the state change.
	 */
	pgAutoFailoverNode->goalState = goalState;

	if (message != NULL)
	{
		NotifyStateChange(pgAutoFailoverNode, (char *) message);
	}
}


/*
 * ReportAutoFailoverNodeState persists the reported state and WAL progress
 * (timeline and LSN) of a node.
 *
 * We use SPI to automatically handle triggers, function calls, etc.
*/
void
ReportAutoFailoverNodeState(char *nodeHost, int nodePort,
							ReplicationState reportedState,
							bool pgIsRunning, SyncState pgSyncState,
							int reportedTLI, XLogRecPtr reportedLSN)
{
	Oid reportedStateOid = ReplicationStateGetEnum(reportedState);
	Oid replicationStateTypeOid = ReplicationStateTypeOid();

	Oid argTypes[] = {
		replicationStateTypeOid, /* reportedstate */
		BOOLOID,                 /* pg_ctl status: is running */
		TEXTOID,                 /* pg_stat_replication.sync_state */
		INT4OID,                 /* reportedtli */
		LSNOID,                  /* reportedlsn */
		TEXTOID,                 /* nodehost */
		INT4OID                  /* nodeport */
	};

	Datum argValues[] = {
		ObjectIdGetDatum(reportedStateOid), /* reportedstate */
		BoolGetDatum(pgIsRunning),          /* pg_ctl status: is running */
		CStringGetTextDatum(SyncStateToString(pgSyncState)), /* sync_state */
		Int32GetDatum(reportedTLI),         /* reportedtli */
		LSNGetDatum(reportedLSN),           /* reportedlsn */
		CStringGetTextDatum(nodeHost),      /* nodehost */
		Int32GetDatum(nodePort)             /* nodeport */
	};
	const int argCount = sizeof(argValues) / sizeof(argValues[0]);

	/*
	 * The CASE expressions keep the previously stored TLI/LSN (and WAL report
	 * time) when the report carries a zero TLI or an invalid '0/0' LSN, and
	 * only touch statechangetime when the reported state actually changed.
	 */
	const char *updateQuery =
		"UPDATE " AUTO_FAILOVER_NODE_TABLE
		" SET reportedstate = $1, reporttime = now(), "
		"reportedpgisrunning = $2, reportedrepstate = $3, "
		"reportedtli = CASE $4 WHEN 0 THEN reportedtli ELSE $4 END, "
		"reportedlsn = CASE $5 WHEN '0/0'::pg_lsn THEN reportedlsn ELSE $5 END, "
		"walreporttime = CASE $5 WHEN '0/0'::pg_lsn THEN walreporttime ELSE now() END, "
		"statechangetime = CASE WHEN reportedstate <> $1 THEN now() ELSE statechangetime END "
		"WHERE nodehost = $6 AND nodeport = $7";

	SPI_connect();

	int spiStatus = SPI_execute_with_args(updateQuery,
										  argCount, argTypes, argValues,
										  NULL, false, 0);

	if (spiStatus != SPI_OK_UPDATE)
	{
		elog(ERROR, "could not update " AUTO_FAILOVER_NODE_TABLE);
	}

	SPI_finish();
}


/*
 * ReportAutoFailoverNodeHealth persists the current health of a node.
 *
 * We use SPI to automatically handle triggers, function calls, etc.
*/
void
ReportAutoFailoverNodeHealth(char *nodeHost, int nodePort,
							 ReplicationState goalState,
							 NodeHealthState health)
{
	Oid goalStateOid = ReplicationStateGetEnum(goalState);
	Oid replicationStateTypeOid = ReplicationStateTypeOid();

	Oid argTypes[] = {
		replicationStateTypeOid, /* goalstate */
		INT4OID,                 /* health */
		TEXTOID,                 /* nodehost */
		INT4OID                  /* nodeport */
	};

	Datum argValues[] = {
		ObjectIdGetDatum(goalStateOid), /* goalstate */
		Int32GetDatum(health),          /* health */
		CStringGetTextDatum(nodeHost),  /* nodehost */
		Int32GetDatum(nodePort)         /* nodeport */
	};
	const int argCount = sizeof(argValues) / sizeof(argValues[0]);

	const char *updateQuery =
		"UPDATE " AUTO_FAILOVER_NODE_TABLE
		" SET goalstate = $1, health = $2, "
		"healthchecktime = now(), statechangetime = now() "
		"WHERE nodehost = $3 AND nodeport = $4";

	SPI_connect();

	int spiStatus = SPI_execute_with_args(updateQuery,
										  argCount, argTypes, argValues,
										  NULL, false, 0);

	if (spiStatus != SPI_OK_UPDATE)
	{
		elog(ERROR, "could not update " AUTO_FAILOVER_NODE_TABLE);
	}

	SPI_finish();
}


/*
 * ReportAutoFailoverNodeReplicationSetting persists the replication properties of
 * a node.
 *
 * We use SPI to automatically handle triggers, function calls, etc.
*/
void
ReportAutoFailoverNodeReplicationSetting(int64 nodeid,
										 char *nodeHost, int nodePort,
										 int candidatePriority,
										 bool replicationQuorum)
{
	Oid argTypes[] = {
		INT4OID, /* candidate_priority */
		BOOLOID, /* replication_quorum */
		INT8OID, /* nodeid */
		TEXTOID, /* nodehost */
		INT4OID  /* nodeport */
	};

	Datum argValues[] = {
		Int32GetDatum(candidatePriority), /* candidate_priority */
		BoolGetDatum(replicationQuorum),  /* replication_quorum */
		Int64GetDatum(nodeid),            /* nodeid */
		CStringGetTextDatum(nodeHost),    /* nodehost */
		Int32GetDatum(nodePort)           /* nodeport */
	};
	const int argCount = sizeof(argValues) / sizeof(argValues[0]);

	/* match on nodeid AND host:port, so a stale nodeid does not update */
	const char *updateQuery =
		"UPDATE " AUTO_FAILOVER_NODE_TABLE
		" SET candidatepriority = $1, replicationquorum = $2 "
		" WHERE nodeid = $3 and nodehost = $4 AND nodeport = $5";

	SPI_connect();

	int spiStatus = SPI_execute_with_args(updateQuery,
										  argCount, argTypes, argValues,
										  NULL, false, 0);

	if (spiStatus != SPI_OK_UPDATE)
	{
		elog(ERROR, "could not update " AUTO_FAILOVER_NODE_TABLE);
	}

	SPI_finish();
}


/*
 * UpdateAutoFailoverNodeMetadata updates a node registration to a possibly new
 * nodeName, nodeHost, and nodePort. Those are NULL (or zero) when not changed.
 *
 * NOTE(review): the implementation passes nodeName/nodeHost straight to
 * CStringGetTextDatum and nodePort to Int32GetDatum — confirm callers never
 * actually pass NULL/zero here, or add the NULL handling this comment implies.
 *
 * We use SPI to automatically handle triggers, function calls, etc.
*/
void
UpdateAutoFailoverNodeMetadata(int64 nodeid,
							   char *nodeName, char *nodeHost, int nodePort)
{
	Oid argTypes[] = {
		INT8OID, /* nodeid */
		TEXTOID, /* nodename */
		TEXTOID, /* nodehost */
		INT4OID  /* nodeport */
	};

	Datum argValues[] = {
		Int64GetDatum(nodeid),         /* nodeid */
		CStringGetTextDatum(nodeName), /* nodename */
		CStringGetTextDatum(nodeHost), /* nodehost */
		Int32GetDatum(nodePort)        /* nodeport */
	};
	const int argCount = sizeof(argValues) / sizeof(argValues[0]);

	const char *updateQuery =
		"UPDATE " AUTO_FAILOVER_NODE_TABLE
		" SET nodename = $2, nodehost = $3, nodeport = $4 "
		"WHERE nodeid = $1";

	SPI_connect();

	int spiStatus = SPI_execute_with_args(updateQuery,
										  argCount, argTypes, argValues,
										  NULL, false, 0);

	if (spiStatus != SPI_OK_UPDATE)
	{
		elog(ERROR, "could not update " AUTO_FAILOVER_NODE_TABLE);
	}

	SPI_finish();
}


/*
 * RemoveAutoFailoverNode removes a node from a AutoFailover formation.
 *
 * We use SPI to automatically handle triggers, function calls, etc.
 */
void
RemoveAutoFailoverNode(AutoFailoverNode *pgAutoFailoverNode)
{
	Oid argTypes[] = {
		INT8OID /* nodeId */
	};

	Datum argValues[] = {
		Int64GetDatum(pgAutoFailoverNode->nodeId) /* nodeId */
	};
	const int argCount = sizeof(argValues) / sizeof(argValues[0]);

	const char *deleteQuery =
		"DELETE FROM " AUTO_FAILOVER_NODE_TABLE
		" WHERE nodeid = $1";

	SPI_connect();

	int spiStatus = SPI_execute_with_args(deleteQuery,
										  argCount, argTypes, argValues,
										  NULL, false, 0);

	if (spiStatus != SPI_OK_DELETE)
	{
		elog(ERROR, "could not delete from " AUTO_FAILOVER_NODE_TABLE);
	}

	SPI_finish();
}


/*
 * SyncStateFromString returns the enum value represented by given string.
*/ SyncState SyncStateFromString(const char *pgsrSyncState) { SyncState syncStateArray[] = { SYNC_STATE_UNKNOWN, SYNC_STATE_UNKNOWN, SYNC_STATE_SYNC, SYNC_STATE_ASYNC, SYNC_STATE_QUORUM, SYNC_STATE_POTENTIAL }; char *syncStateList[] = { "", "unknown", "sync", "async", "quorum", "potential", NULL }; for (int listIndex = 0; syncStateList[listIndex] != NULL; listIndex++) { char *candidate = syncStateList[listIndex]; if (strcmp(pgsrSyncState, candidate) == 0) { return syncStateArray[listIndex]; } } ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unknown pg_stat_replication.sync_state \"%s\"", pgsrSyncState))); /* never happens, make compiler happy */ return SYNC_STATE_UNKNOWN; } /* * SyncStateToString returns the string representation of a SyncState */ char * SyncStateToString(SyncState pgsrSyncState) { switch (pgsrSyncState) { case SYNC_STATE_UNKNOWN: { return "unknown"; } case SYNC_STATE_ASYNC: { return "async"; } case SYNC_STATE_SYNC: { return "sync"; } case SYNC_STATE_QUORUM: { return "quorum"; } case SYNC_STATE_POTENTIAL: { return "potential"; } default: ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unknown SyncState enum value %d", pgsrSyncState))); } /* keep compiler happy */ return ""; } /* * IsCurrentState returns true if the given node is known to have converged to * the given state and false otherwise. */ bool IsCurrentState(AutoFailoverNode *pgAutoFailoverNode, ReplicationState state) { return pgAutoFailoverNode != NULL && pgAutoFailoverNode->goalState == pgAutoFailoverNode->reportedState && pgAutoFailoverNode->goalState == state; } /* * CanTakeWritesInState returns whether a node can take writes when in * the given state. 
*/ bool CanTakeWritesInState(ReplicationState state) { return state == REPLICATION_STATE_SINGLE || state == REPLICATION_STATE_PRIMARY || state == REPLICATION_STATE_WAIT_PRIMARY || state == REPLICATION_STATE_JOIN_PRIMARY || state == REPLICATION_STATE_APPLY_SETTINGS; } /* * CanInitiateFailover returns whether a node is a primary that we can initiate * a (manual) failover from. We refuse to failover from a WAIT_PRIMARY node * because we're not sure if the secondary has done catching-up yet. */ bool CanInitiateFailover(ReplicationState state) { return state == REPLICATION_STATE_SINGLE || state == REPLICATION_STATE_PRIMARY || state == REPLICATION_STATE_JOIN_PRIMARY; } /* * StateBelongsToPrimary returns true when given state belongs to a primary * node, either in a healthy state or even when in the middle of being demoted. */ bool StateBelongsToPrimary(ReplicationState state) { return CanTakeWritesInState(state) || state == REPLICATION_STATE_DRAINING || state == REPLICATION_STATE_DEMOTE_TIMEOUT || state == REPLICATION_STATE_PREPARE_MAINTENANCE; } /* * IsBeingDemotedPrimary returns true when a given node is currently going * through a demotion. */ bool IsBeingDemotedPrimary(AutoFailoverNode *node) { return node != NULL && (StateBelongsToPrimary(node->reportedState) && (node->goalState == REPLICATION_STATE_DRAINING || node->goalState == REPLICATION_STATE_DEMOTE_TIMEOUT || node->goalState == REPLICATION_STATE_PREPARE_MAINTENANCE)); } /* * IsDemotedPrimary returns true when a node has completed a process of * demotion. */ bool IsDemotedPrimary(AutoFailoverNode *node) { return node != NULL && (node->goalState == REPLICATION_STATE_DEMOTED && (StateBelongsToPrimary(node->reportedState) || node->reportedState == REPLICATION_STATE_DEMOTED)); } /* * IsBeingPromoted returns whether a standby node is going through the process * of a promotion. 
* * We need to recognize a node going though the FSM even before it has reached * a stable state (where reportedState and goalState are the same). */ bool IsBeingPromoted(AutoFailoverNode *node) { return node != NULL && ((node->reportedState == REPLICATION_STATE_REPORT_LSN && (node->goalState == REPLICATION_STATE_FAST_FORWARD || node->goalState == REPLICATION_STATE_PREPARE_PROMOTION)) || (node->reportedState == REPLICATION_STATE_FAST_FORWARD && (node->goalState == REPLICATION_STATE_FAST_FORWARD || node->goalState == REPLICATION_STATE_PREPARE_PROMOTION)) || (node->reportedState == REPLICATION_STATE_PREPARE_PROMOTION && (node->goalState == REPLICATION_STATE_PREPARE_PROMOTION || node->goalState == REPLICATION_STATE_STOP_REPLICATION || node->goalState == REPLICATION_STATE_WAIT_PRIMARY)) || (node->reportedState == REPLICATION_STATE_STOP_REPLICATION && (node->goalState == REPLICATION_STATE_STOP_REPLICATION || node->goalState == REPLICATION_STATE_WAIT_PRIMARY))); } /* * CandidateNodeIsReadyToStreamWAL returns whether a newly selected candidate * node, possibly still being promoted, is ready for the other standby nodes is * REPORT_LSN to already use the new primary as an upstream node. * * We're okay with making progress when the selected candidate is on the * expected path of FAST_FORWARD to PREPARE_PROMOTION to STOP_REPLICATION to * WAIT_PRIMARY to PRIMARY. We want to allow matching intermediate states (when * reportedState and goalState are not the same), and we also want to prevent * matching other FSM paths. * * Finally, FAST_FORWARD is a little too soon, so we skip that. 
*/ bool CandidateNodeIsReadyToStreamWAL(AutoFailoverNode *node) { return node != NULL && ((node->reportedState == REPLICATION_STATE_PREPARE_PROMOTION && (node->goalState == REPLICATION_STATE_STOP_REPLICATION || node->goalState == REPLICATION_STATE_WAIT_PRIMARY)) || (node->reportedState == REPLICATION_STATE_STOP_REPLICATION && (node->goalState == REPLICATION_STATE_STOP_REPLICATION || node->goalState == REPLICATION_STATE_WAIT_PRIMARY)) || (node->reportedState == REPLICATION_STATE_WAIT_PRIMARY && (node->goalState == REPLICATION_STATE_WAIT_PRIMARY || node->goalState == REPLICATION_STATE_PRIMARY)) || (node->reportedState == REPLICATION_STATE_PRIMARY && node->goalState == REPLICATION_STATE_PRIMARY)); } /* * IsParticipatingInPromotion returns whether a node is currently participating * in a promotion, either as a candidate that IsBeingPromoted, or as a * "support" node that is reporting its LSN or re-joining as a secondary. */ bool IsParticipatingInPromotion(AutoFailoverNode *node) { return IsBeingPromoted(node) || node->reportedState == REPLICATION_STATE_REPORT_LSN || node->goalState == REPLICATION_STATE_REPORT_LSN || node->reportedState == REPLICATION_STATE_JOIN_SECONDARY || node->goalState == REPLICATION_STATE_JOIN_SECONDARY; } /* * IsInWaitOrJoinState returns true when the given node is a primary node that * is currently busy with registering a standby: it's then been assigned either * WAIT_STANDBY or JOIN_STANDBY replication state. */ bool IsInWaitOrJoinState(AutoFailoverNode *node) { return node != NULL && (node->reportedState == REPLICATION_STATE_WAIT_PRIMARY || node->goalState == REPLICATION_STATE_WAIT_PRIMARY || node->reportedState == REPLICATION_STATE_JOIN_PRIMARY || node->goalState == REPLICATION_STATE_JOIN_PRIMARY); } /* * IsInPrimaryState returns true if the given node is known to have converged * to a state that makes it the primary node in its group. 
*/ bool IsInPrimaryState(AutoFailoverNode *pgAutoFailoverNode) { return pgAutoFailoverNode != NULL && ((pgAutoFailoverNode->goalState == pgAutoFailoverNode->reportedState && CanTakeWritesInState(pgAutoFailoverNode->goalState)) || /* * We accept both apply_settings -> primary and primary -> * apply_settings as primary states. */ ((pgAutoFailoverNode->goalState == REPLICATION_STATE_APPLY_SETTINGS || pgAutoFailoverNode->goalState == REPLICATION_STATE_PRIMARY) && (pgAutoFailoverNode->reportedState == REPLICATION_STATE_PRIMARY || pgAutoFailoverNode->reportedState == REPLICATION_STATE_APPLY_SETTINGS))); } /* * IsInMaintenance returns true if the given node has been assigned a * maintenance state, whether it reached it yet or not. */ bool IsInMaintenance(AutoFailoverNode *node) { return node != NULL && (node->goalState == REPLICATION_STATE_PREPARE_MAINTENANCE || node->goalState == REPLICATION_STATE_WAIT_MAINTENANCE || node->goalState == REPLICATION_STATE_MAINTENANCE); } /* * IsStateIn returns true if state is equal to any of allowedStates */ bool IsStateIn(ReplicationState state, List *allowedStates) { ListCell *cell = NULL; foreach(cell, allowedStates) { ReplicationState allowedState = (ReplicationState) lfirst_int(cell); if (state == allowedState) { return true; } } return false; } /* * IsHealthy returns whether the given node is heathly, meaning it succeeds the * last health check and its PostgreSQL instance is reported as running by the * keeper. 
*/
bool
IsHealthy(AutoFailoverNode *pgAutoFailoverNode)
{
	TimestampTz now = GetCurrentTimestamp();
	int nodeActiveCallsFrequencyMs = 1 * 1000; /* keeper sleep time */

	if (pgAutoFailoverNode == NULL)
	{
		return false;
	}

	/*
	 * If the keeper has been reporting that Postgres is running after our last
	 * background check run, and within the node-active protocol client-time
	 * sleep time (1 second), then trust pg_autoctl node reporting: we might be
	 * out of a network split or node-local failure mode, and our background
	 * checks might not have run yet to clarify that "back to good" situation.
	 *
	 * In any case, the pg_autoctl node-active process could connect to the
	 * monitor, so there is no network split at this time.
	 *
	 * NOTE(review): TimestampDifferenceExceeds(reportTime, now, 1000) is true
	 * when the report is MORE than 1s old, which reads as the opposite of
	 * "within the sleep time" above — confirm the intended direction.
	 */
	if (pgAutoFailoverNode->health == NODE_HEALTH_BAD &&
		TimestampDifferenceExceeds(pgAutoFailoverNode->healthCheckTime,
								   pgAutoFailoverNode->reportTime,
								   0) &&
		TimestampDifferenceExceeds(pgAutoFailoverNode->reportTime,
								   now,
								   nodeActiveCallsFrequencyMs))
	{
		return pgAutoFailoverNode->pgIsRunning;
	}

	/* nominal case: trust background checks + reported Postgres state */
	return pgAutoFailoverNode->health == NODE_HEALTH_GOOD &&
		   pgAutoFailoverNode->pgIsRunning == true;
}


/*
 * IsUnhealthy returns whether the given node is unhealthy, meaning it failed
 * its last health check and has not reported for more than UnhealthyTimeoutMs,
 * or its PostgreSQL instance is reported as not running by the keeper.
*/
bool
IsUnhealthy(AutoFailoverNode *pgAutoFailoverNode)
{
	TimestampTz now = GetCurrentTimestamp();

	if (pgAutoFailoverNode == NULL)
	{
		return true;
	}

	/* if the keeper isn't reporting, trust our Health Checks */
	if (TimestampDifferenceExceeds(pgAutoFailoverNode->reportTime,
								   now,
								   UnhealthyTimeoutMs))
	{
		/* only trust a failed check that ran after Postgres was started */
		if (pgAutoFailoverNode->health == NODE_HEALTH_BAD &&
			TimestampDifferenceExceeds(PgStartTime,
									   pgAutoFailoverNode->healthCheckTime,
									   0))
		{
			/* grant a grace period after the monitor's Postgres start */
			if (TimestampDifferenceExceeds(PgStartTime,
										   now,
										   StartupGracePeriodMs))
			{
				return true;
			}
		}
	}

	/*
	 * If the keeper reports that PostgreSQL is not running, then the node
	 * isn't Healthy.
	 */
	if (!pgAutoFailoverNode->pgIsRunning)
	{
		return true;
	}

	/* clues show that everything is fine, the node is not unhealthy */
	return false;
}


/*
 * IsReporting returns whether the given node has reported recently, within the
 * UnhealthyTimeoutMs interval.
 */
bool
IsReporting(AutoFailoverNode *pgAutoFailoverNode)
{
	TimestampTz now = GetCurrentTimestamp();

	if (pgAutoFailoverNode == NULL)
	{
		return false;
	}

	if (TimestampDifferenceExceeds(pgAutoFailoverNode->reportTime,
								   now,
								   UnhealthyTimeoutMs))
	{
		return false;
	}

	return true;
}


/*
 * IsDrainTimeExpired returns whether the node should be done according
 * to the drain time-outs.
 */
bool
IsDrainTimeExpired(AutoFailoverNode *pgAutoFailoverNode)
{
	bool drainTimeExpired = false;

	if (pgAutoFailoverNode == NULL ||
		pgAutoFailoverNode->goalState != REPLICATION_STATE_DEMOTE_TIMEOUT)
	{
		return false;
	}

	TimestampTz now = GetCurrentTimestamp();

	if (TimestampDifferenceExceeds(pgAutoFailoverNode->stateChangeTime,
								   now,
								   DrainTimeoutMs))
	{
		drainTimeExpired = true;
	}

	return drainTimeExpired;
}
pg_auto_failover-1.6.3/src/monitor/node_metadata.h000066400000000000000000000213341414244367200222740ustar00rootroot00000000000000/*-------------------------------------------------------------------------
 *
 * src/monitor/node_metadata.h
 *
 * Declarations for public functions and types related to node metadata.
 *
 * Copyright (c) Microsoft Corporation. All rights reserved.
 * Licensed under the PostgreSQL License.
 *
 *-------------------------------------------------------------------------
 */

#pragma once

/* NOTE(review): include target lost in extraction — confirm original header */
#include
#include "access/xlogdefs.h"
#include "datatype/timestamp.h"

#include "health_check.h"
#include "replication_state.h"

#define AUTO_FAILOVER_NODE_TABLE_NAME "node"

/* column indexes for pgautofailover.node
 * indices must match with the columns given
 * in the following definition.
 *
 * NOTE(review): 21 Anum_* columns are defined below while Natts is 19 —
 * confirm which value is correct and where Natts is used.
 */
#define Natts_pgautofailover_node 19
#define Anum_pgautofailover_node_formationid 1
#define Anum_pgautofailover_node_nodeid 2
#define Anum_pgautofailover_node_groupid 3
#define Anum_pgautofailover_node_nodename 4
#define Anum_pgautofailover_node_nodehost 5
#define Anum_pgautofailover_node_nodeport 6
#define Anum_pgautofailover_node_sysidentifier 7
#define Anum_pgautofailover_node_goalstate 8
#define Anum_pgautofailover_node_reportedstate 9
#define Anum_pgautofailover_node_reportedpgisrunning 10
#define Anum_pgautofailover_node_reportedrepstate 11
#define Anum_pgautofailover_node_reporttime 12
#define Anum_pgautofailover_node_reportedTLI 13
#define Anum_pgautofailover_node_reportedLSN 14
#define Anum_pgautofailover_node_walreporttime 15
#define Anum_pgautofailover_node_health 16
#define Anum_pgautofailover_node_healthchecktime 17
#define Anum_pgautofailover_node_statechangetime 18
#define Anum_pgautofailover_node_candidate_priority 19
#define Anum_pgautofailover_node_replication_quorum 20
#define Anum_pgautofailover_node_nodecluster 21

/* column list, in Anum_* order, for SELECT queries against the node table */
#define AUTO_FAILOVER_NODE_TABLE_ALL_COLUMNS \
	"formationid, " \
	"nodeid, " \
	"groupid, " \
	"nodename, " \
	"nodehost, " \
	"nodeport, " \
	"sysidentifier, " \
	"goalstate, " \
	"reportedstate, " \
	"reportedpgisrunning, " \
	"reportedrepstate, " \
	"reporttime, " \
	"reportedtli, " \
	"reportedlsn, " \
	"walreporttime, " \
	"health, " \
	"healthchecktime, " \
	"statechangetime, " \
	"candidatepriority, " \
	"replicationquorum, " \
	"nodecluster"

#define SELECT_ALL_FROM_AUTO_FAILOVER_NODE_TABLE \
	"SELECT " AUTO_FAILOVER_NODE_TABLE_ALL_COLUMNS " FROM " AUTO_FAILOVER_NODE_TABLE

/* pg_stat_replication.sync_state: "sync", "async", "quorum", "potential" */
typedef enum SyncState
{
	SYNC_STATE_UNKNOWN = 0,
	SYNC_STATE_SYNC,
	SYNC_STATE_ASYNC,
	SYNC_STATE_QUORUM,
	SYNC_STATE_POTENTIAL
} SyncState;

/*
 * We restrict candidatePriority values in the range 0..100 to the users.
 * Internally, we increment the candidatePriority (+= 100) when the
 * perform_promotion API is used, in order to tweak the selection of the
 * candidate.
 */
#define MAX_USER_DEFINED_CANDIDATE_PRIORITY 100
#define CANDIDATE_PRIORITY_INCREMENT (MAX_USER_DEFINED_CANDIDATE_PRIORITY + 1)

/*
 * Use the same output format each time we are notifying and logging about an
 * AutoFailoverNode, for consistency. Well, apart when registering, where we
 * don't have the node id and/or the node name yet.
 */
#define NODE_FORMAT "node %lld \"%s\" (%s:%d)"
#define NODE_FORMAT_ARGS(node) \
	(long long) node->nodeId, node->nodeName, node->nodeHost, node->nodePort

/*
 * AutoFailoverNode represents a Postgres node that is being tracked by the
 * pg_auto_failover monitor.
 */
typedef struct AutoFailoverNode
{
	char *formationId;
	int64 nodeId;
	int groupId;
	char *nodeName;
	char *nodeHost;
	int nodePort;
	uint64 sysIdentifier;
	ReplicationState goalState;     /* state assigned by the monitor */
	ReplicationState reportedState; /* state last reported by the keeper */
	TimestampTz reportTime;
	bool pgIsRunning;
	SyncState pgsrSyncState;
	TimestampTz walReportTime;
	NodeHealthState health;
	TimestampTz healthCheckTime;
	TimestampTz stateChangeTime;
	int reportedTLI;
	XLogRecPtr reportedLSN;
	int candidatePriority;
	bool replicationQuorum;
	char *nodeCluster;
} AutoFailoverNode;

/*
 * Formation.kind: "pgsql" or "citus"
 *
 * We define the formation kind here to avoid cyclic dependency between the
 * formation_metadata.h and node_metadata.h headers.
 */
typedef enum FormationKind
{
	FORMATION_KIND_UNKNOWN = 0,
	FORMATION_KIND_PGSQL,
	FORMATION_KIND_CITUS
} FormationKind;

/* public function declarations */
extern List * AllAutoFailoverNodes(char *formationId);
extern List * AutoFailoverNodeGroup(char *formationId, int groupId);
extern List * AutoFailoverAllNodesInGroup(char *formationId, int groupId);
extern List * AutoFailoverOtherNodesList(AutoFailoverNode *pgAutoFailoverNode);
extern List * AutoFailoverOtherNodesListInState(AutoFailoverNode *pgAutoFailoverNode,
												ReplicationState currentState);
extern List * AutoFailoverCandidateNodesListInState(AutoFailoverNode *pgAutoFailoverNode,
													ReplicationState currentState);
extern AutoFailoverNode * GetPrimaryNodeInGroup(char *formationId, int32 groupId);
AutoFailoverNode * GetNodeToFailoverFromInGroup(char *formationId, int32 groupId);
extern AutoFailoverNode * GetPrimaryOrDemotedNodeInGroup(char *formationId,
														 int32 groupId);
extern AutoFailoverNode * FindFailoverNewStandbyNode(List *groupNodeList);
extern List * GroupListCandidates(List *groupNodeList);
extern List * ListMostAdvancedStandbyNodes(List *groupNodeList);
extern List * GroupListSyncStandbys(List *groupNodeList);
extern bool AllNodesHaveSameCandidatePriority(List *groupNodeList);
extern int CountSyncStandbys(List *groupNodeList);
extern bool IsHealthySyncStandby(AutoFailoverNode *node);
extern int CountHealthySyncStandbys(List *groupNodeList);
extern int CountHealthyCandidates(List *groupNodeList);
extern bool IsFailoverInProgress(List *groupNodeList);
extern AutoFailoverNode * FindMostAdvancedStandby(List *groupNodeList);
extern AutoFailoverNode * FindCandidateNodeBeingPromoted(List *groupNodeList);

extern AutoFailoverNode * GetAutoFailoverNode(char *nodeHost, int nodePort);
extern AutoFailoverNode * GetAutoFailoverNodeById(int64 nodeId);
extern AutoFailoverNode * GetAutoFailoverNodeByName(char *formationId,
													char *nodeName);
extern AutoFailoverNode * OtherNodeInGroup(AutoFailoverNode *pgAutoFailoverNode);
extern AutoFailoverNode * GetWritableNodeInGroup(char *formationId, int32 groupId);
extern AutoFailoverNode * TupleToAutoFailoverNode(TupleDesc tupleDescriptor,
												  HeapTuple heapTuple);
extern int AddAutoFailoverNode(char *formationId,
							   FormationKind formationKind,
							   int64 nodeId,
							   int groupId,
							   char *nodeName,
							   char *nodeHost,
							   int nodePort,
							   uint64 sysIdentifier,
							   ReplicationState goalState,
							   ReplicationState reportedState,
							   int candidatePriority,
							   bool replicationQuorum,
							   char *nodeCluster);
extern void SetNodeGoalState(AutoFailoverNode *pgAutoFailoverNode,
							 ReplicationState goalState, const char *message);
extern void ReportAutoFailoverNodeState(char *nodeHost, int nodePort,
										ReplicationState reportedState,
										bool pgIsRunning,
										SyncState pgSyncState,
										int reportedTLI,
										XLogRecPtr reportedLSN);
extern void ReportAutoFailoverNodeHealth(char *nodeHost, int nodePort,
										 ReplicationState goalState,
										 NodeHealthState health);
extern void ReportAutoFailoverNodeReplicationSetting(int64 nodeid,
													 char *nodeHost,
													 int nodePort,
													 int candidatePriority,
													 bool replicationQuorum);
extern void UpdateAutoFailoverNodeMetadata(int64 nodeid,
										   char *nodeName,
										   char *nodeHost,
										   int nodePort);
extern void RemoveAutoFailoverNode(AutoFailoverNode *pgAutoFailoverNode);
extern SyncState SyncStateFromString(const char *pgsrSyncState);
extern char * SyncStateToString(SyncState pgsrSyncState);
extern bool IsCurrentState(AutoFailoverNode *pgAutoFailoverNode,
						   ReplicationState state);
extern bool CanTakeWritesInState(ReplicationState state);
extern bool CanInitiateFailover(ReplicationState state);
extern bool StateBelongsToPrimary(ReplicationState state);
extern bool IsBeingPromoted(AutoFailoverNode *node);
extern bool IsBeingDemotedPrimary(AutoFailoverNode *node);
extern bool IsDemotedPrimary(AutoFailoverNode *node);
extern bool CandidateNodeIsReadyToStreamWAL(AutoFailoverNode *node);
extern bool IsParticipatingInPromotion(AutoFailoverNode *node);
extern bool IsInWaitOrJoinState(AutoFailoverNode *node);
extern bool
IsInPrimaryState(AutoFailoverNode *pgAutoFailoverNode); extern bool IsInMaintenance(AutoFailoverNode *node); extern bool IsStateIn(ReplicationState state, List *allowedStates); extern bool IsHealthy(AutoFailoverNode *pgAutoFailoverNode); extern bool IsUnhealthy(AutoFailoverNode *pgAutoFailoverNode); extern bool IsDrainTimeExpired(AutoFailoverNode *pgAutoFailoverNode); extern bool IsReporting(AutoFailoverNode *pgAutoFailoverNode); pg_auto_failover-1.6.3/src/monitor/notifications.c000066400000000000000000000130071414244367200223510ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/notifications.c * * Implementation of the functions used to send messages to the * pg_auto_failover monitor clients. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * *------------------------------------------------------------------------- */ #include #include "postgres.h" #include "metadata.h" #include "node_metadata.h" #include "notifications.h" #include "replication_state.h" #include "catalog/pg_type.h" #include "commands/async.h" #include "executor/spi.h" #include "utils/builtins.h" #include "utils/fmgroids.h" #include "utils/json.h" #include "utils/pg_lsn.h" /* * LogAndNotifyMessage emits the given message both as a log entry and also as * a notification on the CHANNEL_LOG channel. */ void LogAndNotifyMessage(char *message, size_t size, const char *fmt, ...) { va_list args; va_start(args, fmt); /* * Explanation of IGNORE-BANNED * Arguments are always non-null and we * do not write before the allocated buffer. 
* */ int n = vsnprintf(message, size - 2, fmt, args); /* IGNORE-BANNED */ va_end(args); if (n < 0) { ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } ereport(LOG, (errmsg("%s", message))); Async_Notify(CHANNEL_LOG, message); } /* * NotifyStateChange emits a notification message on the CHANNEL_STATE channel * about a state change decided by the monitor. This state change is encoded so * as to be easy to parse by a machine. */ int64 NotifyStateChange(AutoFailoverNode *node, char *description) { StringInfo payload = makeStringInfo(); /* * Insert the event in our events table. */ int64 eventid = InsertEvent(node, description); /* build a json object from the notification pieces */ appendStringInfoChar(payload, '{'); appendStringInfo(payload, "\"type\": \"state\""); appendStringInfo(payload, ", \"formation\": "); escape_json(payload, node->formationId); appendStringInfo(payload, ", \"groupId\": %d", node->groupId); appendStringInfo(payload, ", \"nodeId\": %lld", (long long) node->nodeId); appendStringInfo(payload, ", \"name\": "); escape_json(payload, node->nodeName); appendStringInfo(payload, ", \"host\": "); escape_json(payload, node->nodeHost); appendStringInfo(payload, ", \"port\": %d", node->nodePort); appendStringInfo(payload, ", \"reportedState\": "); escape_json(payload, ReplicationStateGetName(node->reportedState)); appendStringInfo(payload, ", \"goalState\": "); escape_json(payload, ReplicationStateGetName(node->goalState)); appendStringInfo(payload, ", \"health\":"); escape_json(payload, NodeHealthToString(node->health)); appendStringInfoChar(payload, '}'); Async_Notify(CHANNEL_STATE, payload->data); pfree(payload->data); pfree(payload); return eventid; } /* * InsertEvent populates the monitor's pgautofailover.event table with a new * entry, and returns the id of the new event. 
*/ int64 InsertEvent(AutoFailoverNode *node, char *description) { Oid goalStateOid = ReplicationStateGetEnum(node->goalState); Oid reportedStateOid = ReplicationStateGetEnum(node->reportedState); Oid replicationStateTypeOid = ReplicationStateTypeOid(); Oid argTypes[] = { TEXTOID, /* formationid */ INT8OID, /* nodeid */ INT4OID, /* groupid */ TEXTOID, /* nodename */ TEXTOID, /* nodehost */ INT4OID, /* nodeport */ replicationStateTypeOid, /* reportedstate */ replicationStateTypeOid, /* goalstate */ TEXTOID, /* pg_stat_replication.sync_state */ INT4OID, /* timeline_id */ LSNOID, /* reportedLSN */ INT4OID, /* candidate_priority */ BOOLOID, /* replication_quorum */ TEXTOID /* description */ }; Datum argValues[] = { CStringGetTextDatum(node->formationId), /* formationid */ Int64GetDatum(node->nodeId), /* nodeid */ Int32GetDatum(node->groupId), /* groupid */ CStringGetTextDatum(node->nodeName), /* nodename */ CStringGetTextDatum(node->nodeHost), /* nodehost */ Int32GetDatum(node->nodePort), /* nodeport */ ObjectIdGetDatum(reportedStateOid), /* reportedstate */ ObjectIdGetDatum(goalStateOid), /* goalstate */ CStringGetTextDatum(SyncStateToString(node->pgsrSyncState)), /* sync_state */ Int32GetDatum(node->reportedTLI), /* reportedTLI */ LSNGetDatum(node->reportedLSN), /* reportedLSN */ Int32GetDatum(node->candidatePriority), /* candidate_priority */ BoolGetDatum(node->replicationQuorum), /* replication_quorum */ CStringGetTextDatum(description) /* description */ }; const int argCount = sizeof(argValues) / sizeof(argValues[0]); int64 eventId = 0; const char *insertQuery = "INSERT INTO " AUTO_FAILOVER_EVENT_TABLE "(formationid, nodeid, groupid, nodename, nodehost, nodeport," " reportedstate, goalstate, reportedrepstate, reportedtli, reportedlsn," " candidatepriority, replicationquorum, description) " "VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14) " "RETURNING eventid"; SPI_connect(); int spiStatus = SPI_execute_with_args(insertQuery, argCount, argTypes, 
argValues, NULL, false, 0); if (spiStatus == SPI_OK_INSERT_RETURNING && SPI_processed > 0) { bool isNull = false; Datum eventIdDatum = SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isNull); eventId = DatumGetInt64(eventIdDatum); } else { elog(ERROR, "could not insert into " AUTO_FAILOVER_EVENT_TABLE); } SPI_finish(); return eventId; } pg_auto_failover-1.6.3/src/monitor/notifications.h000066400000000000000000000023701414244367200223570ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/notifications.h * * Declarations for public functions and types related to monitor * notifications. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * *------------------------------------------------------------------------- */ #pragma once #include "postgres.h" #include "c.h" #include "node_metadata.h" #include "replication_state.h" /* * pg_auto_failover notifies on different channels about every event it * produces: * * - the "state" channel is used when a node's state is assigned to something * new * * - the "log" channel is used to duplicate message that are sent to the * PostgreSQL logs, in order for a pg_auto_failover monitor client to subscribe to * the chatter without having to actually have the privileges to tail the * PostgreSQL server logs. */ #define CHANNEL_STATE "state" #define CHANNEL_LOG "log" #define BUFSIZE 8192 void LogAndNotifyMessage(char *message, size_t size, const char *fmt, ...) 
__attribute__( (format(printf, 3, 4))); int64 NotifyStateChange(AutoFailoverNode *node, char *description); int64 InsertEvent(AutoFailoverNode *node, char *description); pg_auto_failover-1.6.3/src/monitor/pg_auto_failover.c000066400000000000000000000166651414244367200230420ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/pg_auto_failover.c * * Implementation of the pg_auto_failover extension. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * *------------------------------------------------------------------------- */ #include "postgres.h" /* these are internal headers */ #include "health_check.h" #include "group_state_machine.h" #include "metadata.h" #include "version_compat.h" /* these are always necessary for a bgworker */ #include "miscadmin.h" #include "postmaster/bgworker.h" #include "storage/ipc.h" #include "storage/latch.h" #include "storage/lwlock.h" #include "storage/proc.h" #include "storage/shmem.h" /* these headers are used by this particular worker's code */ #include "commands/dbcommands.h" #include "postmaster/postmaster.h" #include "utils/builtins.h" #include "utils/memutils.h" #include "tcop/utility.h" ProcessUtility_hook_type PreviousProcessUtility_hook = NULL; void _PG_init(void); static void StartMonitorNode(void); #if (PG_VERSION_NUM < 140000) static void pgautofailover_ProcessUtility(PlannedStmt *pstmt, const char *queryString, ProcessUtilityContext context, ParamListInfo params, struct QueryEnvironment *queryEnv, DestReceiver *dest, QueryCompletion *completionTag); #else static void pgautofailover_ProcessUtility(PlannedStmt *pstmt, const char *queryString, bool readOnlyTree, ProcessUtilityContext context, ParamListInfo params, struct QueryEnvironment *queryEnv, DestReceiver *dest, QueryCompletion *completionTag); #endif PG_MODULE_MAGIC; /* * Entrypoint of this module. 
*/ void _PG_init(void) { if (!process_shared_preload_libraries_in_progress) { ereport(ERROR, (errmsg("pgautofailover can only be loaded via shared_preload_libraries"), errhint("Add pgautofailover to shared_preload_libraries " "configuration variable in postgresql.conf."))); } StartMonitorNode(); } /* * StartMonitor register GUCs for monitor mode and starts the * health check worker. */ static void StartMonitorNode(void) { BackgroundWorker worker; DefineCustomBoolVariable("pgautofailover.enable_version_checks", "Enable extension version compatiblity checks", NULL, &EnableVersionChecks, true, PGC_SIGHUP, GUC_NO_SHOW_ALL, NULL, NULL, NULL); DefineCustomBoolVariable("pgautofailover.enable_health_checks", "Enable background health checks", NULL, &HealthChecksEnabled, true, PGC_SIGHUP, GUC_NO_SHOW_ALL, NULL, NULL, NULL); DefineCustomIntVariable("pgautofailover.health_check_period", "Duration between each check (in milliseconds).", NULL, &HealthCheckPeriod, 5 * 1000, 1, INT_MAX, PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); DefineCustomIntVariable("pgautofailover.health_check_timeout", "Connect timeout (in milliseconds).", NULL, &HealthCheckTimeout, 5 * 1000, 1, INT_MAX, PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); DefineCustomIntVariable("pgautofailover.health_check_max_retries", "Maximum number of re-tries before marking a node as failed.", NULL, &HealthCheckMaxRetries, 2, 1, 100, PGC_SIGHUP, 0, NULL, NULL, NULL); DefineCustomIntVariable("pgautofailover.health_check_retry_delay", "Delay between consecutive retries.", NULL, &HealthCheckRetryDelay, 2 * 1000, 1, INT_MAX, PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); DefineCustomIntVariable("pgautofailover.enable_sync_wal_log_threshold", "Don't enable synchronous replication until secondary xlog" " is within this many bytes of the primary's", NULL, &EnableSyncXlogThreshold, DEFAULT_XLOG_SEG_SIZE, 1, INT_MAX, PGC_SIGHUP, 0, NULL, NULL, NULL); DefineCustomIntVariable("pgautofailover.promote_wal_log_threshold", "Don't promote 
secondary unless xlog is with this many bytes" " of the master", NULL, &PromoteXlogThreshold, DEFAULT_XLOG_SEG_SIZE, 1, INT_MAX, PGC_SIGHUP, 0, NULL, NULL, NULL); DefineCustomIntVariable("pgautofailover.primary_demote_timeout", "Give the primary this long to drain before promoting the secondary", NULL, &DrainTimeoutMs, 30 * 1000, 1, INT_MAX, PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); DefineCustomIntVariable("pgautofailover.node_considered_unhealthy_timeout", "Mark node unhealthy if last ping was over this long ago", NULL, &UnhealthyTimeoutMs, 20 * 1000, 1, INT_MAX, PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); DefineCustomIntVariable("pgautofailover.startup_grace_period", "Wait for at least this much time after startup before " "initiating a failover.", NULL, &StartupGracePeriodMs, 10 * 1000, 1, INT_MAX, PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); PreviousProcessUtility_hook = ProcessUtility_hook; ProcessUtility_hook = pgautofailover_ProcessUtility; InitializeHealthCheckWorker(); worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; worker.bgw_start_time = BgWorkerStart_RecoveryFinished; worker.bgw_restart_time = 1; worker.bgw_main_arg = Int32GetDatum(0); worker.bgw_notify_pid = 0; strlcpy(worker.bgw_library_name, "pgautofailover", sizeof(worker.bgw_library_name)); strlcpy(worker.bgw_name, "pg_auto_failover monitor", sizeof(worker.bgw_name)); strlcpy(worker.bgw_function_name, "HealthCheckWorkerLauncherMain", sizeof(worker.bgw_function_name)); RegisterBackgroundWorker(&worker); } /* * pgautofailover_ProcessUtility is a PostgreSQL utility hook that allows terminating * background workers attached to a database when a DROP DATABASE command is * executed. As long as the background worker is connected, the DROP DATABASE * command would otherwise fail to complete. 
*/ #if (PG_VERSION_NUM < 140000) void pgautofailover_ProcessUtility(PlannedStmt *pstmt, const char *queryString, ProcessUtilityContext context, ParamListInfo params, struct QueryEnvironment *queryEnv, DestReceiver *dest, QueryCompletion *completionTag) #else void pgautofailover_ProcessUtility(PlannedStmt * pstmt, const char * queryString, bool readOnlyTree, ProcessUtilityContext context, ParamListInfo params, struct QueryEnvironment *queryEnv, DestReceiver * dest, QueryCompletion * completionTag) #endif { Node *parsetree = pstmt->utilityStmt; /* * Make sure that on DROP DATABASE we terminate the background deamon * associated with it. */ if (IsA(parsetree, DropdbStmt)) { DropdbStmt *dropDbStatement = (DropdbStmt *) parsetree; char *dbname = dropDbStatement->dbname; Oid databaseOid = get_database_oid(dbname, true); if (databaseOid != InvalidOid) { StopHealthCheckWorker(databaseOid); } } if (PreviousProcessUtility_hook) { #if (PG_VERSION_NUM < 140000) PreviousProcessUtility_hook(pstmt, queryString, context, params, queryEnv, dest, completionTag); #else PreviousProcessUtility_hook(pstmt, queryString, readOnlyTree, context, params, queryEnv, dest, completionTag); #endif } else { #if (PG_VERSION_NUM < 140000) standard_ProcessUtility(pstmt, queryString, context, params, queryEnv, dest, completionTag); #else standard_ProcessUtility(pstmt, queryString, readOnlyTree, context, params, queryEnv, dest, completionTag); #endif } } pg_auto_failover-1.6.3/src/monitor/pgautofailover--1.0--1.1.sql000066400000000000000000000151241414244367200240310ustar00rootroot00000000000000-- -- extension update file from 1.0 to 1.1 -- -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "ALTER EXTENSION pgautofailover UPDATE TO 1.1" to load this file. 
\quit ALTER TABLE pgautofailover.node RENAME TO node_upgrade_old; CREATE TABLE pgautofailover.node ( formationid text not null default 'default', nodeid bigint not null DEFAULT nextval('pgautofailover.node_nodeid_seq'::regclass), groupid int not null, nodename text not null, nodeport integer not null, goalstate pgautofailover.replication_state not null default 'init', reportedstate pgautofailover.replication_state not null, reportedpgisrunning bool default true, reportedrepstate text default 'async', reporttime timestamptz not null default now(), reportedlsn pg_lsn not null default '0/0', walreporttime timestamptz not null default now(), health integer not null default -1, healthchecktime timestamptz not null default now(), statechangetime timestamptz not null default now(), UNIQUE (nodename, nodeport), PRIMARY KEY (nodeid), FOREIGN KEY (formationid) REFERENCES pgautofailover.formation(formationid) ) -- we expect few rows and lots of UPDATE, let's benefit from HOT WITH (fillfactor = 25); ALTER SEQUENCE pgautofailover.node_nodeid_seq OWNED BY pgautofailover.node.nodeid; INSERT INTO pgautofailover.node (formationid, nodeid, groupid, nodename, nodeport, goalstate, reportedstate, reportedpgisrunning, reportedrepstate, reporttime, walreporttime, health, healthchecktime, statechangetime) SELECT formationid, nodeid, groupid, nodename, nodeport, goalstate, reportedstate, reportedpgisrunning, reportedrepstate, reporttime, walreporttime, health, healthchecktime, statechangetime FROM pgautofailover.node_upgrade_old; ALTER TABLE pgautofailover.event RENAME TO event_upgrade_old; ALTER TABLE pgautofailover.event_upgrade_old ALTER COLUMN nodeid DROP NOT NULL, ALTER COLUMN nodeid SET DEFAULT NULL; DROP SEQUENCE pgautofailover.event_nodeid_seq; CREATE TABLE pgautofailover.event ( eventid bigint not null DEFAULT nextval('pgautofailover.event_eventid_seq'::regclass), eventtime timestamptz not null default now(), formationid text not null, nodeid bigint not null, groupid int not null, 
nodename text not null, nodeport integer not null, reportedstate pgautofailover.replication_state not null, goalstate pgautofailover.replication_state not null, reportedrepstate text, reportedlsn pg_lsn not null default '0/0', description text, PRIMARY KEY (eventid) ); ALTER SEQUENCE pgautofailover.event_eventid_seq OWNED BY pgautofailover.event.eventid; INSERT INTO pgautofailover.event (eventid, eventtime, formationid, nodeid, groupid, nodename, nodeport, reportedstate, goalstate, reportedrepstate, description) SELECT eventid, eventtime, formationid, nodeid, groupid, nodename, nodeport, reportedstate, goalstate, reportedrepstate, description FROM pgautofailover.event_upgrade_old; GRANT SELECT ON ALL TABLES IN SCHEMA pgautofailover TO autoctl_node; DROP FUNCTION pgautofailover.node_active(text,text,int,int,int, pgautofailover.replication_state,bool,bigint,text); CREATE FUNCTION pgautofailover.node_active ( IN formation_id text, IN node_name text, IN node_port int, IN current_node_id int default -1, IN current_group_id int default -1, IN current_group_role pgautofailover.replication_state default 'init', IN current_pg_is_running bool default true, IN current_lsn pg_lsn default '0/0', IN current_rep_state text default '', OUT assigned_node_id int, OUT assigned_group_id int, OUT assigned_group_state pgautofailover.replication_state ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$node_active$$; grant execute on function pgautofailover.node_active(text,text,int,int,int, pgautofailover.replication_state,bool,pg_lsn,text) to autoctl_node; grant execute on function pgautofailover.remove_node(text,int) to autoctl_node; ALTER FUNCTION pgautofailover.perform_failover(text,int) SECURITY DEFINER; grant execute on function pgautofailover.perform_failover(text,int) to autoctl_node; grant execute on function pgautofailover.start_maintenance(text,int) to autoctl_node; grant execute on function pgautofailover.stop_maintenance(text,int) to autoctl_node; 
DROP FUNCTION pgautofailover.last_events(integer); CREATE OR REPLACE FUNCTION pgautofailover.last_events ( count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodeport, reportedstate, goalstate, reportedrepstate, reportedlsn, description from pgautofailover.event order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; DROP FUNCTION pgautofailover.last_events(text, integer); CREATE OR REPLACE FUNCTION pgautofailover.last_events ( formation_id text default 'default', count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodeport, reportedstate, goalstate, reportedrepstate, reportedlsn, description from pgautofailover.event where formationid = formation_id order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; DROP FUNCTION pgautofailover.last_events(text, integer, integer); CREATE OR REPLACE FUNCTION pgautofailover.last_events ( formation_id text, group_id int, count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodeport, reportedstate, goalstate, reportedrepstate, reportedlsn, description from pgautofailover.event where formationid = formation_id and groupid = group_id order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; DROP TABLE pgautofailover.node_upgrade_old; DROP TABLE pgautofailover.event_upgrade_old; pg_auto_failover-1.6.3/src/monitor/pgautofailover--1.0.sql000066400000000000000000000334031414244367200234570ustar00rootroot00000000000000-- Copyright (c) Microsoft Corporation. All rights reserved. -- Licensed under the PostgreSQL License. 
-- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pgautofailover" to load this file. \quit DO $body$ BEGIN if not exists (select * from pg_catalog.pg_user where usename = 'autoctl_node') then create role autoctl_node with login; end if; END $body$; CREATE SCHEMA pgautofailover; GRANT USAGE ON SCHEMA pgautofailover TO autoctl_node; CREATE TYPE pgautofailover.replication_state AS ENUM ( 'unknown', 'init', 'single', 'wait_primary', 'primary', 'draining', 'demote_timeout', 'demoted', 'catchingup', 'secondary', 'prepare_promotion', 'stop_replication', 'wait_standby', 'maintenance' ); CREATE TABLE pgautofailover.formation ( formationid text NOT NULL DEFAULT 'default', kind text NOT NULL DEFAULT 'pgsql', dbname name NOT NULL DEFAULT 'postgres', opt_secondary bool NOT NULL DEFAULT true, PRIMARY KEY (formationid) ); insert into pgautofailover.formation (formationid) values ('default'); CREATE FUNCTION pgautofailover.create_formation ( IN formation_id text, IN kind text, IN dbname name, IN opt_secondary bool, OUT formation_id text, OUT kind text, OUT dbname name, OUT opt_secondary bool ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$create_formation$$; grant execute on function pgautofailover.create_formation(text,text,name,bool) to autoctl_node; CREATE FUNCTION pgautofailover.drop_formation ( IN formation_id text ) RETURNS void LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$drop_formation$$; grant execute on function pgautofailover.drop_formation(text) to autoctl_node; CREATE TABLE pgautofailover.node ( formationid text not null default 'default', nodeid bigserial, groupid int not null, nodename text not null, nodeport integer not null, goalstate pgautofailover.replication_state not null default 'init', reportedstate pgautofailover.replication_state not null, reportedpgisrunning bool default true, reportedrepstate text default 'async', reporttime timestamptz not null default now(), 
waldelta bigint not null default -1, walreporttime timestamptz not null default now(), health integer not null default -1, healthchecktime timestamptz not null default now(), statechangetime timestamptz not null default now(), UNIQUE (nodename, nodeport), PRIMARY KEY (nodeid), FOREIGN KEY (formationid) REFERENCES pgautofailover.formation(formationid) ) -- we expect few rows and lots of UPDATE, let's benefit from HOT WITH (fillfactor = 25); CREATE TABLE pgautofailover.event ( eventid bigserial not null, eventtime timestamptz not null default now(), formationid text not null, nodeid bigserial, groupid int not null, nodename text not null, nodeport integer not null, reportedstate pgautofailover.replication_state not null, goalstate pgautofailover.replication_state not null, reportedrepstate text, waldelta bigint not null default -1, description text, PRIMARY KEY (eventid) ); GRANT SELECT ON ALL TABLES IN SCHEMA pgautofailover TO autoctl_node; CREATE FUNCTION pgautofailover.register_node ( IN formation_id text, IN node_name text, IN node_port int, IN dbname name, IN desired_group_id int default -1, IN initial_group_role pgautofailover.replication_state default 'init', IN node_kind text default 'standalone', OUT assigned_node_id int, OUT assigned_group_id int, OUT assigned_group_state pgautofailover.replication_state ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$register_node$$; grant execute on function pgautofailover.register_node(text,text,int,name,int,pgautofailover.replication_state,text) to autoctl_node; CREATE FUNCTION pgautofailover.node_active ( IN formation_id text, IN node_name text, IN node_port int, IN current_node_id int default -1, IN current_group_id int default -1, IN current_group_role pgautofailover.replication_state default 'init', IN current_pg_is_running bool default true, IN current_wal_delta bigint default -1, IN current_rep_state text default '', OUT assigned_node_id int, OUT assigned_group_id int, OUT 
assigned_group_state pgautofailover.replication_state ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$node_active$$; grant execute on function pgautofailover.node_active(text,text,int,int,int, pgautofailover.replication_state,bool,bigint,text) to autoctl_node; CREATE FUNCTION pgautofailover.get_primary ( IN formation_id text default 'default', IN group_id int default 0, OUT primary_name text, OUT primary_port int ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$get_primary$$; comment on function pgautofailover.get_primary(text,int) is 'get the writable node for a group'; grant execute on function pgautofailover.get_primary(text,int) to autoctl_node; CREATE FUNCTION pgautofailover.get_other_node ( IN node_name text, IN node_port int, OUT secondary_name text, OUT secondary_port int ) RETURNS record LANGUAGE C STRICT AS 'MODULE_PATHNAME', $$get_other_node$$; comment on function pgautofailover.get_other_node(text,int) is 'get the other node in a group'; grant execute on function pgautofailover.get_other_node(text,int) to autoctl_node; CREATE FUNCTION pgautofailover.get_coordinator ( IN formation_id text default 'default', OUT node_name text, OUT node_port int ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ select nodename, nodeport from pgautofailover.node join pgautofailover.formation using(formationid) where formationid = formation_id and groupid = 0 and goalstate in ('single', 'wait_primary', 'primary') and reportedstate in ('single', 'wait_primary', 'primary'); $$; grant execute on function pgautofailover.get_coordinator(text) to autoctl_node; CREATE FUNCTION pgautofailover.remove_node ( node_name text, node_port int default 5432 ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$remove_node$$; comment on function pgautofailover.remove_node(text,int) is 'remove a node from the monitor'; CREATE FUNCTION pgautofailover.perform_failover ( formation_id text default 'default', group_id int 
default 0 ) RETURNS void LANGUAGE C STRICT AS 'MODULE_PATHNAME', $$perform_failover$$; comment on function pgautofailover.perform_failover(text,int) is 'manually failover from the primary to the secondary'; CREATE FUNCTION pgautofailover.start_maintenance ( node_name text, node_port int default 5432 ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$start_maintenance$$; comment on function pgautofailover.start_maintenance(text,int) is 'set a node in maintenance state'; CREATE FUNCTION pgautofailover.stop_maintenance ( node_name text, node_port int default 5432 ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$stop_maintenance$$; comment on function pgautofailover.stop_maintenance(text,int) is 'set a node out of maintenance state'; CREATE FUNCTION pgautofailover.last_events ( count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodeport, reportedstate, goalstate, reportedrepstate, waldelta, description from pgautofailover.event order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; comment on function pgautofailover.last_events(int) is 'retrieve last COUNT events'; CREATE FUNCTION pgautofailover.last_events ( formation_id text default 'default', count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodeport, reportedstate, goalstate, reportedrepstate, waldelta, description from pgautofailover.event where formationid = formation_id order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; comment on function pgautofailover.last_events(text,int) is 'retrieve last COUNT events for given formation'; CREATE FUNCTION pgautofailover.last_events ( formation_id text, group_id int, count int default 10 ) RETURNS SETOF 
pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodeport, reportedstate, goalstate, reportedrepstate, waldelta, description from pgautofailover.event where formationid = formation_id and groupid = group_id order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; comment on function pgautofailover.last_events(text,int,int) is 'retrieve last COUNT events for given formation and group'; CREATE FUNCTION pgautofailover.current_state ( IN formation_id text default 'default', OUT nodename text, OUT nodeport int, OUT group_id int, OUT node_id bigint, OUT current_group_state pgautofailover.replication_state, OUT assigned_group_state pgautofailover.replication_state ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ select nodename, nodeport, groupid, nodeid, reportedstate, goalstate from pgautofailover.node where formationid = formation_id order by groupid, nodeid; $$; comment on function pgautofailover.current_state(text) is 'get the current state of both nodes of a formation'; CREATE FUNCTION pgautofailover.current_state ( IN formation_id text, IN group_id int, OUT nodename text, OUT nodeport int, OUT group_id int, OUT node_id bigint, OUT current_group_state pgautofailover.replication_state, OUT assigned_group_state pgautofailover.replication_state ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ select nodename, nodeport, groupid, nodeid, reportedstate, goalstate from pgautofailover.node where formationid = formation_id and groupid = group_id order by groupid, nodeid; $$; comment on function pgautofailover.current_state(text, int) is 'get the current state of both nodes of a group in a formation'; CREATE FUNCTION pgautofailover.formation_uri ( IN formation_id text DEFAULT 'default' ) RETURNS text LANGUAGE SQL STRICT AS $$ select case when string_agg(format('%s:%s', nodename, nodeport),',') is not null then 
format('postgres://%s/%s?target_session_attrs=read-write', string_agg(format('%s:%s', nodename, nodeport),','), -- as we join formation on node we get the same dbname for all -- entries, pick one. min(dbname) ) end as uri from pgautofailover.node as node join pgautofailover.formation using(formationid) where formationid = formation_id and groupid = 0; $$; CREATE FUNCTION pgautofailover.enable_secondary ( formation_id text ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$enable_secondary$$; comment on function pgautofailover.enable_secondary(text) is 'changes the state of a formation to assign secondaries for nodes when added'; CREATE FUNCTION pgautofailover.disable_secondary ( formation_id text ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$disable_secondary$$; comment on function pgautofailover.disable_secondary(text) is 'changes the state of a formation to disable the assignment of secondaries for nodes when added'; CREATE OR REPLACE FUNCTION pgautofailover.update_secondary_check() RETURNS trigger LANGUAGE 'plpgsql' AS $$ declare nodeid integer := null; reportedstate pgautofailover.replication_state := null; begin -- when secondary changes from true to false, check all nodes remaining are primary if new.opt_secondary is false and new.opt_secondary is distinct from old.opt_secondary then select node.nodeid, node.reportedstate into nodeid, reportedstate from pgautofailover.node where node.formationid = new.formationid and node.reportedstate <> 'single'; if nodeid is not null then raise exception object_not_in_prerequisite_state using message = 'formation has nodes that are not in SINGLE state', detail = 'nodeid ' || nodeid || ' is in state ' || reportedstate, hint = 'drop secondary nodes before disabling secondaries on formation'; end if; end if; return new; end $$; comment on function pgautofailover.update_secondary_check() is 'performs a check when changes to hassecondary on pgautofailover.formation are made, 
verifying cluster state allows the change'; CREATE TRIGGER disable_secondary_check BEFORE UPDATE ON pgautofailover.formation FOR EACH ROW EXECUTE PROCEDURE pgautofailover.update_secondary_check(); pg_auto_failover-1.6.3/src/monitor/pgautofailover--1.1--1.2.sql000066400000000000000000000021531414244367200240310ustar00rootroot00000000000000-- -- extension update file from 1.1 to 1.2 -- -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "ALTER EXTENSION pgautofailover UPDATE TO 1.2" to load this file. \quit DROP FUNCTION IF EXISTS pgautofailover.formation_uri(text); CREATE FUNCTION pgautofailover.formation_uri ( IN formation_id text DEFAULT 'default', IN sslmode text DEFAULT 'prefer' ) RETURNS text LANGUAGE SQL STRICT AS $$ select case when string_agg(format('%s:%s', nodename, nodeport),',') is not null then format('postgres://%s/%s?target_session_attrs=read-write&sslmode=%s', string_agg(format('%s:%s', nodename, nodeport),','), -- as we join formation on node we get the same dbname for all -- entries, pick one. min(dbname), min(sslmode) ) end as uri from pgautofailover.node as node join pgautofailover.formation using(formationid) where formationid = formation_id and groupid = 0; $$; pg_auto_failover-1.6.3/src/monitor/pgautofailover--1.2--1.3.sql000066400000000000000000000470261414244367200240430ustar00rootroot00000000000000-- -- extension update file from 1.2 to 1.3 -- -- complain if script is sourced in psql, rather than via CREATE EXTENSION -- \echo Use "ALTER EXTENSION pgautofailover UPDATE TO 1.3" to load this file. 
\quit --- The following only works in Postgres 12 onward -- ALTER TYPE pgautofailover.replication_state ADD VALUE 'join_primary'; -- ALTER TYPE pgautofailover.replication_state ADD VALUE 'apply_settings'; DROP FUNCTION IF EXISTS pgautofailover.register_node(text,text,integer,name,integer,pgautofailover.replication_state,text); DROP FUNCTION IF EXISTS pgautofailover.node_active(text,text,int,int,int, pgautofailover.replication_state,bool,pg_lsn,text); DROP FUNCTION IF EXISTS pgautofailover.current_state(text); DROP FUNCTION IF EXISTS pgautofailover.current_state(text, int); ALTER TYPE pgautofailover.replication_state RENAME TO old_replication_state; CREATE TYPE pgautofailover.replication_state AS ENUM ( 'unknown', 'init', 'single', 'wait_primary', 'primary', 'draining', 'demote_timeout', 'demoted', 'catchingup', 'secondary', 'prepare_promotion', 'stop_replication', 'wait_standby', 'maintenance', 'join_primary', 'apply_settings' ); -- Note the double cast here, first to text and only then to the new enums ALTER TABLE pgautofailover.node ALTER COLUMN goalstate DROP NOT NULL, ALTER COLUMN goalstate DROP DEFAULT, ALTER COLUMN goalstate TYPE pgautofailover.replication_state USING goalstate::text::pgautofailover.replication_state, ALTER COLUMN goalstate SET DEFAULT 'init', ALTER COLUMN goalstate SET NOT NULL, ALTER COLUMN reportedstate TYPE pgautofailover.replication_state USING reportedstate::text::pgautofailover.replication_state; ALTER TABLE pgautofailover.event ALTER COLUMN goalstate TYPE pgautofailover.replication_state USING goalstate::text::pgautofailover.replication_state, ALTER COLUMN reportedstate TYPE pgautofailover.replication_state USING reportedstate::text::pgautofailover.replication_state; DROP TYPE pgautofailover.old_replication_state; ALTER TABLE pgautofailover.formation ADD COLUMN number_sync_standbys int NOT NULL DEFAULT 1; DROP FUNCTION IF EXISTS pgautofailover.create_formation(text, text); DROP FUNCTION IF EXISTS 
pgautofailover.create_formation(text,text,name,boolean); DROP FUNCTION IF EXISTS pgautofailover.get_other_node(text,integer); CREATE FUNCTION pgautofailover.set_formation_number_sync_standbys ( IN formation_id text, IN number_sync_standbys int ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$set_formation_number_sync_standbys$$; grant execute on function pgautofailover.set_formation_number_sync_standbys(text, int) to autoctl_node; CREATE FUNCTION pgautofailover.create_formation ( IN formation_id text, IN kind text, IN dbname name, IN opt_secondary bool, IN number_sync_standbys int, OUT formation_id text, OUT kind text, OUT dbname name, OUT opt_secondary bool, OUT number_sync_standbys int ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$create_formation$$; grant execute on function pgautofailover.create_formation(text,text,name,bool,int) to autoctl_node; ALTER TABLE pgautofailover.node RENAME TO node_upgrade_old; CREATE TABLE pgautofailover.node ( formationid text not null default 'default', nodeid bigint not null DEFAULT nextval('pgautofailover.node_nodeid_seq'::regclass), groupid int not null, nodename text not null, nodeport int not null, goalstate pgautofailover.replication_state not null default 'init', reportedstate pgautofailover.replication_state not null, reportedpgisrunning bool default true, reportedrepstate text default 'async', reporttime timestamptz not null default now(), reportedlsn pg_lsn not null default '0/0', walreporttime timestamptz not null default now(), health integer not null default -1, healthchecktime timestamptz not null default now(), statechangetime timestamptz not null default now(), candidatepriority int not null default 100, replicationquorum bool not null default true, UNIQUE (nodename, nodeport), PRIMARY KEY (nodeid), FOREIGN KEY (formationid) REFERENCES pgautofailover.formation(formationid) ) -- we expect few rows and lots of UPDATE, let's benefit from HOT WITH (fillfactor = 
25); ALTER SEQUENCE pgautofailover.node_nodeid_seq OWNED BY pgautofailover.node.nodeid; INSERT INTO pgautofailover.node ( formationid, nodeid, groupid, nodename, nodeport, goalstate, reportedstate, reportedpgisrunning, reportedrepstate, reporttime, reportedlsn, walreporttime, health, healthchecktime, statechangetime ) SELECT formationid, nodeid, groupid, nodename, nodeport, goalstate, reportedstate, reportedpgisrunning, reportedrepstate, reporttime, reportedlsn, walreporttime, health, healthchecktime, statechangetime FROM pgautofailover.node_upgrade_old; ALTER TABLE pgautofailover.event RENAME TO event_upgrade_old; CREATE TABLE pgautofailover.event ( eventid bigint not null DEFAULT nextval('pgautofailover.event_eventid_seq'::regclass), eventtime timestamptz not null default now(), formationid text not null, nodeid bigint not null, groupid int not null, nodename text not null, nodeport integer not null, reportedstate pgautofailover.replication_state not null, goalstate pgautofailover.replication_state not null, reportedrepstate text, reportedlsn pg_lsn not null default '0/0', candidatepriority int, replicationquorum bool, description text, PRIMARY KEY (eventid) ); ALTER SEQUENCE pgautofailover.event_eventid_seq OWNED BY pgautofailover.event.eventid; INSERT INTO pgautofailover.event ( eventid, eventtime, formationid, nodeid, groupid, nodename, nodeport, reportedstate, goalstate, reportedrepstate, description ) SELECT eventid, eventtime, formationid, nodeid, groupid, nodename, nodeport, reportedstate, goalstate, reportedrepstate, description FROM pgautofailover.event_upgrade_old; GRANT SELECT ON ALL TABLES IN SCHEMA pgautofailover TO autoctl_node; CREATE FUNCTION pgautofailover.set_node_nodename ( IN node_id bigint, IN node_name text, OUT node_id bigint, OUT name text, OUT port int ) RETURNS record LANGUAGE SQL STRICT SECURITY DEFINER AS $$ update pgautofailover.node set nodename = node_name where nodeid = node_id returning nodeid, nodename, nodeport; $$; grant 
execute on function pgautofailover.set_node_nodename(bigint,text) to autoctl_node; DROP FUNCTION IF EXISTS pgautofailover.register_node(text, text); CREATE FUNCTION pgautofailover.register_node ( IN formation_id text, IN node_name text, IN node_port int, IN dbname name, IN desired_group_id int default -1, IN initial_group_role pgautofailover.replication_state default 'init', IN node_kind text default 'standalone', IN candidate_priority int default 100, IN replication_quorum bool default true, OUT assigned_node_id int, OUT assigned_group_id int, OUT assigned_group_state pgautofailover.replication_state, OUT assigned_candidate_priority int, OUT assigned_replication_quorum bool ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$register_node$$; grant execute on function pgautofailover.register_node(text,text,int,name,int,pgautofailover.replication_state,text, int, bool) to autoctl_node; CREATE FUNCTION pgautofailover.node_active ( In formation_id text, IN node_name text, IN node_port int, IN current_node_id int default -1, IN current_group_id int default -1, IN current_group_role pgautofailover.replication_state default 'init', IN current_pg_is_running bool default true, IN current_lsn pg_lsn default '0/0', IN current_rep_state text default '', OUT assigned_node_id int, OUT assigned_group_id int, OUT assigned_group_state pgautofailover.replication_state, OUT assigned_candidate_priority int, OUT assigned_replication_quorum bool ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$node_active$$; grant execute on function pgautofailover.node_active(text,text,int,int,int, pgautofailover.replication_state,bool,pg_lsn,text) to autoctl_node; DROP FUNCTION IF EXISTS pgautofailover.get_nodes(text, text); CREATE FUNCTION pgautofailover.get_nodes ( IN formation_id text default 'default', IN group_id int default NULL, OUT node_id int, OUT node_name text, OUT node_port int, OUT node_lsn pg_lsn, OUT node_is_primary bool ) RETURNS 
SETOF record LANGUAGE C AS 'MODULE_PATHNAME', $$get_nodes$$; comment on function pgautofailover.get_nodes(text,int) is 'get all the nodes in a group'; grant execute on function pgautofailover.get_nodes(text,int) to autoctl_node; DROP FUNCTION IF EXISTS pgautofailover.get_primary(text,int); CREATE FUNCTION pgautofailover.get_primary ( IN formation_id text default 'default', IN group_id int default 0, OUT primary_node_id int, OUT primary_name text, OUT primary_port int ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$get_primary$$; comment on function pgautofailover.get_primary(text,int) is 'get the writable node for a group'; grant execute on function pgautofailover.get_primary(text,int) to autoctl_node; DROP FUNCTION IF EXISTS pgautofailover.get_other_nodes(text. int); CREATE FUNCTION pgautofailover.get_other_nodes ( IN node_name text, IN node_port int, OUT node_id int, OUT node_name text, OUT node_port int, OUT node_lsn pg_lsn, OUT node_is_primary bool ) RETURNS SETOF record LANGUAGE C STRICT AS 'MODULE_PATHNAME', $$get_other_nodes$$; comment on function pgautofailover.get_other_nodes(text,int) is 'get the other nodes in a group'; grant execute on function pgautofailover.get_other_nodes(text,int) to autoctl_node; DROP FUNCTION IF EXISTS pgautofailover.get_other_nodes (text. 
int, pgautofailover.replication_state); CREATE FUNCTION pgautofailover.get_other_nodes ( IN node_name text, IN node_port int, IN current_state pgautofailover.replication_state, OUT node_id int, OUT node_name text, OUT node_port int, OUT node_lsn pg_lsn, OUT node_is_primary bool ) RETURNS SETOF record LANGUAGE C STRICT AS 'MODULE_PATHNAME', $$get_other_nodes$$; comment on function pgautofailover.get_other_nodes (text,int,pgautofailover.replication_state) is 'get the other nodes in a group, filtering on current_state'; grant execute on function pgautofailover.get_other_nodes (text,int,pgautofailover.replication_state) to autoctl_node; DROP FUNCTION IF EXISTS pgautofailover.last_events(int); CREATE FUNCTION pgautofailover.last_events ( count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodeport, reportedstate, goalstate, reportedrepstate, reportedlsn, candidatepriority, replicationquorum, description from pgautofailover.event order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; comment on function pgautofailover.last_events(int) is 'retrieve last COUNT events'; DROP FUNCTION IF EXISTS pgautofailover.last_events(text,int); CREATE FUNCTION pgautofailover.last_events ( formation_id text default 'default', count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodeport, reportedstate, goalstate, reportedrepstate, reportedlsn, candidatepriority, replicationquorum, description from pgautofailover.event where formationid = formation_id order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; comment on function pgautofailover.last_events(text,int) is 'retrieve last COUNT events for given formation'; DROP FUNCTION IF EXISTS 
pgautofailover.last_events(text,int,int); CREATE FUNCTION pgautofailover.last_events ( formation_id text, group_id int, count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodeport, reportedstate, goalstate, reportedrepstate, reportedlsn, candidatepriority, replicationquorum, description from pgautofailover.event where formationid = formation_id and groupid = group_id order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; comment on function pgautofailover.last_events(text,int,int) is 'retrieve last COUNT events for given formation and group'; CREATE FUNCTION pgautofailover.current_state ( IN formation_id text default 'default', OUT nodename text, OUT nodeport int, OUT group_id int, OUT node_id bigint, OUT current_group_state pgautofailover.replication_state, OUT assigned_group_state pgautofailover.replication_state, OUT candidate_priority int, OUT replication_quorum bool ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ select nodename, nodeport, groupid, nodeid, reportedstate, goalstate, candidatepriority, replicationquorum from pgautofailover.node where formationid = formation_id order by groupid, nodeid; $$; comment on function pgautofailover.current_state(text) is 'get the current state of both nodes of a formation'; CREATE FUNCTION pgautofailover.current_state ( IN formation_id text, IN group_id int, OUT nodename text, OUT nodeport int, OUT group_id int, OUT node_id bigint, OUT current_group_state pgautofailover.replication_state, OUT assigned_group_state pgautofailover.replication_state, OUT candidate_priority int, OUT replication_quorum bool ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ select nodename, nodeport, groupid, nodeid, reportedstate, goalstate, candidatepriority, replicationquorum from pgautofailover.node where formationid = formation_id and groupid = group_id order by groupid, nodeid; 
$$; comment on function pgautofailover.current_state(text, int) is 'get the current state of both nodes of a group in a formation'; DROP FUNCTION IF EXISTS pgautofailover.formation_uri(text, text); CREATE FUNCTION pgautofailover.formation_uri ( IN formation_id text DEFAULT 'default', IN sslmode text DEFAULT 'prefer', IN sslrootcert text DEFAULT '', IN sslcrl text DEFAULT '' ) RETURNS text LANGUAGE SQL STRICT AS $$ select case when string_agg(format('%s:%s', nodename, nodeport),',') is not null then format( 'postgres://%s/%s?target_session_attrs=read-write&sslmode=%s%s%s', string_agg(format('%s:%s', nodename, nodeport),','), -- as we join formation on node we get the same dbname for all -- entries, pick one. min(dbname), min(sslmode), CASE WHEN min(sslrootcert) = '' THEN '' ELSE '&sslrootcert=' || sslrootcert END, CASE WHEN min(sslcrl) = '' THEN '' ELSE '&sslcrl=' || sslcrl END ) end as uri from pgautofailover.node as node join pgautofailover.formation using(formationid) where formationid = formation_id and groupid = 0; $$; CREATE FUNCTION pgautofailover.set_node_candidate_priority ( IN nodeid int, IN nodename text, IN nodeport int, IN candidate_priority int ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$set_node_candidate_priority$$; comment on function pgautofailover.set_node_candidate_priority(int, text, int, int) is 'sets the candidate priority value for a node. Expects a priority value between 0 and 100. 
0 if the node is not a candidate to be promoted to be primary.'; grant execute on function pgautofailover.set_node_candidate_priority(int, text, int, int) to autoctl_node; CREATE FUNCTION pgautofailover.set_node_replication_quorum ( IN nodeid int, IN nodename text, IN nodeport int, IN replication_quorum bool ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$set_node_replication_quorum$$; comment on function pgautofailover.set_node_replication_quorum(int, text, int, bool) is 'sets the replication quorum value for a node. true if the node participates in write quorum'; grant execute on function pgautofailover.set_node_replication_quorum(int, text, int, bool) to autoctl_node; create function pgautofailover.synchronous_standby_names ( IN formation_id text default 'default', IN group_id int default 0 ) returns text language C strict AS 'MODULE_PATHNAME', $$synchronous_standby_names$$; comment on function pgautofailover.synchronous_standby_names(text, int) is 'get the synchronous_standby_names setting for a given group'; grant execute on function pgautofailover.synchronous_standby_names(text, int) to autoctl_node; CREATE OR REPLACE FUNCTION pgautofailover.adjust_number_sync_standbys() RETURNS trigger LANGUAGE 'plpgsql' AS $$ declare standby_count integer := null; number_sync_standbys integer := null; begin select count(*) - 1 into standby_count from pgautofailover.node where formationid = old.formationid; select formation.number_sync_standbys into number_sync_standbys from pgautofailover.formation where formation.formationid = old.formationid; if number_sync_standbys > 1 then -- we must have number_sync_standbys + 1 <= standby_count if (number_sync_standbys + 1) > standby_count then update pgautofailover.formation set number_sync_standbys = greatest(standby_count - 1, 1) where formation.formationid = old.formationid; end if; end if; return old; end $$; comment on function pgautofailover.adjust_number_sync_standbys() is 'adjust formation 
number_sync_standbys when removing a node, if needed'; CREATE TRIGGER adjust_number_sync_standbys AFTER DELETE ON pgautofailover.node FOR EACH ROW EXECUTE PROCEDURE pgautofailover.adjust_number_sync_standbys(); DROP TABLE pgautofailover.node_upgrade_old; DROP TABLE pgautofailover.event_upgrade_old; pg_auto_failover-1.6.3/src/monitor/pgautofailover--1.3--1.4.sql000066400000000000000000000627631414244367200240520ustar00rootroot00000000000000-- -- extension update file from 1.3 to 1.4 -- -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pgautofailover" to load this file. \quit DROP FUNCTION IF EXISTS pgautofailover.register_node(text,text,int,name,int, pgautofailover.replication_state,text, int, bool); DROP FUNCTION IF EXISTS pgautofailover.node_active(text,text,int,int,int, pgautofailover.replication_state,bool,pg_lsn,text); DROP FUNCTION IF EXISTS pgautofailover.get_other_nodes (text,integer,pgautofailover.replication_state); DROP FUNCTION IF EXISTS pgautofailover.current_state(text); DROP FUNCTION IF EXISTS pgautofailover.current_state(text, int); ALTER TYPE pgautofailover.replication_state RENAME TO old_replication_state; CREATE TYPE pgautofailover.replication_state AS ENUM ( 'unknown', 'init', 'single', 'wait_primary', 'primary', 'draining', 'demote_timeout', 'demoted', 'catchingup', 'secondary', 'prepare_promotion', 'stop_replication', 'wait_standby', 'maintenance', 'join_primary', 'apply_settings', 'prepare_maintenance', 'wait_maintenance', 'report_lsn', 'fast_forward', 'join_secondary' ); -- Note the double cast here, first to text and only then to the new enums ALTER TABLE pgautofailover.node ALTER COLUMN goalstate DROP NOT NULL, ALTER COLUMN goalstate DROP DEFAULT, ALTER COLUMN goalstate TYPE pgautofailover.replication_state USING goalstate::text::pgautofailover.replication_state, ALTER COLUMN goalstate SET DEFAULT 'init', ALTER COLUMN goalstate SET NOT NULL, ALTER COLUMN reportedstate TYPE 
pgautofailover.replication_state USING reportedstate::text::pgautofailover.replication_state; ALTER TABLE pgautofailover.event ALTER COLUMN goalstate TYPE pgautofailover.replication_state USING goalstate::text::pgautofailover.replication_state, ALTER COLUMN reportedstate TYPE pgautofailover.replication_state USING reportedstate::text::pgautofailover.replication_state; DROP TYPE pgautofailover.old_replication_state; ALTER TABLE pgautofailover.formation ALTER COLUMN number_sync_standbys SET DEFAULT 0; -- -- The default used to be 1, now it's zero. Change it for people who left -- the default (everybody, most certainly, because this used to have no -- impact). -- UPDATE pgautofailover.formation SET number_sync_standbys = 0 WHERE number_sync_standbys = 1; ALTER TABLE pgautofailover.formation ADD CHECK (kind IN ('pgsql', 'citus')); ALTER TABLE pgautofailover.node RENAME TO node_upgrade_old; CREATE TABLE pgautofailover.node ( formationid text not null default 'default', nodeid bigint not null DEFAULT nextval('pgautofailover.node_nodeid_seq'::regclass), groupid int not null, nodename text not null, nodehost text not null, nodeport int not null, sysidentifier bigint, goalstate pgautofailover.replication_state not null default 'init', reportedstate pgautofailover.replication_state not null, reportedpgisrunning bool default true, reportedrepstate text default 'async', reporttime timestamptz not null default now(), reportedlsn pg_lsn not null default '0/0', walreporttime timestamptz not null default now(), health integer not null default -1, healthchecktime timestamptz not null default now(), statechangetime timestamptz not null default now(), candidatepriority int not null default 100, replicationquorum bool not null default true, -- node names must be unique in a given formation UNIQUE (formationid, nodename), -- any nodehost:port can only be a unique node in the system UNIQUE (nodehost, nodeport), -- -- The EXCLUDE constraint only allows the same sysidentifier for all the 
-- nodes in the same group. The system_identifier is a property that is -- kept when implementing streaming replication and should be unique per -- Postgres instance in all other cases. -- -- We allow the sysidentifier column to be NULL when registering a new -- primary server from scratch, because we have not done pg_ctl initdb -- at the time we call the register_node() function. -- CONSTRAINT system_identifier_is_null_at_init_only CHECK ( ( sysidentifier IS NULL AND reportedstate in ('init', 'wait_standby', 'catchingup') ) OR sysidentifier IS NOT NULL), CONSTRAINT same_system_identifier_within_group EXCLUDE USING gist(formationid with =, groupid with =, sysidentifier with <>) DEFERRABLE INITIALLY DEFERRED, PRIMARY KEY (nodeid), FOREIGN KEY (formationid) REFERENCES pgautofailover.formation(formationid) ) -- we expect few rows and lots of UPDATE, let's benefit from HOT WITH (fillfactor = 25); ALTER SEQUENCE pgautofailover.node_nodeid_seq OWNED BY pgautofailover.node.nodeid; INSERT INTO pgautofailover.node ( formationid, nodeid, groupid, nodename, nodehost, nodeport, sysidentifier, goalstate, reportedstate, reportedpgisrunning, reportedrepstate, reporttime, reportedlsn, walreporttime, health, healthchecktime, statechangetime, candidatepriority, replicationquorum ) SELECT formationid, nodeid, groupid, format('node_%s', nodeid) as nodename, nodename as nodehost, nodeport, 0 as sysidentifier, goalstate, reportedstate, reportedpgisrunning, reportedrepstate, reporttime, reportedlsn, walreporttime, health, healthchecktime, statechangetime, candidatepriority, replicationquorum FROM pgautofailover.node_upgrade_old; ALTER TABLE pgautofailover.event RENAME TO event_upgrade_old; CREATE TABLE pgautofailover.event ( eventid bigint not null DEFAULT nextval('pgautofailover.event_eventid_seq'::regclass), eventtime timestamptz not null default now(), formationid text not null, nodeid bigint not null, groupid int not null, nodename text not null, nodehost text not null, nodeport 
integer not null, reportedstate pgautofailover.replication_state not null, goalstate pgautofailover.replication_state not null, reportedrepstate text, reportedlsn pg_lsn not null default '0/0', candidatepriority int, replicationquorum bool, description text, PRIMARY KEY (eventid) ); ALTER SEQUENCE pgautofailover.event_eventid_seq OWNED BY pgautofailover.event.eventid; INSERT INTO pgautofailover.event ( eventid, eventtime, formationid, nodeid, groupid, nodename, nodehost, nodeport, reportedstate, goalstate, reportedrepstate, description ) SELECT eventid, eventtime, event.formationid, event.nodeid, event.groupid, node.nodename, node.nodehost, event.nodeport, event.reportedstate, event.goalstate, event.reportedrepstate, event.description FROM pgautofailover.event_upgrade_old as event JOIN pgautofailover.node USING(nodeid); GRANT SELECT ON ALL TABLES IN SCHEMA pgautofailover TO autoctl_node; CREATE FUNCTION pgautofailover.set_node_system_identifier ( IN node_id bigint, IN node_sysidentifier bigint, OUT node_id bigint, OUT node_name text, OUT node_host text, OUT node_port int ) RETURNS record LANGUAGE SQL STRICT SECURITY DEFINER AS $$ update pgautofailover.node set sysidentifier = node_sysidentifier where nodeid = set_node_system_identifier.node_id returning nodeid, nodename, nodehost, nodeport; $$; grant execute on function pgautofailover.set_node_system_identifier(bigint,bigint) to autoctl_node; CREATE FUNCTION pgautofailover.set_group_system_identifier ( IN group_id bigint, IN node_sysidentifier bigint, OUT node_id bigint, OUT node_name text, OUT node_host text, OUT node_port int ) RETURNS setof record LANGUAGE SQL STRICT SECURITY DEFINER AS $$ update pgautofailover.node set sysidentifier = node_sysidentifier where groupid = set_group_system_identifier.group_id and sysidentifier = 0 returning nodeid, nodename, nodehost, nodeport; $$; grant execute on function pgautofailover.set_group_system_identifier(bigint,bigint) to autoctl_node; DROP FUNCTION 
pgautofailover.set_node_nodename(bigint,text); CREATE FUNCTION pgautofailover.update_node_metadata ( IN node_id bigint, IN node_name text, IN node_host text, IN node_port int ) RETURNS boolean LANGUAGE C SECURITY DEFINER AS 'MODULE_PATHNAME', $$update_node_metadata$$; grant execute on function pgautofailover.update_node_metadata(bigint,text,text,int) to autoctl_node; CREATE FUNCTION pgautofailover.register_node ( IN formation_id text, IN node_host text, IN node_port int, IN dbname name, IN node_name text default '', IN sysidentifier bigint default 0, IN desired_group_id int default -1, IN initial_group_role pgautofailover.replication_state default 'init', IN node_kind text default 'standalone', IN candidate_priority int default 100, IN replication_quorum bool default true, OUT assigned_node_id int, OUT assigned_group_id int, OUT assigned_group_state pgautofailover.replication_state, OUT assigned_candidate_priority int, OUT assigned_replication_quorum bool, OUT assigned_node_name text ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$register_node$$; grant execute on function pgautofailover.register_node(text,text,int,name,text,bigint,int,pgautofailover.replication_state,text, int, bool) to autoctl_node; CREATE FUNCTION pgautofailover.node_active ( IN formation_id text, IN node_id int, IN group_id int, IN current_group_role pgautofailover.replication_state default 'init', IN current_pg_is_running bool default true, IN current_lsn pg_lsn default '0/0', IN current_rep_state text default '', OUT assigned_node_id int, OUT assigned_group_id int, OUT assigned_group_state pgautofailover.replication_state, OUT assigned_candidate_priority int, OUT assigned_replication_quorum bool ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$node_active$$; grant execute on function pgautofailover.node_active(text,int,int, pgautofailover.replication_state,bool,pg_lsn,text) to autoctl_node; DROP FUNCTION pgautofailover.get_nodes(text, 
int); CREATE FUNCTION pgautofailover.get_nodes ( IN formation_id text default 'default', IN group_id int default NULL, OUT node_id int, OUT node_name text, OUT node_host text, OUT node_port int, OUT node_lsn pg_lsn, OUT node_is_primary bool ) RETURNS SETOF record LANGUAGE C AS 'MODULE_PATHNAME', $$get_nodes$$; comment on function pgautofailover.get_nodes(text,int) is 'get all the nodes in a group'; grant execute on function pgautofailover.get_nodes(text,int) to autoctl_node; DROP FUNCTION pgautofailover.get_primary(text, int); CREATE FUNCTION pgautofailover.get_primary ( IN formation_id text default 'default', IN group_id int default 0, OUT primary_node_id int, OUT primary_name text, OUT primary_host text, OUT primary_port int ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$get_primary$$; comment on function pgautofailover.get_primary(text,int) is 'get the writable node for a group'; grant execute on function pgautofailover.get_primary(text,int) to autoctl_node; DROP FUNCTION IF EXISTS pgautofailover.get_other_nodes (text,integer); CREATE FUNCTION pgautofailover.get_other_nodes ( IN nodeid int, OUT node_id int, OUT node_name text, OUT node_host text, OUT node_port int, OUT node_lsn pg_lsn, OUT node_is_primary bool ) RETURNS SETOF record LANGUAGE C STRICT AS 'MODULE_PATHNAME', $$get_other_nodes$$; comment on function pgautofailover.get_other_nodes(int) is 'get the other nodes in a group'; grant execute on function pgautofailover.get_other_nodes(int) to autoctl_node; CREATE FUNCTION pgautofailover.get_other_nodes ( IN nodeid int, IN current_state pgautofailover.replication_state, OUT node_id int, OUT node_name text, OUT node_host text, OUT node_port int, OUT node_lsn pg_lsn, OUT node_is_primary bool ) RETURNS SETOF record LANGUAGE C STRICT AS 'MODULE_PATHNAME', $$get_other_nodes$$; comment on function pgautofailover.get_other_nodes (int,pgautofailover.replication_state) is 'get the other nodes in a group, filtering on current_state'; grant 
execute on function pgautofailover.get_other_nodes (int,pgautofailover.replication_state) to autoctl_node; DROP FUNCTION pgautofailover.get_coordinator(text); CREATE FUNCTION pgautofailover.get_coordinator ( IN formation_id text default 'default', OUT node_host text, OUT node_port int ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ select nodehost, nodeport from pgautofailover.node join pgautofailover.formation using(formationid) where formationid = formation_id and groupid = 0 and goalstate in ('single', 'wait_primary', 'primary') and reportedstate in ('single', 'wait_primary', 'primary'); $$; grant execute on function pgautofailover.get_coordinator(text) to autoctl_node; CREATE FUNCTION pgautofailover.get_most_advanced_standby ( IN formationid text default 'default', IN groupid int default 0, OUT node_id bigint, OUT node_name text, OUT node_host text, OUT node_port int, OUT node_lsn pg_lsn, OUT node_is_primary bool ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ select nodeid, nodename, nodehost, nodeport, reportedlsn, false from pgautofailover.node where formationid = $1 and groupid = $2 and reportedstate = 'report_lsn' order by reportedlsn desc, health desc limit 1; $$; grant execute on function pgautofailover.get_most_advanced_standby(text,int) to autoctl_node; DROP FUNCTION IF EXISTS pgautofailover.remove_node(text, int); CREATE FUNCTION pgautofailover.remove_node ( node_id int ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$remove_node_by_nodeid$$; comment on function pgautofailover.remove_node(int) is 'remove a node from the monitor'; grant execute on function pgautofailover.remove_node(int) to autoctl_node; CREATE FUNCTION pgautofailover.remove_node ( node_host text, node_port int default 5432 ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$remove_node_by_host$$; comment on function pgautofailover.remove_node(text,int) is 'remove a node from the monitor'; grant execute on function 
pgautofailover.remove_node(text,int) to autoctl_node; CREATE FUNCTION pgautofailover.perform_promotion ( formation_id text, node_name text ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$perform_promotion$$; comment on function pgautofailover.perform_promotion(text,text) is 'manually failover from the primary to the given node'; grant execute on function pgautofailover.perform_promotion(text,text) to autoctl_node; DROP FUNCTION pgautofailover.start_maintenance(text, int); DROP FUNCTION pgautofailover.stop_maintenance(text, int); CREATE FUNCTION pgautofailover.start_maintenance(node_id int) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$start_maintenance$$; comment on function pgautofailover.start_maintenance(int) is 'set a node in maintenance state'; grant execute on function pgautofailover.start_maintenance(int) to autoctl_node; CREATE FUNCTION pgautofailover.stop_maintenance(node_id int) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$stop_maintenance$$; comment on function pgautofailover.stop_maintenance(int) is 'set a node out of maintenance state'; grant execute on function pgautofailover.stop_maintenance(int) to autoctl_node; DROP FUNCTION pgautofailover.last_events(int); DROP FUNCTION pgautofailover.last_events(text,int); DROP FUNCTION pgautofailover.last_events(text,int,int); CREATE FUNCTION pgautofailover.last_events ( count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodehost, nodeport, reportedstate, goalstate, reportedrepstate, reportedlsn, candidatepriority, replicationquorum, description from pgautofailover.event order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; comment on function pgautofailover.last_events(int) is 'retrieve last COUNT events'; CREATE FUNCTION pgautofailover.last_events ( formation_id text 
default 'default', count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodehost, nodeport, reportedstate, goalstate, reportedrepstate, reportedlsn, candidatepriority, replicationquorum, description from pgautofailover.event where formationid = formation_id order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; comment on function pgautofailover.last_events(text,int) is 'retrieve last COUNT events for given formation'; CREATE FUNCTION pgautofailover.last_events ( formation_id text, group_id int, count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodehost, nodeport, reportedstate, goalstate, reportedrepstate, reportedlsn, candidatepriority, replicationquorum, description from pgautofailover.event where formationid = formation_id and groupid = group_id order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; comment on function pgautofailover.last_events(text,int,int) is 'retrieve last COUNT events for given formation and group'; CREATE FUNCTION pgautofailover.current_state ( IN formation_id text default 'default', OUT formation_kind text, OUT nodename text, OUT nodehost text, OUT nodeport int, OUT group_id int, OUT node_id bigint, OUT current_group_state pgautofailover.replication_state, OUT assigned_group_state pgautofailover.replication_state, OUT candidate_priority int, OUT replication_quorum bool, OUT reported_lsn pg_lsn, OUT health integer ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ select kind, nodename, nodehost, nodeport, groupid, nodeid, reportedstate, goalstate, candidatepriority, replicationquorum, reportedlsn, health from pgautofailover.node join pgautofailover.formation using(formationid) where formationid = formation_id order by 
groupid, nodeid; $$; comment on function pgautofailover.current_state(text) is 'get the current state of both nodes of a formation'; CREATE FUNCTION pgautofailover.current_state ( IN formation_id text, IN group_id int, OUT formation_kind text, OUT nodename text, OUT nodehost text, OUT nodeport int, OUT group_id int, OUT node_id bigint, OUT current_group_state pgautofailover.replication_state, OUT assigned_group_state pgautofailover.replication_state, OUT candidate_priority int, OUT replication_quorum bool, OUT reported_lsn pg_lsn, OUT health integer ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ select kind, nodename, nodehost, nodeport, groupid, nodeid, reportedstate, goalstate, candidatepriority, replicationquorum, reportedlsn, health from pgautofailover.node join pgautofailover.formation using(formationid) where formationid = formation_id and groupid = group_id order by groupid, nodeid; $$; comment on function pgautofailover.current_state(text, int) is 'get the current state of both nodes of a group in a formation'; CREATE OR REPLACE FUNCTION pgautofailover.formation_uri ( IN formation_id text DEFAULT 'default', IN sslmode text DEFAULT 'prefer', IN sslrootcert text DEFAULT '', IN sslcrl text DEFAULT '' ) RETURNS text LANGUAGE SQL STRICT AS $$ select case when string_agg(format('%s:%s', nodehost, nodeport),',') is not null then format( 'postgres://%s/%s?target_session_attrs=read-write&sslmode=%s%s%s', string_agg(format('%s:%s', nodehost, nodeport),','), -- as we join formation on node we get the same dbname for all -- entries, pick one. 
min(dbname), min(sslmode), CASE WHEN min(sslrootcert) = '' THEN '' ELSE '&sslrootcert=' || sslrootcert END, CASE WHEN min(sslcrl) = '' THEN '' ELSE '&sslcrl=' || sslcrl END ) end as uri from pgautofailover.node as node join pgautofailover.formation using(formationid) where formationid = formation_id and groupid = 0; $$; DROP FUNCTION pgautofailover.set_node_candidate_priority(int,text,int,int); CREATE FUNCTION pgautofailover.set_node_candidate_priority ( IN formation_id text, IN node_name text, IN candidate_priority int ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$set_node_candidate_priority$$; comment on function pgautofailover.set_node_candidate_priority(text, text, int) is 'sets the candidate priority value for a node. Expects a priority value between 0 and 100. 0 if the node is not a candidate to be promoted to be primary.'; grant execute on function pgautofailover.set_node_candidate_priority(text, text, int) to autoctl_node; DROP FUNCTION pgautofailover.set_node_replication_quorum(int,text,int,bool); CREATE FUNCTION pgautofailover.set_node_replication_quorum ( IN formation_id text, IN node_name text, IN replication_quorum bool ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$set_node_replication_quorum$$; comment on function pgautofailover.set_node_replication_quorum(text, text, bool) is 'sets the replication quorum value for a node. 
true if the node participates in write quorum'; grant execute on function pgautofailover.set_node_replication_quorum(text, text, bool) to autoctl_node; CREATE FUNCTION pgautofailover.formation_settings ( IN formation_id text default 'default', OUT context text, OUT group_id int, OUT node_id bigint, OUT nodename text, OUT setting text, OUT value text ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ with groups(formationid, groupid) as ( select formationid, groupid from pgautofailover.node where formationid = formation_id group by formationid, groupid ) -- context: formation, number_sync_standbys select 'formation' as context, NULL as group_id, NULL as node_id, formationid as nodename, 'number_sync_standbys' as setting, cast(number_sync_standbys as text) as value from pgautofailover.formation where formationid = formation_id union all -- context: primary, one entry per group in the formation select 'primary', groups.groupid, nodes.node_id, nodes.node_name, 'synchronous_standby_names', format('''%s''', pgautofailover.synchronous_standby_names(formationid, groupid)) from groups, pgautofailover.get_nodes(formationid, groupid) as nodes where node_is_primary union all ( -- context: node, one entry per node in the formation select 'node', node.groupid, node.nodeid, node.nodename, 'replication quorum', cast(node.replicationquorum as text) from pgautofailover.node as node where node.formationid = formation_id order by nodeid ) union all ( select 'node', node.groupid, node.nodeid, node.nodename, 'candidate priority', cast(node.candidatepriority as text) from pgautofailover.node as node where node.formationid = formation_id order by nodeid ) $$; comment on function pgautofailover.formation_settings(text) is 'get the current replication settings a formation'; drop function pgautofailover.adjust_number_sync_standbys() cascade; DROP TABLE pgautofailover.node_upgrade_old; DROP TABLE pgautofailover.event_upgrade_old; 
pg_auto_failover-1.6.3/src/monitor/pgautofailover--1.4--1.5.sql000066400000000000000000000062341414244367200240430ustar00rootroot00000000000000-- -- extension update file from 1.4.2 to 1.5.1 -- -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pgautofailover" to load this file. \quit ALTER TABLE pgautofailover.node ADD COLUMN nodecluster text not null default 'default'; DROP FUNCTION IF EXISTS pgautofailover.formation_uri(text, text, text, text); CREATE FUNCTION pgautofailover.formation_uri ( IN formation_id text DEFAULT 'default', IN cluster_name text DEFAULT 'default', IN sslmode text DEFAULT 'prefer', IN sslrootcert text DEFAULT '', IN sslcrl text DEFAULT '' ) RETURNS text LANGUAGE SQL STRICT AS $$ select case when string_agg(format('%s:%s', nodehost, nodeport),',') is not null then format( 'postgres://%s/%s?%ssslmode=%s%s%s', string_agg(format('%s:%s', nodehost, nodeport),','), -- as we join formation on node we get the same dbname for all -- entries, pick one. 
min(dbname), case when cluster_name = 'default' then 'target_session_attrs=read-write&' else '' end, min(sslmode), CASE WHEN min(sslrootcert) = '' THEN '' ELSE '&sslrootcert=' || sslrootcert END, CASE WHEN min(sslcrl) = '' THEN '' ELSE '&sslcrl=' || sslcrl END ) end as uri from pgautofailover.node as node join pgautofailover.formation using(formationid) where formationid = formation_id and groupid = 0 and nodecluster = cluster_name; $$; DROP FUNCTION IF EXISTS pgautofailover.register_node(text,text,int,name,text,bigint,int, pgautofailover.replication_state,text, int,bool,text); CREATE FUNCTION pgautofailover.register_node ( IN formation_id text, IN node_host text, IN node_port int, IN dbname name, IN node_name text default '', IN sysidentifier bigint default 0, IN desired_node_id int default -1, IN desired_group_id int default -1, IN initial_group_role pgautofailover.replication_state default 'init', IN node_kind text default 'standalone', IN candidate_priority int default 100, IN replication_quorum bool default true, IN node_cluster text default 'default', OUT assigned_node_id int, OUT assigned_group_id int, OUT assigned_group_state pgautofailover.replication_state, OUT assigned_candidate_priority int, OUT assigned_replication_quorum bool, OUT assigned_node_name text ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$register_node$$; grant execute on function pgautofailover.register_node(text,text,int,name,text,bigint,int,int, pgautofailover.replication_state,text, int,bool,text) to autoctl_node; pg_auto_failover-1.6.3/src/monitor/pgautofailover--1.5--1.6.sql000066400000000000000000000507501414244367200240470ustar00rootroot00000000000000-- -- extension update file from 1.5 to 1.6 -- -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pgautofailover" to load this file. 
\quit -- remove a possible leftover from pg_auto_failover 1.4 that was not correctly -- removed in a migration to 1.5 DROP FUNCTION IF EXISTS pgautofailover.register_node(text,text,int,name,text,bigint,int, pgautofailover.replication_state,text, int,bool); DROP FUNCTION pgautofailover.register_node(text,text,int,name,text,bigint,int,int, pgautofailover.replication_state,text, int,bool,text); DROP FUNCTION pgautofailover.node_active(text,int,int, pgautofailover.replication_state,bool,pg_lsn,text); DROP FUNCTION pgautofailover.get_other_nodes(int); DROP FUNCTION pgautofailover.get_other_nodes (integer,pgautofailover.replication_state); DROP FUNCTION pgautofailover.last_events(int); DROP FUNCTION pgautofailover.last_events(text,int); DROP FUNCTION pgautofailover.last_events(text,int,int); DROP FUNCTION pgautofailover.current_state(text); DROP FUNCTION pgautofailover.current_state(text,int); DROP TRIGGER disable_secondary_check ON pgautofailover.formation; DROP FUNCTION pgautofailover.update_secondary_check(); ALTER TYPE pgautofailover.replication_state RENAME TO old_replication_state; CREATE TYPE pgautofailover.replication_state AS ENUM ( 'unknown', 'init', 'single', 'wait_primary', 'primary', 'draining', 'demote_timeout', 'demoted', 'catchingup', 'secondary', 'prepare_promotion', 'stop_replication', 'wait_standby', 'maintenance', 'join_primary', 'apply_settings', 'prepare_maintenance', 'wait_maintenance', 'report_lsn', 'fast_forward', 'join_secondary', 'dropped' ); -- Note the double cast here, first to text and only then to the new enums ALTER TABLE pgautofailover.event ALTER COLUMN goalstate TYPE pgautofailover.replication_state USING goalstate::text::pgautofailover.replication_state, ALTER COLUMN reportedstate TYPE pgautofailover.replication_state USING reportedstate::text::pgautofailover.replication_state; ALTER TABLE pgautofailover.node RENAME TO node_upgrade_old; ALTER TABLE pgautofailover.node_upgrade_old RENAME CONSTRAINT 
system_identifier_is_null_at_init_only TO system_identifier_is_null_at_init_only_old; ALTER TABLE pgautofailover.node_upgrade_old RENAME CONSTRAINT same_system_identifier_within_group TO same_system_identifier_within_group_old; CREATE TABLE pgautofailover.node ( formationid text not null default 'default', nodeid bigint not null DEFAULT nextval('pgautofailover.node_nodeid_seq'::regclass), groupid int not null, nodename text not null, nodehost text not null, nodeport int not null, sysidentifier bigint, goalstate pgautofailover.replication_state not null default 'init', reportedstate pgautofailover.replication_state not null, reportedpgisrunning bool default true, reportedrepstate text default 'async', reporttime timestamptz not null default now(), reportedtli int not null default 1 check (reportedtli > 0), reportedlsn pg_lsn not null default '0/0', walreporttime timestamptz not null default now(), health integer not null default -1, healthchecktime timestamptz not null default now(), statechangetime timestamptz not null default now(), candidatepriority int not null default 100, replicationquorum bool not null default true, nodecluster text not null default 'default', -- node names must be unique in a given formation UNIQUE (formationid, nodename), -- any nodehost:port can only be a unique node in the system UNIQUE (nodehost, nodeport), -- -- The EXCLUDE constraint only allows the same sysidentifier for all the -- nodes in the same group. The system_identifier is a property that is -- kept when implementing streaming replication and should be unique per -- Postgres instance in all other cases. -- -- We allow the sysidentifier column to be NULL when registering a new -- primary server from scratch, because we have not done pg_ctl initdb -- at the time we call the register_node() function. 
-- CONSTRAINT system_identifier_is_null_at_init_only CHECK ( ( sysidentifier IS NULL AND reportedstate IN ( 'init', 'wait_standby', 'catchingup', 'dropped' ) ) OR sysidentifier IS NOT NULL ), CONSTRAINT same_system_identifier_within_group EXCLUDE USING gist(formationid with =, groupid with =, sysidentifier with <>) DEFERRABLE INITIALLY DEFERRED, PRIMARY KEY (nodeid), FOREIGN KEY (formationid) REFERENCES pgautofailover.formation(formationid) ) -- we expect few rows and lots of UPDATE, let's benefit from HOT WITH (fillfactor = 25); ALTER SEQUENCE pgautofailover.node_nodeid_seq OWNED BY pgautofailover.node.nodeid; INSERT INTO pgautofailover.node ( formationid, nodeid, groupid, nodename, nodehost, nodeport, sysidentifier, goalstate, reportedstate, reportedpgisrunning, reportedrepstate, reporttime, reportedtli, reportedlsn, walreporttime, health, healthchecktime, statechangetime, candidatepriority, replicationquorum, nodecluster ) SELECT formationid, nodeid, groupid, nodename, nodehost, nodeport, sysidentifier, goalstate::text::pgautofailover.replication_state, reportedstate::text::pgautofailover.replication_state, reportedpgisrunning, reportedrepstate, reporttime, 1 as reportedtli, reportedlsn, walreporttime, health, healthchecktime, statechangetime, candidatepriority, replicationquorum, nodecluster FROM pgautofailover.node_upgrade_old; ALTER TABLE pgautofailover.event RENAME TO event_upgrade_old; CREATE TABLE pgautofailover.event ( eventid bigint not null DEFAULT nextval('pgautofailover.event_eventid_seq'::regclass), eventtime timestamptz not null default now(), formationid text not null, nodeid bigint not null, groupid int not null, nodename text not null, nodehost text not null, nodeport integer not null, reportedstate pgautofailover.replication_state not null, goalstate pgautofailover.replication_state not null, reportedrepstate text, reportedtli int not null default 1 check (reportedtli > 0), reportedlsn pg_lsn not null default '0/0', candidatepriority int, 
replicationquorum bool, description text, PRIMARY KEY (eventid) ); ALTER SEQUENCE pgautofailover.event_eventid_seq OWNED BY pgautofailover.event.eventid; INSERT INTO pgautofailover.event ( eventid, eventtime, formationid, nodeid, groupid, nodename, nodehost, nodeport, reportedstate, goalstate, reportedrepstate, reportedtli, reportedlsn, candidatepriority, replicationquorum, description ) SELECT eventid, eventtime, formationid, nodeid, groupid, nodename, nodehost, nodeport, reportedstate, goalstate, reportedrepstate, 1 as reportedtli, reportedlsn, candidatepriority, replicationquorum, description FROM pgautofailover.event_upgrade_old as event; DROP TABLE pgautofailover.event_upgrade_old; DROP TABLE pgautofailover.node_upgrade_old; DROP TYPE pgautofailover.old_replication_state; GRANT SELECT ON ALL TABLES IN SCHEMA pgautofailover TO autoctl_node; CREATE FUNCTION pgautofailover.register_node ( IN formation_id text, IN node_host text, IN node_port int, IN dbname name, IN node_name text default '', IN sysidentifier bigint default 0, IN desired_node_id bigint default -1, IN desired_group_id int default -1, IN initial_group_role pgautofailover.replication_state default 'init', IN node_kind text default 'standalone', IN candidate_priority int default 100, IN replication_quorum bool default true, IN node_cluster text default 'default', OUT assigned_node_id bigint, OUT assigned_group_id int, OUT assigned_group_state pgautofailover.replication_state, OUT assigned_candidate_priority int, OUT assigned_replication_quorum bool, OUT assigned_node_name text ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$register_node$$; grant execute on function pgautofailover.register_node(text,text,int,name,text,bigint,bigint,int, pgautofailover.replication_state,text, int,bool,text) to autoctl_node; CREATE FUNCTION pgautofailover.node_active ( IN formation_id text, IN node_id bigint, IN group_id int, IN current_group_role pgautofailover.replication_state default 
'init', IN current_pg_is_running bool default true, IN current_tli integer default 1, IN current_lsn pg_lsn default '0/0', IN current_rep_state text default '', OUT assigned_node_id bigint, OUT assigned_group_id int, OUT assigned_group_state pgautofailover.replication_state, OUT assigned_candidate_priority int, OUT assigned_replication_quorum bool ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$node_active$$; grant execute on function pgautofailover.node_active(text,bigint,int, pgautofailover.replication_state,bool,int,pg_lsn,text) to autoctl_node; DROP FUNCTION pgautofailover.get_nodes(text,int); CREATE FUNCTION pgautofailover.get_nodes ( IN formation_id text default 'default', IN group_id int default NULL, OUT node_id bigint, OUT node_name text, OUT node_host text, OUT node_port int, OUT node_lsn pg_lsn, OUT node_is_primary bool ) RETURNS SETOF record LANGUAGE C AS 'MODULE_PATHNAME', $$get_nodes$$; comment on function pgautofailover.get_nodes(text,int) is 'get all the nodes in a group'; grant execute on function pgautofailover.get_nodes(text,int) to autoctl_node; DROP FUNCTION pgautofailover.get_primary(text,int); CREATE FUNCTION pgautofailover.get_primary ( IN formation_id text default 'default', IN group_id int default 0, OUT primary_node_id bigint, OUT primary_name text, OUT primary_host text, OUT primary_port int ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$get_primary$$; comment on function pgautofailover.get_primary(text,int) is 'get the writable node for a group'; grant execute on function pgautofailover.get_primary(text,int) to autoctl_node; CREATE FUNCTION pgautofailover.get_other_nodes ( IN nodeid bigint, OUT node_id bigint, OUT node_name text, OUT node_host text, OUT node_port int, OUT node_lsn pg_lsn, OUT node_is_primary bool ) RETURNS SETOF record LANGUAGE C STRICT AS 'MODULE_PATHNAME', $$get_other_nodes$$; comment on function pgautofailover.get_other_nodes(bigint) is 'get the other nodes in 
a group'; grant execute on function pgautofailover.get_other_nodes(bigint) to autoctl_node; CREATE FUNCTION pgautofailover.get_other_nodes ( IN nodeid bigint, IN current_state pgautofailover.replication_state, OUT node_id bigint, OUT node_name text, OUT node_host text, OUT node_port int, OUT node_lsn pg_lsn, OUT node_is_primary bool ) RETURNS SETOF record LANGUAGE C STRICT AS 'MODULE_PATHNAME', $$get_other_nodes$$; comment on function pgautofailover.get_other_nodes (bigint,pgautofailover.replication_state) is 'get the other nodes in a group, filtering on current_state'; grant execute on function pgautofailover.get_other_nodes (bigint,pgautofailover.replication_state) to autoctl_node; DROP FUNCTION pgautofailover.remove_node(int); CREATE FUNCTION pgautofailover.remove_node ( node_id bigint, force bool default 'false' ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$remove_node_by_nodeid$$; comment on function pgautofailover.remove_node(bigint,bool) is 'remove a node from the monitor'; grant execute on function pgautofailover.remove_node(bigint,bool) to autoctl_node; DROP FUNCTION pgautofailover.remove_node(text,int); CREATE FUNCTION pgautofailover.remove_node ( node_host text, node_port int default 5432, force bool default 'false' ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$remove_node_by_host$$; comment on function pgautofailover.remove_node(text,int,bool) is 'remove a node from the monitor'; grant execute on function pgautofailover.remove_node(text,int,bool) to autoctl_node; DROP FUNCTION pgautofailover.start_maintenance(node_id int); CREATE FUNCTION pgautofailover.start_maintenance(node_id bigint) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$start_maintenance$$; comment on function pgautofailover.start_maintenance(bigint) is 'set a node in maintenance state'; grant execute on function pgautofailover.start_maintenance(bigint) to autoctl_node; DROP FUNCTION 
pgautofailover.stop_maintenance(node_id int); CREATE FUNCTION pgautofailover.stop_maintenance(node_id bigint) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$stop_maintenance$$; comment on function pgautofailover.stop_maintenance(bigint) is 'set a node out of maintenance state'; grant execute on function pgautofailover.stop_maintenance(bigint) to autoctl_node; CREATE OR REPLACE FUNCTION pgautofailover.update_secondary_check() RETURNS trigger LANGUAGE 'plpgsql' AS $$ declare nodeid bigint := null; reportedstate pgautofailover.replication_state := null; begin -- when secondary changes from true to false, check all nodes remaining are primary if new.opt_secondary is false and new.opt_secondary is distinct from old.opt_secondary then select node.nodeid, node.reportedstate into nodeid, reportedstate from pgautofailover.node where node.formationid = new.formationid and node.reportedstate <> 'single' and node.goalstate <> 'dropped'; if nodeid is not null then raise exception object_not_in_prerequisite_state using message = 'formation has nodes that are not in SINGLE state', detail = 'nodeid ' || nodeid || ' is in state ' || reportedstate, hint = 'drop secondary nodes before disabling secondaries on formation'; end if; end if; return new; end $$; comment on function pgautofailover.update_secondary_check() is 'performs a check when changes to hassecondary on pgautofailover.formation are made, verifying cluster state allows the change'; CREATE TRIGGER disable_secondary_check BEFORE UPDATE ON pgautofailover.formation FOR EACH ROW EXECUTE PROCEDURE pgautofailover.update_secondary_check(); CREATE FUNCTION pgautofailover.last_events ( count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodehost, nodeport, reportedstate, goalstate, reportedrepstate, reportedtli, reportedlsn, candidatepriority, replicationquorum, description from 
pgautofailover.event order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; comment on function pgautofailover.last_events(int) is 'retrieve last COUNT events'; grant execute on function pgautofailover.last_events(int) to autoctl_node; CREATE FUNCTION pgautofailover.last_events ( formation_id text default 'default', count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodehost, nodeport, reportedstate, goalstate, reportedrepstate, reportedtli, reportedlsn, candidatepriority, replicationquorum, description from pgautofailover.event where formationid = formation_id order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; comment on function pgautofailover.last_events(text,int) is 'retrieve last COUNT events for given formation'; grant execute on function pgautofailover.last_events(text,int) to autoctl_node; CREATE FUNCTION pgautofailover.last_events ( formation_id text, group_id int, count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodehost, nodeport, reportedstate, goalstate, reportedrepstate, reportedtli, reportedlsn, candidatepriority, replicationquorum, description from pgautofailover.event where formationid = formation_id and groupid = group_id order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; comment on function pgautofailover.last_events(text,int,int) is 'retrieve last COUNT events for given formation and group'; grant execute on function pgautofailover.last_events(text,int,int) to autoctl_node; CREATE FUNCTION pgautofailover.current_state ( IN formation_id text default 'default', OUT formation_kind text, OUT nodename text, OUT nodehost text, OUT nodeport int, OUT group_id int, OUT node_id bigint, 
OUT current_group_state pgautofailover.replication_state, OUT assigned_group_state pgautofailover.replication_state, OUT candidate_priority int, OUT replication_quorum bool, OUT reported_tli int, OUT reported_lsn pg_lsn, OUT health integer, OUT nodecluster text ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ select kind, nodename, nodehost, nodeport, groupid, nodeid, reportedstate, goalstate, candidatepriority, replicationquorum, reportedtli, reportedlsn, health, nodecluster from pgautofailover.node join pgautofailover.formation using(formationid) where formationid = formation_id order by groupid, nodeid; $$; comment on function pgautofailover.current_state(text) is 'get the current state of both nodes of a formation'; grant execute on function pgautofailover.current_state(text) to autoctl_node; CREATE FUNCTION pgautofailover.current_state ( IN formation_id text, IN group_id int, OUT formation_kind text, OUT nodename text, OUT nodehost text, OUT nodeport int, OUT group_id int, OUT node_id bigint, OUT current_group_state pgautofailover.replication_state, OUT assigned_group_state pgautofailover.replication_state, OUT candidate_priority int, OUT replication_quorum bool, OUT reported_tli int, OUT reported_lsn pg_lsn, OUT health integer, OUT nodecluster text ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ select kind, nodename, nodehost, nodeport, groupid, nodeid, reportedstate, goalstate, candidatepriority, replicationquorum, reportedtli, reportedlsn, health, nodecluster from pgautofailover.node join pgautofailover.formation using(formationid) where formationid = formation_id and groupid = group_id order by groupid, nodeid; $$; grant execute on function pgautofailover.current_state(text, int) to autoctl_node; comment on function pgautofailover.current_state(text, int) is 'get the current state of both nodes of a group in a formation'; 
pg_auto_failover-1.6.3/src/monitor/pgautofailover--1.6--dummy.sql000066400000000000000000000003271414244367200246720ustar00rootroot00000000000000-- -- dummy extension update file that does nothing -- -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "ALTER EXTENSION pgautofailover UPDATE TO dummy" to load this file. \quit pg_auto_failover-1.6.3/src/monitor/pgautofailover.control000066400000000000000000000002141414244367200237610ustar00rootroot00000000000000comment = 'pg_auto_failover' default_version = '1.6' module_pathname = '$libdir/pgautofailover' relocatable = false requires = 'btree_gist' pg_auto_failover-1.6.3/src/monitor/pgautofailover.sql000066400000000000000000000666301414244367200231160ustar00rootroot00000000000000-- Copyright (c) Microsoft Corporation. All rights reserved. -- Licensed under the PostgreSQL License. -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pgautofailover" to load this file. 
\quit DO $body$ BEGIN if not exists (select * from pg_catalog.pg_user where usename = 'autoctl_node') then create role autoctl_node with login; end if; END $body$; CREATE SCHEMA pgautofailover; GRANT USAGE ON SCHEMA pgautofailover TO autoctl_node; CREATE TYPE pgautofailover.replication_state AS ENUM ( 'unknown', 'init', 'single', 'wait_primary', 'primary', 'draining', 'demote_timeout', 'demoted', 'catchingup', 'secondary', 'prepare_promotion', 'stop_replication', 'wait_standby', 'maintenance', 'join_primary', 'apply_settings', 'prepare_maintenance', 'wait_maintenance', 'report_lsn', 'fast_forward', 'join_secondary', 'dropped' ); CREATE TABLE pgautofailover.formation ( formationid text NOT NULL DEFAULT 'default', kind text NOT NULL DEFAULT 'pgsql', dbname name NOT NULL DEFAULT 'postgres', opt_secondary bool NOT NULL DEFAULT true, number_sync_standbys int NOT NULL DEFAULT 0, PRIMARY KEY (formationid), CHECK (kind IN ('pgsql', 'citus')) ); insert into pgautofailover.formation (formationid) values ('default'); CREATE FUNCTION pgautofailover.create_formation ( IN formation_id text, IN kind text, IN dbname name, IN opt_secondary bool, IN number_sync_standbys int, OUT formation_id text, OUT kind text, OUT dbname name, OUT opt_secondary bool, OUT number_sync_standbys int ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$create_formation$$; grant execute on function pgautofailover.create_formation(text,text,name,bool,int) to autoctl_node; CREATE FUNCTION pgautofailover.drop_formation ( IN formation_id text ) RETURNS void LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$drop_formation$$; grant execute on function pgautofailover.drop_formation(text) to autoctl_node; CREATE FUNCTION pgautofailover.set_formation_number_sync_standbys ( IN formation_id text, IN number_sync_standbys int ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$set_formation_number_sync_standbys$$; grant execute on function 
pgautofailover.set_formation_number_sync_standbys(text, int) to autoctl_node; CREATE TABLE pgautofailover.node ( formationid text not null default 'default', nodeid bigserial, groupid int not null, nodename text not null, nodehost text not null, nodeport int not null, sysidentifier bigint, goalstate pgautofailover.replication_state not null default 'init', reportedstate pgautofailover.replication_state not null, reportedpgisrunning bool default true, reportedrepstate text default 'async', reporttime timestamptz not null default now(), reportedtli int not null default 1 check (reportedtli > 0), reportedlsn pg_lsn not null default '0/0', walreporttime timestamptz not null default now(), health integer not null default -1, healthchecktime timestamptz not null default now(), statechangetime timestamptz not null default now(), candidatepriority int not null default 100, replicationquorum bool not null default true, nodecluster text not null default 'default', -- node names must be unique in a given formation UNIQUE (formationid, nodename), -- any nodehost:port can only be a unique node in the system UNIQUE (nodehost, nodeport), -- -- The EXCLUDE constraint only allows the same sysidentifier for all the -- nodes in the same group. The system_identifier is a property that is -- kept when implementing streaming replication and should be unique per -- Postgres instance in all other cases. -- -- We allow the sysidentifier column to be NULL when registering a new -- primary server from scratch, because we have not done pg_ctl initdb -- at the time we call the register_node() function. 
-- CONSTRAINT system_identifier_is_null_at_init_only CHECK ( ( sysidentifier IS NULL AND reportedstate IN ( 'init', 'wait_standby', 'catchingup', 'dropped' ) ) OR sysidentifier IS NOT NULL ), CONSTRAINT same_system_identifier_within_group EXCLUDE USING gist(formationid with =, groupid with =, sysidentifier with <>) DEFERRABLE INITIALLY DEFERRED, PRIMARY KEY (nodeid), FOREIGN KEY (formationid) REFERENCES pgautofailover.formation(formationid) ) -- we expect few rows and lots of UPDATE, let's benefit from HOT WITH (fillfactor = 25); CREATE TABLE pgautofailover.event ( eventid bigserial not null, eventtime timestamptz not null default now(), formationid text not null, nodeid bigint not null, groupid int not null, nodename text not null, nodehost text not null, nodeport integer not null, reportedstate pgautofailover.replication_state not null, goalstate pgautofailover.replication_state not null, reportedrepstate text, reportedtli int not null default 1 check (reportedtli > 0), reportedlsn pg_lsn not null default '0/0', candidatepriority int, replicationquorum bool, description text, PRIMARY KEY (eventid) ); GRANT SELECT ON ALL TABLES IN SCHEMA pgautofailover TO autoctl_node; CREATE FUNCTION pgautofailover.set_node_system_identifier ( IN node_id bigint, IN node_sysidentifier bigint, OUT node_id bigint, OUT node_name text, OUT node_host text, OUT node_port int ) RETURNS record LANGUAGE SQL STRICT SECURITY DEFINER AS $$ update pgautofailover.node set sysidentifier = node_sysidentifier where nodeid = set_node_system_identifier.node_id returning nodeid, nodename, nodehost, nodeport; $$; grant execute on function pgautofailover.set_node_system_identifier(bigint,bigint) to autoctl_node; CREATE FUNCTION pgautofailover.set_group_system_identifier ( IN group_id bigint, IN node_sysidentifier bigint, OUT node_id bigint, OUT node_name text, OUT node_host text, OUT node_port int ) RETURNS setof record LANGUAGE SQL STRICT SECURITY DEFINER AS $$ update pgautofailover.node set 
sysidentifier = node_sysidentifier where groupid = set_group_system_identifier.group_id and sysidentifier = 0 returning nodeid, nodename, nodehost, nodeport; $$; grant execute on function pgautofailover.set_group_system_identifier(bigint,bigint) to autoctl_node; CREATE FUNCTION pgautofailover.update_node_metadata ( IN node_id bigint, IN node_name text, IN node_host text, IN node_port int ) RETURNS boolean LANGUAGE C SECURITY DEFINER AS 'MODULE_PATHNAME', $$update_node_metadata$$; grant execute on function pgautofailover.update_node_metadata(bigint,text,text,int) to autoctl_node; CREATE FUNCTION pgautofailover.register_node ( IN formation_id text, IN node_host text, IN node_port int, IN dbname name, IN node_name text default '', IN sysidentifier bigint default 0, IN desired_node_id bigint default -1, IN desired_group_id int default -1, IN initial_group_role pgautofailover.replication_state default 'init', IN node_kind text default 'standalone', IN candidate_priority int default 100, IN replication_quorum bool default true, IN node_cluster text default 'default', OUT assigned_node_id bigint, OUT assigned_group_id int, OUT assigned_group_state pgautofailover.replication_state, OUT assigned_candidate_priority int, OUT assigned_replication_quorum bool, OUT assigned_node_name text ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$register_node$$; grant execute on function pgautofailover.register_node(text,text,int,name,text,bigint,bigint,int, pgautofailover.replication_state,text, int,bool,text) to autoctl_node; CREATE FUNCTION pgautofailover.node_active ( IN formation_id text, IN node_id bigint, IN group_id int, IN current_group_role pgautofailover.replication_state default 'init', IN current_pg_is_running bool default true, IN current_tli integer default 1, IN current_lsn pg_lsn default '0/0', IN current_rep_state text default '', OUT assigned_node_id bigint, OUT assigned_group_id int, OUT assigned_group_state pgautofailover.replication_state, 
OUT assigned_candidate_priority int, OUT assigned_replication_quorum bool ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$node_active$$; grant execute on function pgautofailover.node_active(text,bigint,int, pgautofailover.replication_state,bool,int,pg_lsn,text) to autoctl_node; CREATE FUNCTION pgautofailover.get_nodes ( IN formation_id text default 'default', IN group_id int default NULL, OUT node_id bigint, OUT node_name text, OUT node_host text, OUT node_port int, OUT node_lsn pg_lsn, OUT node_is_primary bool ) RETURNS SETOF record LANGUAGE C AS 'MODULE_PATHNAME', $$get_nodes$$; comment on function pgautofailover.get_nodes(text,int) is 'get all the nodes in a group'; grant execute on function pgautofailover.get_nodes(text,int) to autoctl_node; CREATE FUNCTION pgautofailover.get_primary ( IN formation_id text default 'default', IN group_id int default 0, OUT primary_node_id bigint, OUT primary_name text, OUT primary_host text, OUT primary_port int ) RETURNS record LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$get_primary$$; comment on function pgautofailover.get_primary(text,int) is 'get the writable node for a group'; grant execute on function pgautofailover.get_primary(text,int) to autoctl_node; CREATE FUNCTION pgautofailover.get_other_nodes ( IN nodeid bigint, OUT node_id bigint, OUT node_name text, OUT node_host text, OUT node_port int, OUT node_lsn pg_lsn, OUT node_is_primary bool ) RETURNS SETOF record LANGUAGE C STRICT AS 'MODULE_PATHNAME', $$get_other_nodes$$; comment on function pgautofailover.get_other_nodes(bigint) is 'get the other nodes in a group'; grant execute on function pgautofailover.get_other_nodes(bigint) to autoctl_node; CREATE FUNCTION pgautofailover.get_other_nodes ( IN nodeid bigint, IN current_state pgautofailover.replication_state, OUT node_id bigint, OUT node_name text, OUT node_host text, OUT node_port int, OUT node_lsn pg_lsn, OUT node_is_primary bool ) RETURNS SETOF record LANGUAGE C STRICT AS 
'MODULE_PATHNAME', $$get_other_nodes$$; comment on function pgautofailover.get_other_nodes (bigint,pgautofailover.replication_state) is 'get the other nodes in a group, filtering on current_state'; grant execute on function pgautofailover.get_other_nodes (bigint,pgautofailover.replication_state) to autoctl_node; CREATE FUNCTION pgautofailover.get_coordinator ( IN formation_id text default 'default', OUT node_host text, OUT node_port int ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ select nodehost, nodeport from pgautofailover.node join pgautofailover.formation using(formationid) where formationid = formation_id and groupid = 0 and goalstate in ('single', 'wait_primary', 'primary') and reportedstate in ('single', 'wait_primary', 'primary'); $$; grant execute on function pgautofailover.get_coordinator(text) to autoctl_node; CREATE FUNCTION pgautofailover.get_most_advanced_standby ( IN formationid text default 'default', IN groupid int default 0, OUT node_id bigint, OUT node_name text, OUT node_host text, OUT node_port int, OUT node_lsn pg_lsn, OUT node_is_primary bool ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ select nodeid, nodename, nodehost, nodeport, reportedlsn, false from pgautofailover.node where formationid = $1 and groupid = $2 and reportedstate = 'report_lsn' order by reportedlsn desc, health desc limit 1; $$; grant execute on function pgautofailover.get_most_advanced_standby(text,int) to autoctl_node; CREATE FUNCTION pgautofailover.remove_node ( node_id bigint, force bool default 'false' ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$remove_node_by_nodeid$$; comment on function pgautofailover.remove_node(bigint,bool) is 'remove a node from the monitor'; grant execute on function pgautofailover.remove_node(bigint,bool) to autoctl_node; CREATE FUNCTION pgautofailover.remove_node ( node_host text, node_port int default 5432, force bool default 'false' ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', 
$$remove_node_by_host$$; comment on function pgautofailover.remove_node(text,int,bool) is 'remove a node from the monitor'; grant execute on function pgautofailover.remove_node(text,int,bool) to autoctl_node; CREATE FUNCTION pgautofailover.perform_failover ( formation_id text default 'default', group_id int default 0 ) RETURNS void LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$perform_failover$$; comment on function pgautofailover.perform_failover(text,int) is 'manually failover from the primary to the secondary'; grant execute on function pgautofailover.perform_failover(text,int) to autoctl_node; CREATE FUNCTION pgautofailover.perform_promotion ( formation_id text, node_name text ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$perform_promotion$$; comment on function pgautofailover.perform_promotion(text,text) is 'manually failover from the primary to the given node'; grant execute on function pgautofailover.perform_promotion(text,text) to autoctl_node; CREATE FUNCTION pgautofailover.start_maintenance(node_id bigint) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$start_maintenance$$; comment on function pgautofailover.start_maintenance(bigint) is 'set a node in maintenance state'; grant execute on function pgautofailover.start_maintenance(bigint) to autoctl_node; CREATE FUNCTION pgautofailover.stop_maintenance(node_id bigint) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$stop_maintenance$$; comment on function pgautofailover.stop_maintenance(bigint) is 'set a node out of maintenance state'; grant execute on function pgautofailover.stop_maintenance(bigint) to autoctl_node; CREATE FUNCTION pgautofailover.last_events ( count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodehost, nodeport, reportedstate, goalstate, reportedrepstate, reportedtli, reportedlsn, 
candidatepriority, replicationquorum, description from pgautofailover.event order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; comment on function pgautofailover.last_events(int) is 'retrieve last COUNT events'; grant execute on function pgautofailover.last_events(int) to autoctl_node; CREATE FUNCTION pgautofailover.last_events ( formation_id text default 'default', count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodehost, nodeport, reportedstate, goalstate, reportedrepstate, reportedtli, reportedlsn, candidatepriority, replicationquorum, description from pgautofailover.event where formationid = formation_id order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; comment on function pgautofailover.last_events(text,int) is 'retrieve last COUNT events for given formation'; grant execute on function pgautofailover.last_events(text,int) to autoctl_node; CREATE FUNCTION pgautofailover.last_events ( formation_id text, group_id int, count int default 10 ) RETURNS SETOF pgautofailover.event LANGUAGE SQL STRICT AS $$ with last_events as ( select eventid, eventtime, formationid, nodeid, groupid, nodename, nodehost, nodeport, reportedstate, goalstate, reportedrepstate, reportedtli, reportedlsn, candidatepriority, replicationquorum, description from pgautofailover.event where formationid = formation_id and groupid = group_id order by eventid desc limit count ) select * from last_events order by eventtime, eventid; $$; comment on function pgautofailover.last_events(text,int,int) is 'retrieve last COUNT events for given formation and group'; grant execute on function pgautofailover.last_events(text,int,int) to autoctl_node; CREATE FUNCTION pgautofailover.current_state ( IN formation_id text default 'default', OUT formation_kind text, OUT nodename text, OUT nodehost text, OUT 
nodeport int, OUT group_id int, OUT node_id bigint, OUT current_group_state pgautofailover.replication_state, OUT assigned_group_state pgautofailover.replication_state, OUT candidate_priority int, OUT replication_quorum bool, OUT reported_tli int, OUT reported_lsn pg_lsn, OUT health integer, OUT nodecluster text ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ select kind, nodename, nodehost, nodeport, groupid, nodeid, reportedstate, goalstate, candidatepriority, replicationquorum, reportedtli, reportedlsn, health, nodecluster from pgautofailover.node join pgautofailover.formation using(formationid) where formationid = formation_id order by groupid, nodeid; $$; comment on function pgautofailover.current_state(text) is 'get the current state of both nodes of a formation'; grant execute on function pgautofailover.current_state(text) to autoctl_node; CREATE FUNCTION pgautofailover.current_state ( IN formation_id text, IN group_id int, OUT formation_kind text, OUT nodename text, OUT nodehost text, OUT nodeport int, OUT group_id int, OUT node_id bigint, OUT current_group_state pgautofailover.replication_state, OUT assigned_group_state pgautofailover.replication_state, OUT candidate_priority int, OUT replication_quorum bool, OUT reported_tli int, OUT reported_lsn pg_lsn, OUT health integer, OUT nodecluster text ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ select kind, nodename, nodehost, nodeport, groupid, nodeid, reportedstate, goalstate, candidatepriority, replicationquorum, reportedtli, reportedlsn, health, nodecluster from pgautofailover.node join pgautofailover.formation using(formationid) where formationid = formation_id and groupid = group_id order by groupid, nodeid; $$; comment on function pgautofailover.current_state(text, int) is 'get the current state of both nodes of a group in a formation'; grant execute on function pgautofailover.current_state(text, int) to autoctl_node; CREATE FUNCTION pgautofailover.formation_uri ( IN formation_id text DEFAULT 
'default', IN cluster_name text DEFAULT 'default', IN sslmode text DEFAULT 'prefer', IN sslrootcert text DEFAULT '', IN sslcrl text DEFAULT '' ) RETURNS text LANGUAGE SQL STRICT AS $$ select case when string_agg(format('%s:%s', nodehost, nodeport),',') is not null then format( 'postgres://%s/%s?%ssslmode=%s%s%s', string_agg(format('%s:%s', nodehost, nodeport),','), -- as we join formation on node we get the same dbname for all -- entries, pick one. min(dbname), case when cluster_name = 'default' then 'target_session_attrs=read-write&' else '' end, min(sslmode), CASE WHEN min(sslrootcert) = '' THEN '' ELSE '&sslrootcert=' || sslrootcert END, CASE WHEN min(sslcrl) = '' THEN '' ELSE '&sslcrl=' || sslcrl END ) end as uri from pgautofailover.node as node join pgautofailover.formation using(formationid) where formationid = formation_id and groupid = 0 and nodecluster = cluster_name; $$; CREATE FUNCTION pgautofailover.enable_secondary ( formation_id text ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$enable_secondary$$; comment on function pgautofailover.enable_secondary(text) is 'changes the state of a formation to assign secondaries for nodes when added'; CREATE FUNCTION pgautofailover.disable_secondary ( formation_id text ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$disable_secondary$$; comment on function pgautofailover.disable_secondary(text) is 'changes the state of a formation to disable the assignment of secondaries for nodes when added'; CREATE OR REPLACE FUNCTION pgautofailover.update_secondary_check() RETURNS trigger LANGUAGE 'plpgsql' AS $$ declare nodeid bigint := null; reportedstate pgautofailover.replication_state := null; begin -- when secondary changes from true to false, check all nodes remaining are primary if new.opt_secondary is false and new.opt_secondary is distinct from old.opt_secondary then select node.nodeid, node.reportedstate into nodeid, reportedstate from pgautofailover.node where 
node.formationid = new.formationid and node.reportedstate <> 'single' and node.goalstate <> 'dropped'; if nodeid is not null then raise exception object_not_in_prerequisite_state using message = 'formation has nodes that are not in SINGLE state', detail = 'nodeid ' || nodeid || ' is in state ' || reportedstate, hint = 'drop secondary nodes before disabling secondaries on formation'; end if; end if; return new; end $$; comment on function pgautofailover.update_secondary_check() is 'performs a check when changes to hassecondary on pgautofailover.formation are made, verifying cluster state allows the change'; CREATE TRIGGER disable_secondary_check BEFORE UPDATE ON pgautofailover.formation FOR EACH ROW EXECUTE PROCEDURE pgautofailover.update_secondary_check(); CREATE FUNCTION pgautofailover.set_node_candidate_priority ( IN formation_id text, IN node_name text, IN candidate_priority int ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$set_node_candidate_priority$$; comment on function pgautofailover.set_node_candidate_priority(text, text, int) is 'sets the candidate priority value for a node. Expects a priority value between 0 and 100. 0 if the node is not a candidate to be promoted to be primary.'; grant execute on function pgautofailover.set_node_candidate_priority(text, text, int) to autoctl_node; CREATE FUNCTION pgautofailover.set_node_replication_quorum ( IN formation_id text, IN node_name text, IN replication_quorum bool ) RETURNS bool LANGUAGE C STRICT SECURITY DEFINER AS 'MODULE_PATHNAME', $$set_node_replication_quorum$$; comment on function pgautofailover.set_node_replication_quorum(text, text, bool) is 'sets the replication quorum value for a node. 
true if the node participates in write quorum'; grant execute on function pgautofailover.set_node_replication_quorum(text, text, bool) to autoctl_node; create function pgautofailover.synchronous_standby_names ( IN formation_id text default 'default', IN group_id int default 0 ) returns text language C strict AS 'MODULE_PATHNAME', $$synchronous_standby_names$$; comment on function pgautofailover.synchronous_standby_names(text, int) is 'get the synchronous_standby_names setting for a given group'; grant execute on function pgautofailover.synchronous_standby_names(text, int) to autoctl_node; CREATE FUNCTION pgautofailover.formation_settings ( IN formation_id text default 'default', OUT context text, OUT group_id int, OUT node_id bigint, OUT nodename text, OUT setting text, OUT value text ) RETURNS SETOF record LANGUAGE SQL STRICT AS $$ with groups(formationid, groupid) as ( select formationid, groupid from pgautofailover.node where formationid = formation_id group by formationid, groupid ) -- context: formation, number_sync_standbys select 'formation' as context, NULL as group_id, NULL as node_id, formationid as nodename, 'number_sync_standbys' as setting, cast(number_sync_standbys as text) as value from pgautofailover.formation where formationid = formation_id union all -- context: primary, one entry per group in the formation select 'primary', groups.groupid, nodes.node_id, nodes.node_name, 'synchronous_standby_names', format('''%s''', pgautofailover.synchronous_standby_names(formationid, groupid)) from groups, pgautofailover.get_nodes(formationid, groupid) as nodes where node_is_primary union all ( -- context: node, one entry per node in the formation select 'node', node.groupid, node.nodeid, node.nodename, 'replication quorum', cast(node.replicationquorum as text) from pgautofailover.node as node where node.formationid = formation_id order by nodeid ) union all ( select 'node', node.groupid, node.nodeid, node.nodename, 'candidate priority', 
cast(node.candidatepriority as text) from pgautofailover.node as node where node.formationid = formation_id order by nodeid ) $$; comment on function pgautofailover.formation_settings(text) is 'get the current replication settings a formation'; pg_auto_failover-1.6.3/src/monitor/replication_state.c000066400000000000000000000122701414244367200232120ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/replication_state.c * * Implementation of functions related to (de)serialising replication * states. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * *------------------------------------------------------------------------- */ #include "postgres.h" #include "c.h" #include "metadata.h" #include "replication_state.h" #include "version_compat.h" #include "access/htup.h" #include "access/htup_details.h" #include "catalog/pg_enum.h" #include "nodes/makefuncs.h" #include "nodes/parsenodes.h" #include "nodes/value.h" #include "parser/parse_type.h" #include "utils/syscache.h" /* private function forward declarations */ static bool IsReplicationStateName(char *name, ReplicationState replicationState); /* * ReplicationStateTypeOid returns the OID of the * pgautofailover.replication_state type. */ Oid ReplicationStateTypeOid(void) { Value *schemaName = makeString(AUTO_FAILOVER_SCHEMA_NAME); Value *typeName = makeString(REPLICATION_STATE_TYPE_NAME); List *enumTypeNameList = list_make2(schemaName, typeName); TypeName *enumTypeName = makeTypeNameFromNameList(enumTypeNameList); Oid enumTypeOid = typenameTypeId(NULL, enumTypeName); return enumTypeOid; } /* * EnumGetReplicationState returns the internal value of a replication state enum. 
*/ ReplicationState EnumGetReplicationState(Oid replicationStateOid) { HeapTuple enumTuple = SearchSysCache1(ENUMOID, ObjectIdGetDatum(replicationStateOid)); if (!HeapTupleIsValid(enumTuple)) { ereport(ERROR, (errmsg("invalid input value for enum: %u", replicationStateOid))); } Form_pg_enum enumForm = (Form_pg_enum) GETSTRUCT(enumTuple); char *enumName = NameStr(enumForm->enumlabel); ReplicationState replicationState = NameGetReplicationState(enumName); ReleaseSysCache(enumTuple); return replicationState; } /* * ReplicationStateGetEnum returns the enum value of an internal replication * state. */ Oid ReplicationStateGetEnum(ReplicationState replicationState) { const char *enumName = ReplicationStateGetName(replicationState); Oid enumTypeOid = ReplicationStateTypeOid(); HeapTuple enumTuple = SearchSysCache2(ENUMTYPOIDNAME, ObjectIdGetDatum(enumTypeOid), CStringGetDatum(enumName)); if (!HeapTupleIsValid(enumTuple)) { ereport(ERROR, (errmsg("invalid value for enum: %d", replicationState))); } Oid replicationStateOid = HeapTupleGetOid(enumTuple); ReleaseSysCache(enumTuple); return replicationStateOid; } /* * NameGetReplicationState returns the value of a replication state as an * integer. */ ReplicationState NameGetReplicationState(char *replicationStateName) { ReplicationState replicationState = REPLICATION_STATE_INITIAL; for (replicationState = REPLICATION_STATE_INITIAL; !IsReplicationStateName(replicationStateName, replicationState) && replicationState < REPLICATION_STATE_UNKNOWN; replicationState++) { } return replicationState; } /* * IsReplicationStateName returns true if the given name is the name of the * replication state, and false otherwise. 
*/ static bool IsReplicationStateName(char *name, ReplicationState replicationState) { const char *replicationStateName = ReplicationStateGetName(replicationState); if (strncmp(name, replicationStateName, NAMEDATALEN) == 0) { return true; } return false; } /* * ReplicationStateGetName returns the (enum) name of a replication state. */ const char * ReplicationStateGetName(ReplicationState replicationState) { switch (replicationState) { case REPLICATION_STATE_INITIAL: { return "init"; } case REPLICATION_STATE_SINGLE: { return "single"; } case REPLICATION_STATE_WAIT_PRIMARY: { return "wait_primary"; } case REPLICATION_STATE_PRIMARY: { return "primary"; } case REPLICATION_STATE_DRAINING: { return "draining"; } case REPLICATION_STATE_DEMOTE_TIMEOUT: { return "demote_timeout"; } case REPLICATION_STATE_DEMOTED: { return "demoted"; } case REPLICATION_STATE_CATCHINGUP: { return "catchingup"; } case REPLICATION_STATE_SECONDARY: { return "secondary"; } case REPLICATION_STATE_PREPARE_PROMOTION: { return "prepare_promotion"; } case REPLICATION_STATE_STOP_REPLICATION: { return "stop_replication"; } case REPLICATION_STATE_WAIT_STANDBY: { return "wait_standby"; } case REPLICATION_STATE_MAINTENANCE: { return "maintenance"; } case REPLICATION_STATE_JOIN_PRIMARY: { return "join_primary"; } case REPLICATION_STATE_APPLY_SETTINGS: { return "apply_settings"; } case REPLICATION_STATE_PREPARE_MAINTENANCE: { return "prepare_maintenance"; } case REPLICATION_STATE_WAIT_MAINTENANCE: { return "wait_maintenance"; } case REPLICATION_STATE_REPORT_LSN: { return "report_lsn"; } case REPLICATION_STATE_FAST_FORWARD: { return "fast_forward"; } case REPLICATION_STATE_JOIN_SECONDARY: { return "join_secondary"; } case REPLICATION_STATE_DROPPED: { return "dropped"; } default: { ereport(ERROR, (errmsg("bug: unknown replication state (%d)", replicationState))); } } } 
pg_auto_failover-1.6.3/src/monitor/replication_state.h000066400000000000000000000033151414244367200232170ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/replication_state.h * * Declarations for public functions and types related to (de)serialising * replication states. * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * *------------------------------------------------------------------------- */ #pragma once /* * ReplicationState represents the current role of a node in a group. */ typedef enum ReplicationState { REPLICATION_STATE_INITIAL = 0, REPLICATION_STATE_SINGLE = 1, REPLICATION_STATE_WAIT_PRIMARY = 2, REPLICATION_STATE_PRIMARY = 3, REPLICATION_STATE_DRAINING = 4, REPLICATION_STATE_DEMOTE_TIMEOUT = 5, REPLICATION_STATE_DEMOTED = 6, REPLICATION_STATE_CATCHINGUP = 7, REPLICATION_STATE_SECONDARY = 8, REPLICATION_STATE_PREPARE_PROMOTION = 9, REPLICATION_STATE_STOP_REPLICATION = 10, REPLICATION_STATE_WAIT_STANDBY = 11, REPLICATION_STATE_MAINTENANCE = 12, REPLICATION_STATE_JOIN_PRIMARY = 13, REPLICATION_STATE_APPLY_SETTINGS = 14, REPLICATION_STATE_PREPARE_MAINTENANCE = 15, REPLICATION_STATE_WAIT_MAINTENANCE = 16, REPLICATION_STATE_REPORT_LSN = 17, REPLICATION_STATE_FAST_FORWARD = 18, REPLICATION_STATE_JOIN_SECONDARY = 19, REPLICATION_STATE_DROPPED = 20, REPLICATION_STATE_UNKNOWN = 21 } ReplicationState; /* declarations of public functions */ extern Oid ReplicationStateTypeOid(void); extern ReplicationState EnumGetReplicationState(Oid replicationStateOid); extern Oid ReplicationStateGetEnum(ReplicationState replicationState); extern ReplicationState NameGetReplicationState(char *replicationStateName); extern const char * ReplicationStateGetName(ReplicationState replicationState); 
pg_auto_failover-1.6.3/src/monitor/sql/000077500000000000000000000000001414244367200201325ustar00rootroot00000000000000pg_auto_failover-1.6.3/src/monitor/sql/create_extension.sql000066400000000000000000000002211414244367200242050ustar00rootroot00000000000000-- Copyright (c) Microsoft Corporation. All rights reserved. -- Licensed under the PostgreSQL License. create extension pgautofailover cascade; pg_auto_failover-1.6.3/src/monitor/sql/drop_extension.sql000066400000000000000000000002071414244367200237120ustar00rootroot00000000000000-- Copyright (c) Microsoft Corporation. All rights reserved. -- Licensed under the PostgreSQL License. drop extension pgautofailover; pg_auto_failover-1.6.3/src/monitor/sql/dummy_update.sql000066400000000000000000000007441414244367200233550ustar00rootroot00000000000000-- Copyright (c) Microsoft Corporation. All rights reserved. -- Licensed under the PostgreSQL License. select version from pg_available_extension_versions where name = 'pgautofailover' and version = 'dummy'; alter extension pgautofailover update to dummy; select installed_version from pg_available_extensions where name = 'pgautofailover'; -- should error because installed extension isn't compatible with .so select * from pgautofailover.get_primary('unknown formation'); pg_auto_failover-1.6.3/src/monitor/sql/monitor.sql000066400000000000000000000051301414244367200223410ustar00rootroot00000000000000-- Copyright (c) Microsoft Corporation. All rights reserved. -- Licensed under the PostgreSQL License. 
\x on select * from pgautofailover.register_node('default', 'localhost', 9876, 'postgres'); select * from pgautofailover.set_node_system_identifier(1, 6852685710417058800); -- node_1 reports single select * from pgautofailover.node_active('default', 1, 0, current_group_role => 'single'); -- register node_2 select * from pgautofailover.register_node('default', 'localhost', 9877, 'postgres'); -- node_2 reports wait_standby already select * from pgautofailover.node_active('default', 2, 0, current_group_role => 'wait_standby'); -- node_1 reports single again, and gets assigned wait_primary select * from pgautofailover.node_active('default', 1, 0, current_group_role => 'single'); -- node_1 now reports wait_primary select * from pgautofailover.node_active('default', 1, 0, current_group_role => 'wait_primary'); -- node_2 now reports wait_standby, gets assigned catchingup select * from pgautofailover.node_active('default', 2, 0, current_group_role => 'wait_standby'); -- register node_3 concurrently to node2 (probably) doing pg_basebackup select * from pgautofailover.register_node('default', 'localhost', 9879, 'postgres'); select formationid, nodename, goalstate, reportedstate from pgautofailover.node; table pgautofailover.formation; -- dump the pgautofailover.node table, omitting the timely columns select formationid, nodeid, groupid, nodehost, nodeport, goalstate, reportedstate, reportedpgisrunning, reportedrepstate from pgautofailover.node order by nodeid; select * from pgautofailover.get_primary('unknown formation'); select * from pgautofailover.get_primary(group_id => -10); select * from pgautofailover.get_primary(); select * from pgautofailover.get_primary('default', 0); select * from pgautofailover.get_other_nodes(1); -- remove the primary node select pgautofailover.remove_node(1); table pgautofailover.formation; select pgautofailover.remove_node(1, force => 'true'); -- dump the pgautofailover.node table, omitting the timely columns select formationid, nodeid, 
groupid, nodehost, nodeport, goalstate, reportedstate, reportedpgisrunning, reportedrepstate from pgautofailover.node order by nodeid; select * from pgautofailover.set_node_system_identifier(2, 6852685710417058800); -- should fail as there's no primary at this point select pgautofailover.perform_failover(); pg_auto_failover-1.6.3/src/monitor/sql/upgrade.sql000066400000000000000000000006261414244367200223060ustar00rootroot00000000000000-- Copyright (c) Microsoft Corporation. All rights reserved. -- Licensed under the PostgreSQL License. CREATE EXTENSION pgautofailover VERSION '1.0'; \dx pgautofailover ALTER EXTENSION pgautofailover UPDATE TO '1.1'; \dx pgautofailover ALTER EXTENSION pgautofailover UPDATE TO '1.2'; \dx pgautofailover ALTER EXTENSION pgautofailover UPDATE TO '1.3'; \dx pgautofailover DROP EXTENSION pgautofailover; pg_auto_failover-1.6.3/src/monitor/sql/workers.sql000066400000000000000000000020671414244367200223540ustar00rootroot00000000000000-- Copyright (c) Microsoft Corporation. All rights reserved. -- Licensed under the PostgreSQL License. 
-- This only tests that names are assigned properly \x on -- create a citus formation select * from pgautofailover.create_formation('citus', 'citus', 'citus', true, 0); -- register the first coordinator select * from pgautofailover.register_node('citus', 'localhost', 9876, dbname => 'citus', desired_group_id => 0, node_kind => 'coordinator'); select * from pgautofailover.set_node_system_identifier(4, 6862008014275870855); -- coordinator_1 reports single select * from pgautofailover.node_active('citus', 4, 0, current_group_role => 'single'); -- register first worker select * from pgautofailover.register_node('citus', 'localhost', 9878, dbname => 'citus', desired_group_id => 1, node_kind => 'worker'); pg_auto_failover-1.6.3/src/monitor/version_compat.c000066400000000000000000000027351414244367200225360ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/version_compat.h * Compatibility macros for writing code agnostic to PostgreSQL versions * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * *------------------------------------------------------------------------- */ #include "postgres.h" #if (PG_VERSION_NUM < 110000) /* * The list_qsort API was introduced in Postgres 11: * * https://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=ab72716778128fb63d54ac256adf7fe6820a1185 */ #include "nodes/pg_list.h" #include "version_compat.h" /* * Sort a list using qsort. A sorted list is built but the cells of the * original list are re-used. 
The comparator function receives arguments of * type ListCell ** */ List * list_qsort(const List *list, list_qsort_comparator cmp) { ListCell *cell; int i; int len = list_length(list); ListCell **list_arr; List *new_list; if (len == 0) { return NIL; } i = 0; list_arr = palloc(sizeof(ListCell *) * len); foreach(cell, list) list_arr[i++] = cell; qsort(list_arr, len, sizeof(ListCell *), cmp); new_list = (List *) palloc(sizeof(List)); new_list->type = list->type; new_list->length = len; new_list->head = list_arr[0]; new_list->tail = list_arr[len - 1]; for (i = 0; i < len - 1; i++) { list_arr[i]->next = list_arr[i + 1]; } list_arr[len - 1]->next = NULL; pfree(list_arr); return new_list; } #endif pg_auto_failover-1.6.3/src/monitor/version_compat.h000066400000000000000000000035321414244367200225370ustar00rootroot00000000000000/*------------------------------------------------------------------------- * * src/monitor/version_compat.h * Compatibility macros for writing code agnostic to PostgreSQL versions * * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the PostgreSQL License. * *------------------------------------------------------------------------- */ #ifndef VERSION_COMPAT_H #define VERSION_COMPAT_H #include "postgres.h" /* we support Postgres versions 10, 11, 12, 13, and 14. 
*/ #if (PG_VERSION_NUM < 100000 || PG_VERSION_NUM >= 150000) #error "Unknown or unsupported postgresql version" #endif #if (PG_VERSION_NUM < 110000) #include "postmaster/bgworker.h" #include "utils/memutils.h" #define DEFAULT_XLOG_SEG_SIZE XLOG_SEG_SIZE #define BackgroundWorkerInitializeConnection(dbname, username, flags) \ BackgroundWorkerInitializeConnection(dbname, username) #define BackgroundWorkerInitializeConnectionByOid(dboid, useroid, flags) \ BackgroundWorkerInitializeConnectionByOid(dboid, useroid) #include "nodes/pg_list.h" typedef int (*list_qsort_comparator) (const void *a, const void *b); extern List * list_qsort(const List *list, list_qsort_comparator cmp); #endif #if (PG_VERSION_NUM < 120000) #define table_beginscan_catalog heap_beginscan_catalog #define TableScanDesc HeapScanDesc #endif #if (PG_VERSION_NUM >= 120000) #include "access/htup_details.h" #include "catalog/pg_database.h" static inline Oid HeapTupleGetOid(HeapTuple tuple) { Form_pg_database dbForm = (Form_pg_database) GETSTRUCT(tuple); return dbForm->oid; } #endif #if (PG_VERSION_NUM >= 130000) #include "common/hashfn.h" #define heap_open(r, l) table_open(r, l) #define heap_close(r, l) table_close(r, l) #endif #if (PG_VERSION_NUM < 130000) /* Compatibility for ProcessUtility hook */ #define QueryCompletion char #endif #endif /* VERSION_COMPAT_H */ pg_auto_failover-1.6.3/src/tools/000077500000000000000000000000001414244367200170045ustar00rootroot00000000000000pg_auto_failover-1.6.3/src/tools/pg_autoctl.valgrind000077500000000000000000000004401414244367200226760ustar00rootroot00000000000000#!/bin/bash set -euo pipefail script_dir="${0%/*}" pid=$$ mkdir -p valgrind exec /usr/bin/valgrind --quiet --leak-check=yes --error-markers=VALGRINDERROR-BEGIN,VALGRINDERROR-END --max-stackframe=16000000 --log-file=./valgrind/log.${pid} "${script_dir}/../bin/pg_autoctl/pg_autoctl" "$@" 
pg_auto_failover-1.6.3/src/tools/remove_useless_declarations.sh000077500000000000000000000062611414244367200251400ustar00rootroot00000000000000#!/bin/sh set -eu cd "$(git rev-parse --show-toplevel)" files=$(find src -iname '*.c' -type f | git check-attr --stdin citus-style | grep -v ': unset$' | sed 's/: citus-style: set$//') while true; do # A visual version of this regex can be seen here (it is MUCH clearer): # https://www.debuggex.com/r/XodMNE9auT9e-bTx # This visual version only contains the search bit, the replacement bit is # quite simple when extracted from: # \n$+{code_between}\t$+{type}$+{variable} = # shellcheck disable=SC2086 perl -i -p0e 's/\n\t(?!return )(?P(\w+ )+\**)(?>(?P\w+)( = *[\w>\s\n-]*?)?;\n(?P(?>(?P\/\*.*?\*\/|"(?>\\"|.)*?"|[^#]))*?)(\t)?(?=\b(?P=variable)\b))(?<=\n\t)(?P=variable) =(?![^;]*?[^>_]\b(?P=variable)\b[^_])/\n$+{code_between}\t$+{type}$+{variable} =/sg' $files # The following are simply the same regex, but repeated for different # indentation levels, i.e. finding declarations indented using 2, 3, 4, 5 # and 6 tabs. More than 6 don't really occur in the wild. 
# (this is needed because variable sized backtracking is not supported in perl) # shellcheck disable=SC2086 perl -i -p0e 's/\n\t\t(?!return )(?P(\w+ )+\**)(?>(?P\w+)( = *[\w>\s\n-]*?)?;\n(?P(?>(?P\/\*.*?\*\/|"(?>\\"|.)*?"|[^#]))*?)(\t\t)?(?=\b(?P=variable)\b))(?<=\n\t\t)(?P=variable) =(?![^;]*?[^>_]\b(?P=variable)\b[^_])/\n$+{code_between}\t\t$+{type}$+{variable} =/sg' $files # shellcheck disable=SC2086 perl -i -p0e 's/\n\t\t\t(?!return )(?P(\w+ )+\**)(?>(?P\w+)( = *[\w>\s\n-]*?)?;\n(?P(?>(?P\/\*.*?\*\/|"(?>\\"|.)*?"|[^#]))*?)(\t\t\t)?(?=\b(?P=variable)\b))(?<=\n\t\t\t)(?P=variable) =(?![^;]*?[^>_]\b(?P=variable)\b[^_])/\n$+{code_between}\t\t\t$+{type}$+{variable} =/sg' $files # shellcheck disable=SC2086 perl -i -p0e 's/\n\t\t\t\t(?!return )(?P(\w+ )+\**)(?>(?P\w+)( = *[\w>\s\n-]*?)?;\n(?P(?>(?P\/\*.*?\*\/|"(?>\\"|.)*?"|[^#]))*?)(\t\t\t\t)?(?=\b(?P=variable)\b))(?<=\n\t\t\t\t)(?P=variable) =(?![^;]*?[^>_]\b(?P=variable)\b[^_])/\n$+{code_between}\t\t\t\t$+{type}$+{variable} =/sg' $files # shellcheck disable=SC2086 perl -i -p0e 's/\n\t\t\t\t\t(?!return )(?P(\w+ )+\**)(?>(?P\w+)( = *[\w>\s\n-]*?)?;\n(?P(?>(?P\/\*.*?\*\/|"(?>\\"|.)*?"|[^#]))*?)(\t\t\t\t\t)?(?=\b(?P=variable)\b))(?<=\n\t\t\t\t\t)(?P=variable) =(?![^;]*?[^>_]\b(?P=variable)\b[^_])/\n$+{code_between}\t\t\t\t\t$+{type}$+{variable} =/sg' $files # shellcheck disable=SC2086 perl -i -p0e 's/\n\t\t\t\t\t\t(?!return )(?P(\w+ )+\**)(?>(?P\w+)( = *[\w>\s\n-]*?)?;\n(?P(?>(?P\/\*.*?\*\/|"(?>\\"|.)*?"|[^#]))*?)(\t\t\t\t\t\t)?(?=\b(?P=variable)\b))(?<=\n\t\t\t\t\t\t)(?P=variable) =(?![^;]*?[^>_]\b(?P=variable)\b[^_])/\n$+{code_between}\t\t\t\t\t\t$+{type}$+{variable} =/sg' $files # shellcheck disable=SC2086 git diff --quiet $files && break; # shellcheck disable=SC2086 git add $files; done 
pg_auto_failover-1.6.3/tests/000077500000000000000000000000001414244367200162175ustar00rootroot00000000000000pg_auto_failover-1.6.3/tests/Pipfile000066400000000000000000000003151414244367200175310ustar00rootroot00000000000000[[source]] name = "pypi" url = "https://pypi.python.org/simple" verify_ssl = true [packages] pyroute2 = "==0.5.15" nose = "==1.3.7" psycopg2 = "==2.7.5" [dev-packages] [requires] python_version = "3.6" pg_auto_failover-1.6.3/tests/Pipfile.lock000066400000000000000000000075621414244367200204730ustar00rootroot00000000000000{ "_meta": { "hash": { "sha256": "400d971e7470db5df94c1d3a7efe124b801f6ea84aa687b9378bbac58057fc55" }, "pipfile-spec": 6, "requires": { "python_version": "3.6" }, "sources": [ { "name": "pypi", "url": "https://pypi.python.org/simple", "verify_ssl": true } ] }, "default": { "nose": { "hashes": [ "sha256:9ff7c6cc443f8c51994b34a667bbcf45afd6d945be7477b52e97516fd17c53ac", "sha256:dadcddc0aefbf99eea214e0f1232b94f2fa9bd98fa8353711dacb112bfcbbb2a", "sha256:f1bffef9cbc82628f6e7d7b40d7e255aefaa1adb6a1b1d26c69a8b79e6208a98" ], "index": "pypi", "version": "==1.3.7" }, "psycopg2": { "hashes": [ "sha256:0b9e48a1c1505699a64ac58815ca99104aacace8321e455072cee4f7fe7b2698", "sha256:0f4c784e1b5a320efb434c66a50b8dd7e30a7dc047e8f45c0a8d2694bfe72781", "sha256:0fdbaa32c9eb09ef09d425dc154628fca6fa69d2f7c1a33f889abb7e0efb3909", "sha256:11fbf688d5c953c0a5ba625cc42dea9aeb2321942c7c5ed9341a68f865dc8cb1", "sha256:19eaac4eb25ab078bd0f28304a0cb08702d120caadfe76bb1e6846ed1f68635e", "sha256:3232ec1a3bf4dba97fbf9b03ce12e4b6c1d01ea3c85773903a67ced725728232", "sha256:36f8f9c216fcca048006f6dd60e4d3e6f406afde26cfb99e063f137070139eaf", "sha256:59c1a0e4f9abe970062ed35d0720935197800a7ef7a62b3a9e3a70588d9ca40b", "sha256:6506c5ff88750948c28d41852c09c5d2a49f51f28c6d90cbf1b6808e18c64e88", "sha256:6bc3e68ee16f571681b8c0b6d5c0a77bef3c589012352b3f0cf5520e674e9d01", "sha256:6dbbd7aabbc861eec6b910522534894d9dbb507d5819bc982032c3ea2e974f51", 
"sha256:6e737915de826650d1a5f7ff4ac6cf888a26f021a647390ca7bafdba0e85462b", "sha256:6ed9b2cfe85abc720e8943c1808eeffd41daa73e18b7c1e1a228b0b91f768ccc", "sha256:711ec617ba453fdfc66616db2520db3a6d9a891e3bf62ef9aba4c95bb4e61230", "sha256:844dacdf7530c5c612718cf12bc001f59b2d9329d35b495f1ff25045161aa6af", "sha256:86b52e146da13c896e50c5a3341a9448151f1092b1a4153e425d1e8b62fec508", "sha256:985c06c2a0f227131733ae58d6a541a5bc8b665e7305494782bebdb74202b793", "sha256:a86dfe45f4f9c55b1a2312ff20a59b30da8d39c0e8821d00018372a2a177098f", "sha256:aa3cd07f7f7e3183b63d48300666f920828a9dbd7d7ec53d450df2c4953687a9", "sha256:b1964ed645ef8317806d615d9ff006c0dadc09dfc54b99ae67f9ba7a1ec9d5d2", "sha256:b2abbff9e4141484bb89b96eb8eae186d77bc6d5ffbec6b01783ee5c3c467351", "sha256:cc33c3a90492e21713260095f02b12bee02b8d1f2c03a221d763ce04fa90e2e9", "sha256:d7de3bf0986d777807611c36e809b77a13bf1888f5c8db0ebf24b47a52d10726", "sha256:db5e3c52576cc5b93a959a03ccc3b02cb8f0af1fbbdc80645f7a215f0b864f3a", "sha256:e168aa795ffbb11379c942cf95bf813c7db9aa55538eb61de8c6815e092416f5", "sha256:e9ca911f8e2d3117e5241d5fa9aaa991cb22fb0792627eeada47425d706b5ec8", "sha256:eccf962d41ca46e6326b97c8fe0a6687b58dfc1a5f6540ed071ff1474cea749e", "sha256:efa19deae6b9e504a74347fe5e25c2cb9343766c489c2ae921b05f37338b18d1", "sha256:f4b0460a21f784abe17b496f66e74157a6c36116fa86da8bf6aa028b9e8ad5fe", "sha256:f93d508ca64d924d478fb11e272e09524698f0c581d9032e68958cfbdd41faef" ], "index": "pypi", "version": "==2.7.5" }, "pyroute2": { "hashes": [ "sha256:d730eff091dd5b2b78282bc82ebe6888e7ee4d70b79468b1da58d99fc1a2a2fc" ], "index": "pypi", "version": "==0.5.15" } }, "develop": {} } pg_auto_failover-1.6.3/tests/network.py000066400000000000000000000244471414244367200202750ustar00rootroot00000000000000from pyroute2 import netns, NDB, netlink, NSPopen from contextlib import contextmanager import ipaddress import subprocess import os import os.path """ TODO: Add an introduction to network namespaces, veth interfaces, and bridges, and explain why we 
use them here. """ BRIDGE_NF_CALL_IPTABLES = "/proc/sys/net/bridge/bridge-nf-call-iptables" COMMAND_TIMEOUT = 60 @contextmanager def managed_nspopen(*args, **kwds): proc = NSPopen(*args, **kwds) try: yield proc finally: if proc.poll() is None: # send SIGKILL to the process and wait for it to die if it's still # running proc.kill() # If it's not dead after 2 seconds we throw an error proc.communicate(timeout=2) # release proxy process resourecs proc.release() class VirtualLAN: """ Helper class to create a network of virtual nodes to simulate a virtual network. IP addresses are assigned automatically to the nodes from a private IP range. IP address of a virtual node can be accessed using the node.address field. Internally, this is a network of Linux network namespaces connected by a bridge. TODO: explain more details and add an example. """ def __init__(self, namePrefix, subnet): ipnet = ipaddress.ip_network(subnet) self.availableHosts = ipnet.hosts() self.prefixLen = ipnet.prefixlen self.namePrefix = namePrefix self.nodes = [] # create the bridge self.bridgeName = "%s-br" % (namePrefix,) self.bridgeAddress = next(self.availableHosts) self._add_bridge(self.bridgeName, self.bridgeAddress, self.prefixLen) # Don't pass bridged IPv4 traffic to iptables' chains, so namespaces # can communicate irrespective of the host machines iptables. This is # needed in some docker instances (e.g. travis), where traffic was # filtered at bridge level. See # https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt try: with open(BRIDGE_NF_CALL_IPTABLES, "r") as f: self.saved_bridge_nf_call_iptables = f.read() with open(BRIDGE_NF_CALL_IPTABLES, "w") as f: f.write("0\n") except FileNotFoundError: # In some environments this variable doesn't exist, we are ok with # no changes in this case. self.saved_bridge_nf_call_iptables = None def create_node(self): """ Creates a VirtualNode which can access/be accessed from other nodes in the virtual network. 
""" namespace = "%s-%s" % (self.namePrefix, len(self.nodes)) address = next(self.availableHosts) node = VirtualNode(namespace, address, self.prefixLen) self._add_interface_to_bridge(self.bridgeName, node.vethPeer) self.nodes.append(node) return node def destroy(self): """ Destroys the objects created for the virtual network. """ for node in self.nodes: node.destroy() _remove_interface_if_exists(self.bridgeName) if self.saved_bridge_nf_call_iptables is not None: with open(BRIDGE_NF_CALL_IPTABLES, "w") as f: f.write(self.saved_bridge_nf_call_iptables) def _add_bridge(self, name, address, prefixLen): """ Creates a bridge with the given name, address, and netmask perfix length. """ _remove_interface_if_exists(name) with NDB() as ndb: ( ndb.interfaces.create(ifname=name, kind="bridge", state="up") .add_ip("%s/%s" % (address, prefixLen)) .commit() ) def _add_interface_to_bridge(self, bridge, interface): """ Adds the given interface to the bridge. In our usecase, this interface is usually the peer end of a veth pair with the other end inside a network namespace, in which case after calling this function the namespace will be able to communicate with the other nodes in the virtual network. """ with NDB() as ndb: ndb.interfaces[bridge].add_port(interface).commit() ndb.interfaces[interface].set(state="up").commit() class VirtualNode: """ A virtual node inside a virtual network. Internally, this corresponds to a Linux network namespace. """ def __init__(self, namespace, address, prefixLen): self.namespace = namespace self.address = address self.prefixLen = prefixLen self.vethPeer = namespace + "p" self._add_namespace(namespace, address, prefixLen) def destroy(self): """ Removes all objects created for the virtual node. """ _remove_interface_if_exists(self.vethPeer) try: netns.remove(self.namespace) except: # Namespace doesn't exist. Return silently. pass def run(self, command, user=os.getenv("USER")): """ Executes a command under the given user from this virtual node. 
Returns a context manager that returns NSOpen object to control the process. NSOpen has the same API as subprocess.POpen. """ sudo_command = [ "sudo", "-E", "-u", user, "env", "PATH=" + os.getenv("PATH"), ] + command return managed_nspopen( self.namespace, sudo_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, start_new_session=True, ) def run_unmanaged(self, command, user=os.getenv("USER")): """ Executes a command under the given user from this virtual node. Returns an NSPopen object to control the process. NSOpen has the same API as subprocess.Popen. This NSPopen object needs to be manually release. In general you should prefer using run, where this is done automatically by the context manager. """ sudo_command = [ "sudo", "-E", "-u", user, "env", "PATH=" + os.getenv("PATH"), ] + command return NSPopen( self.namespace, sudo_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, start_new_session=True, ) def run_and_wait(self, command, name, timeout=COMMAND_TIMEOUT): """ Waits for command to exit successfully. If it exits with error or it timeouts, raises an execption with stdout and stderr streams of the process. """ with self.run(command) as proc: try: out, err = proc.communicate(timeout=timeout) if proc.returncode > 0: raise Exception( "%s failed, out: %s\n, err: %s" % (name, out, err) ) return out, err except subprocess.TimeoutExpired: proc.kill() out, err = proc.communicate() raise Exception( "%s timed out after %d seconds. out: %s\n, err: %s" % (name, timeout, out, err) ) def _add_namespace(self, name, address, netmaskLength): """ Creates a namespace with the given name, and creates a veth interface with one endpoint inside the namespace which has the given address and netmask length. The peer end of veth interface can be used to connect the namespace to a bridge. 
""" self._remove_namespace_if_exists(name) netns.create(name) veth_name = "veth0" _remove_interface_if_exists(self.vethPeer) with NDB() as ndb: # # Add netns to the NDB sources # # ndb.interfaces["lo"] is a short form of # ndb.interfaces[{"target": "localhost", "ifname": "lo"}] # # To address interfaces/addresses/routes wthin a netns, use # ndb.interfaces[{"target": netns_name, "ifname": "lo"}] ndb.sources.add(netns=name) # # Create veth ( ndb.interfaces.create( ifname=veth_name, kind="veth", peer=self.vethPeer, state="up", ) .commit() .set(net_ns_fd=name) .commit() ) # # .interfaces.wait() returns an interface object when # it becomes available on the specified source ( ndb.interfaces.wait(target=name, ifname=veth_name) .set(state="up") .add_ip("%s/%s" % (address, netmaskLength)) .commit() ) # ( ndb.interfaces[{"target": name, "ifname": "lo"}] .set(state="up") .commit() ) def _remove_namespace_if_exists(self, name): """ If the given namespace exists, removes it. Otherwise just returns silently. """ try: netns.remove(name) except Exception: # Namespace doesn't exist. Return silently. pass def ifdown(self): """ Bring the network interface down for this node """ with NDB() as ndb: # bring it down and wait until success ndb.interfaces[self.vethPeer].set(state="down").commit() def ifup(self): """ Bring the network interface up for this node """ with NDB() as ndb: # bring it up and wait until success ndb.interfaces[self.vethPeer].set(state="up").commit() def _remove_interface_if_exists(name): """ If the given interface exists, brings it down and removes it. Otherwise just returns silently. A bridge is also an interface, so this can be used for removing bridges too. 
""" with NDB() as ndb: if name in ndb.interfaces: try: ndb.interfaces[name].remove().commit() except netlink.exceptions.NetlinkError: pass pg_auto_failover-1.6.3/tests/pgautofailover_utils.py000066400000000000000000001677461414244367200230650ustar00rootroot00000000000000import os import os.path import signal import shutil import time import network import psycopg2 import subprocess import datetime as dt from collections import namedtuple from nose.tools import eq_ from enum import Enum import json import ssl_cert_utils as cert COMMAND_TIMEOUT = network.COMMAND_TIMEOUT POLLING_INTERVAL = 0.1 STATE_CHANGE_TIMEOUT = 90 PGVERSION = os.getenv("PGVERSION", "11") NodeState = namedtuple("NodeState", "reported assigned") # Append stderr output to default CalledProcessError message class CalledProcessError(subprocess.CalledProcessError): def __str__(self): return super().__str__() + "\n\t" + self.stderr class Role(Enum): Monitor = 1 Postgres = 2 Coordinator = 3 Worker = 4 def command(self): return self.name.lower() class Feature(Enum): Secondary = 1 def command(self): return self.name.lower() class Cluster: # Docker uses 172.17.0.0/16 by default, so we use 172.27.1.0/24 to not # conflict with that. def __init__( self, networkNamePrefix="pgauto", networkSubnet="172.27.1.0/24" ): """ Initializes the environment, virtual network, and other local state necessary for operation of the Cluster. """ os.environ["PG_REGRESS_SOCK_DIR"] = "" os.environ["PG_AUTOCTL_DEBUG"] = "" os.environ["PGHOST"] = "localhost" self.networkSubnet = networkSubnet self.vlan = network.VirtualLAN(networkNamePrefix, networkSubnet) self.monitor = None self.datanodes = [] def create_monitor( self, datadir, port=5432, hostname=None, authMethod=None, sslMode=None, sslSelfSigned=False, sslCAFile=None, sslServerKey=None, sslServerCert=None, ): """ Initializes the monitor and returns an instance of MonitorNode. 
""" if self.monitor is not None: raise Exception("Monitor has already been created.") vnode = self.vlan.create_node() self.monitor = MonitorNode( self, datadir, vnode, port, hostname, authMethod, sslMode, sslSelfSigned, sslCAFile=sslCAFile, sslServerKey=sslServerKey, sslServerCert=sslServerCert, ) self.monitor.create() return self.monitor # TODO group should auto sense for normal operations and passed to the # create cli as an argument when explicitly set by the test def create_datanode( self, datadir, port=5432, group=0, listen_flag=False, role=Role.Postgres, formation=None, authMethod=None, sslMode=None, sslSelfSigned=False, sslCAFile=None, sslServerKey=None, sslServerCert=None, ): """ Initializes a data node and returns an instance of DataNode. This will do the "keeper init" and "pg_autoctl run" commands. """ vnode = self.vlan.create_node() nodeid = len(self.datanodes) + 1 datanode = DataNode( self, datadir, vnode, port, os.getenv("USER"), authMethod, "postgres", self.monitor, nodeid, group, listen_flag, role, formation, sslMode=sslMode, sslSelfSigned=sslSelfSigned, sslCAFile=sslCAFile, sslServerKey=sslServerKey, sslServerCert=sslServerCert, ) self.datanodes.append(datanode) return datanode def pg_createcluster(self, datadir, port=5432): """ Initializes a postgresql node using pg_createcluster and returns directory path. 
""" vnode = self.vlan.create_node() create_command = [ "sudo", shutil.which("pg_createcluster"), "--user", os.getenv("USER"), "--group", "postgres", "-p", str(port), PGVERSION, datadir, "--", "--auth-local", "trust", ] print("%s" % " ".join(create_command)) vnode.run_and_wait(create_command, "pg_createcluster") abspath = os.path.join("/var/lib/postgresql/", PGVERSION, datadir) chmod_command = [ "sudo", shutil.which("install"), "-d", "-o", os.getenv("USER"), "/var/lib/postgresql/%s/backup" % PGVERSION, ] print("%s" % " ".join(chmod_command)) vnode.run_and_wait(chmod_command, "chmod") return abspath def destroy(self, force=True): """ Cleanup whatever was created for this Cluster. """ for datanode in list(reversed(self.datanodes)): datanode.destroy(force=force, ignore_failure=True, timeout=3) if self.monitor: self.monitor.destroy() self.vlan.destroy() def nodes(self): """ Returns a list of all nodes in the cluster including the monitor NOTE: Monitor is explicitely last in this list. So this list of nodes can be stopped in order safely. 
""" nodes = self.datanodes.copy() if self.monitor: nodes.append(self.monitor) return nodes def flush_output(self): """ flush the output for all running pg_autoctl processes in the cluster """ for node in self.nodes(): node.flush_output() def sleep(self, secs): """ sleep for the specified time while flushing output of the cluster at least every second """ full_secs = int(secs) for i in range(full_secs): self.flush_output() time.sleep(1) self.flush_output() time.sleep(secs - full_secs) def communicate(self, proc, timeout): """ communicate with the process with the specified timeout while flushing output of the cluster at least every second """ full_secs = int(timeout) for i in range(full_secs): self.flush_output() try: # wait until process is done for one second each iteration # of this loop, to add up to the actual timeout argument return proc.communicate(timeout=1) except subprocess.TimeoutExpired: pass self.flush_output() return proc.communicate(timeout=timeout - full_secs) def create_root_cert(self, directory, basename="root", CN="root"): self.cert = cert.SSLCert(directory, basename, CN) self.cert.create_root_cert() class PGNode: """ Common stuff between MonitorNode and DataNode. """ def __init__( self, cluster, datadir, vnode, port, username, authMethod, database, role, sslMode=None, sslSelfSigned=False, sslCAFile=None, sslServerKey=None, sslServerCert=None, ): self.cluster = cluster self.datadir = datadir self.vnode = vnode self.port = port self.username = username self.authMethod = authMethod or "trust" self.database = database self.role = role self.pg_autoctl = None self.authenticatedUsers = {} self._pgversion = None self._pgmajor = None self.sslMode = sslMode self.sslSelfSigned = sslSelfSigned self.sslCAFile = sslCAFile self.sslServerKey = sslServerKey self.sslServerCert = sslServerCert self._pgversion = None self._pgmajor = None def connection_string(self): """ Returns a connection string which can be used to connect to this postgres node. 
""" host = self.vnode.address if self.authMethod and self.username in self.authenticatedUsers: dsn = "postgres://%s:%s@%s:%d/%s" % ( self.username, self.authenticatedUsers[self.username], host, self.port, self.database, ) else: dsn = "postgres://%s@%s:%d/%s" % ( self.username, host, self.port, self.database, ) # If a local CA is used, or even a self-signed certificate, # using verify-ca often provides enough protection. if self.sslMode: sslMode = self.sslMode elif self.sslSelfSigned: sslMode = "require" else: sslMode = "prefer" dsn += "?sslmode=%s" % sslMode if self.sslCAFile: dsn += f"&sslrootcert={self.sslCAFile}" return dsn def run(self, env={}, name=None, host=None, port=None): """ Runs "pg_autoctl run" """ self.name = name self.pg_autoctl = PGAutoCtl(self) self.pg_autoctl.run(name=name, host=host, port=port) def running(self): return self.pg_autoctl and self.pg_autoctl.run_proc def flush_output(self): """ Flushes the output of pg_autoctl if it's running to be sure that it does not get stuck, because of a filled up pipe. """ if self.running(): self.pg_autoctl.consume_output(0.001) def sleep(self, secs): """ Sleep for the specfied amount of seconds but meanwile consume output of the pg_autoctl process to make sure it does not lock up. """ if self.running(): self.pg_autoctl.consume_output(secs) else: time.sleep(secs) def run_sql_query(self, query, *args): """ Runs the given sql query with the given arguments in this postgres node and returns the results. Returns None if there are no results to fetch. 
""" result = None conn = psycopg2.connect(self.connection_string()) with conn: with conn.cursor() as cur: cur.execute(query, args) try: result = cur.fetchall() except psycopg2.ProgrammingError: pass # leaving contexts closes the cursor, however # leaving contexts doesn't close the connection conn.close() return result def pg_config_get(self, settings): """ Returns the current value of the given postgres settings" """ if isinstance(settings, str): return self.run_sql_query(f"SHOW {setting}")[0][0] else: # we have a list of settings to grab s = {} q = "select name, setting from pg_settings where name = any(%s)" for name, setting in self.run_sql_query(q, settings): s[name] = setting return s def set_user_password(self, username, password): """ Sets user passwords on the PGNode """ alter_user_set_passwd_command = "alter user %s with password '%s'" % ( username, password, ) passwd_command = [ shutil.which("psql"), "-d", self.database, "-c", alter_user_set_passwd_command, ] self.vnode.run_and_wait(passwd_command, name="user passwd") self.authenticatedUsers[username] = password def stop_pg_autoctl(self): """ Kills the keeper by sending a SIGTERM to keeper's process group. """ if self.pg_autoctl: return self.pg_autoctl.stop() def stop_postgres(self): """ Stops the postgres process by running: pg_ctl -D ${self.datadir} --wait --mode fast stop """ # pg_ctl stop is racey when another process is trying to start postgres # again in the background. It will not finish in that case. pg_autoctl # does this, so we try stopping postgres a couple of times. This way we # make sure the race does not impact our tests. # # The race can be easily reproduced by in one shell doing: # while true; do pg_ctl --pgdata monitor/ start; done # And in another: # pg_ctl --pgdata monitor --wait --mode fast stop # The second command will not finish, since the first restarts postgres # before the second finds out it has been killed. 
for i in range(60): stop_command = [ shutil.which("pg_ctl"), "-D", self.datadir, "--wait", "--mode", "fast", "stop", ] try: with self.vnode.run(stop_command) as stop_proc: out, err = stop_proc.communicate(timeout=1) if stop_proc.returncode > 0: print( "stopping postgres for '%s' failed, out: %s\n, err: %s" % (self.vnode.address, out, err) ) return False elif stop_proc.returncode is None: print("stopping postgres for '%s' timed out") return False return True except subprocess.TimeoutExpired: pass else: raise Exception("Postgres could not be stopped after 60 attempts") def reload_postgres(self): """ Reload the postgres configuration by running: pg_ctl -D ${self.datadir} reload """ reload_command = [shutil.which("pg_ctl"), "-D", self.datadir, "reload"] with self.vnode.run(reload_command) as reload_proc: out, err = self.cluster.communicate(reload_proc, COMMAND_TIMEOUT) if reload_proc.returncode > 0: print( "reloading postgres for '%s' failed, out: %s\n, err: %s" % (self.vnode.address, out, err) ) return False elif reload_proc.returncode is None: print("reloading postgres for '%s' timed out") return False return True def restart_postgres(self): """ Restart Postgres with pg_autoctl do service restart postgres """ command = PGAutoCtl(self) command.execute( "service restart postgres", "do", "service", "restart", "postgres" ) def pg_is_running(self, timeout=COMMAND_TIMEOUT): """ Returns true when Postgres is running. We use pg_ctl status. """ command = PGAutoCtl(self) try: command.execute("pgsetup ready", "do", "pgsetup", "ready", "-vvv") except Exception as e: # pg_autoctl uses EXIT_CODE_PGSQL when Postgres is not ready return False return True def wait_until_pg_is_running(self, timeout=STATE_CHANGE_TIMEOUT): """ Waits until the underlying Postgres process is running. 
""" command = PGAutoCtl(self) out, err, ret = command.execute( "pgsetup ready", "do", "pgsetup", "wait", "-vvv" ) return ret == 0 def fail(self): """ Simulates a data node failure by terminating the keeper and stopping postgres. """ self.stop_pg_autoctl() # stopping pg_autoctl also stops Postgres, unless bugs. if self.pg_is_running(): self.stop_postgres() def config_file_path(self): """ Returns the path of the config file for this data node. """ # Config file is located at: # ~/.config/pg_autoctl/${PGDATA}/pg_autoctl.cfg home = os.getenv("HOME") pgdata = os.path.abspath(self.datadir)[1:] # Remove the starting '/' return os.path.join( home, ".config/pg_autoctl", pgdata, "pg_autoctl.cfg" ) def state_file_path(self): """ Returns the path of the state file for this data node. """ # State file is located at: # ~/.local/share/pg_autoctl/${PGDATA}/pg_autoctl.state home = os.getenv("HOME") pgdata = os.path.abspath(self.datadir)[1:] # Remove the starting '/' return os.path.join( home, ".local/share/pg_autoctl", pgdata, "pg_autoctl.state" ) def get_postgres_logs(self): ldir = os.path.join(self.datadir, "log") try: logfiles = os.listdir(ldir) except FileNotFoundError: # If the log directory does not exist then there's also no logs to # display return "" logfiles.sort() logs = [] for logfile in logfiles: logs += ["\n\n%s:\n" % logfile] logs += open(os.path.join(ldir, logfile)).readlines() # it's not really logs but we want to see that too for inc in [ "recovery.conf", "postgresql.auto.conf", "postgresql-auto-failover.conf", "postgresql-auto-failover-standby.conf", ]: conf = os.path.join(self.datadir, inc) if os.path.isfile(conf): logs += ["\n\n%s:\n" % conf] logs += open(conf).readlines() else: logs += ["\n\n%s does not exist\n" % conf] return "".join(logs) def pgversion(self): """ Query local Postgres for its version. Cache the result. 
""" if self._pgversion: return self._pgversion # server_version_num is 110005 for 11.5 self._pgversion = int( self.run_sql_query("show server_version_num")[0][0] ) self._pgmajor = self._pgversion // 10000 return self._pgversion def pgmajor(self): if self._pgmajor: return self._pgmajor self.pgversion() return self._pgmajor def ifdown(self): """ Bring the network interface down for this node """ self.vnode.ifdown() def ifup(self): """ Bring the network interface up for this node """ self.vnode.ifup() def config_set(self, setting, value): """ Set a configuration parameter to given value """ command = PGAutoCtl(self) command.execute( "config set %s" % setting, "config", "set", setting, value ) return True def config_get(self, setting): """ Set a configuration parameter to given value """ command = PGAutoCtl(self) out, err, ret = command.execute( "config get %s" % setting, "config", "get", setting ) return out[:-1] def show_uri(self, json=False): """ Runs pg_autoctl show uri """ command = PGAutoCtl(self) if json: out, err, ret = command.execute("show uri", "show", "uri", "--json") else: out, err, ret = command.execute("show uri", "show", "uri") return out def logs(self): log_string = "" if self.running(): out, err, ret = self.stop_pg_autoctl() log_string += f"STDOUT OF PG_AUTOCTL FOR {self.datadir}:\n" log_string += f"{self.pg_autoctl.cmd}\n{out}\n" log_string += f"STDERR OF PG_AUTOCTL FOR {self.datadir}:\n{err}\n" pglogs = self.get_postgres_logs() log_string += f"POSTGRES LOGS FOR {self.datadir}:\n{pglogs}\n" return log_string def get_events_str(self): "2020-08-03 12:04:41.513761+00:00" events = self.get_events() if events: return "\n".join( [ "%32s %8s %17s/%-17s %10s %10s %s" % ( "eventtime", "name", "state", "goal state", "repl st", "tli:lsn", "event", ) ] + [ "%32s %8s %17s/%-17s %10s %3s:%7s %s" % result for result in events ] ) else: return "" def print_debug_logs(self): events = self.get_events_str() logs = f"MONITOR EVENTS:\n{events}\n" for node in 
self.cluster.nodes(): logs += node.logs() print(logs) # we might be running with the monitor disabled if self.cluster.monitor and self.cluster.monitor.pg_autoctl: print("%s" % self.cluster.monitor.pg_autoctl.err) def enable_ssl( self, sslMode=None, sslSelfSigned=None, sslCAFile=None, sslServerKey=None, sslServerCert=None, ): """ Enables SSL on a pg_autoctl node """ self.sslMode = sslMode self.sslSelfSigned = sslSelfSigned self.sslCAFile = sslCAFile self.sslServerKey = sslServerKey self.sslServerCert = sslServerCert ssl_args = ["enable", "ssl", "-vvv", "--pgdata", self.datadir] if self.sslMode: ssl_args += ["--ssl-mode", self.sslMode] if self.sslSelfSigned: ssl_args += ["--ssl-self-signed"] if self.sslCAFile: ssl_args += ["--ssl-ca-file", self.sslCAFile] if self.sslServerKey: ssl_args += ["--server-key", self.sslServerKey] if self.sslServerCert: ssl_args += ["--server-cert", self.sslServerCert] if not self.sslSelfSigned and not self.sslServerKey: ssl_args += ["--no-ssl"] command = PGAutoCtl(self, argv=ssl_args) out, err, ret = command.execute("enable ssl") def get_monitor_uri(self): """ pg_autoctl show uri --monitor """ command = PGAutoCtl(self) out, err, ret = command.execute( "show uri --monitor", "show", "uri", "--formation", "monitor" ) return out def get_formation_uri(self, formationName="default"): """ pg_autoctl show uri --formation {formationName} """ command = PGAutoCtl(self) out, err, ret = command.execute( "show uri --formation", "show", "uri", "--formation", formationName ) return out def check_conn_string_ssl(self, conn_string, sslmode): """ Asserts that given connection string embeds expected SSL settings. 
""" crl = None rootCert = self.sslCAFile print("checking connstring =", conn_string) assert f"sslmode={sslmode}" in conn_string if rootCert: assert f"sslrootcert={rootCert}" in conn_string if crl: assert f"sslcrl={crl}" in conn_string def check_ssl(self, ssl, sslmode, monitor=False, primary=False): """ Checks if ssl settings match how the node is set up """ key = self.sslServerKey crt = self.sslServerCert crl = None rootCert = self.sslCAFile if self.sslSelfSigned: key = os.path.join(self.datadir, "server.key") crt = os.path.join(self.datadir, "server.crt") # grab all the settings we want to check in a single round-trip pg_settings_names = [ "ssl", "ssl_ciphers", "ssl_key_file", "ssl_cert_file", "ssl_crl_file", "ssl_ca_file", ] if self.pgmajor() >= 12: pg_settings_names += ["primary_conninfo"] pg_settings = self.pg_config_get(pg_settings_names) eq_(pg_settings["ssl"], ssl) eq_(self.config_get("ssl.sslmode"), sslmode) expected_ciphers = ( "ECDHE-ECDSA-AES128-GCM-SHA256:" "ECDHE-ECDSA-AES256-GCM-SHA384:" "ECDHE-RSA-AES128-GCM-SHA256:" "ECDHE-RSA-AES256-GCM-SHA384:" "ECDHE-ECDSA-AES128-SHA256:" "ECDHE-ECDSA-AES256-SHA384:" "ECDHE-RSA-AES128-SHA256:" "ECDHE-RSA-AES256-SHA384" ) # TODO: also do this for monitor once we can have superuser access to # the monitor if not monitor: eq_(pg_settings["ssl_ciphers"], expected_ciphers) monitor_uri = self.get_monitor_uri() self.check_conn_string_ssl(monitor_uri, sslmode) if not monitor: monitor_uri = self.config_get("pg_autoctl.monitor") self.check_conn_string_ssl(monitor_uri, sslmode) formation_uri = self.get_formation_uri() self.check_conn_string_ssl(formation_uri, sslmode) for pg_setting, autoctl_setting, file_path in [ ("ssl_key_file", "ssl.key_file", key), ("ssl_cert_file", "ssl.cert_file", crt), ("ssl_crl_file", "ssl.crl_file", crl), ("ssl_ca_file", "ssl.ca_file", rootCert), ]: if file_path is None: continue assert os.path.isfile(file_path) print("checking", pg_setting) eq_(pg_settings[pg_setting], file_path) 
eq_(self.config_get(autoctl_setting), file_path) if monitor or primary: return if self.pgmajor() >= 12: self.check_conn_string_ssl(pg_settings["primary_conninfo"], sslmode) def editedHBA(self): """ Returns True when pg_autoctl has edited the HBA file found in datadir, False otherwise. """ editedHBA = False hbaFilePath = os.path.join(self.datadir, "pg_hba.conf") with open(hbaFilePath, "r") as hba: lines = hba.readlines() for line in lines: if line == "": continue if line[0] == "#": continue if "# Auto-generated by pg_auto_failover" in line: # make the output easier to follow if editedHBA is False: print() # do not print the ending \n in line print("Edited HBA line: %s" % line[:-1]) editedHBA = True return editedHBA class DataNode(PGNode): def __init__( self, cluster, datadir, vnode, port, username, authMethod, database, monitor, nodeid, group, listen_flag, role, formation, sslMode=None, sslSelfSigned=False, sslCAFile=None, sslServerKey=None, sslServerCert=None, ): super().__init__( cluster, datadir, vnode, port, username, authMethod, database, role, sslMode=sslMode, sslSelfSigned=sslSelfSigned, sslCAFile=sslCAFile, sslServerKey=sslServerKey, sslServerCert=sslServerCert, ) self.monitor = monitor self.nodeid = nodeid self.group = group self.listen_flag = listen_flag self.formation = formation self.monitorDisabled = None def create( self, run=False, level="-v", name=None, host=None, port=None, candidatePriority=None, replicationQuorum=None, monitorDisabled=False, nodeId=None, citusSecondary=False, citusClusterName="default", ): """ Runs "pg_autoctl create" """ pghost = "localhost" sockdir = os.environ["PG_REGRESS_SOCK_DIR"] if self.listen_flag: pghost = str(self.vnode.address) if sockdir and sockdir != "": pghost = sockdir if monitorDisabled: self.monitorDisabled = True # don't pass --hostname to Postgres nodes in order to exercise the # automatic detection of the hostname. 
create_args = [ "create", self.role.command(), level, "--pgdata", self.datadir, "--pghost", pghost, "--pgport", str(self.port), "--pgctl", shutil.which("pg_ctl"), ] if self.authMethod == "skip": create_args += ["--skip-pg-hba"] else: create_args += ["--auth", self.authMethod] if not self.monitorDisabled: create_args += ["--monitor", self.monitor.connection_string()] if self.sslMode: create_args += ["--ssl-mode", self.sslMode] if self.sslSelfSigned: create_args += ["--ssl-self-signed"] if self.sslCAFile: create_args += ["--ssl-ca-file", self.sslCAFile] if self.sslServerKey: create_args += ["--server-key", self.sslServerKey] if self.sslServerCert: create_args += ["--server-cert", self.sslServerCert] if not self.sslSelfSigned and not self.sslCAFile: create_args += ["--no-ssl"] if self.listen_flag: create_args += ["--listen", str(self.vnode.address)] if self.formation: create_args += ["--formation", self.formation] if self.group: create_args += ["--group", str(self.group)] if name: self.name = name create_args += ["--name", name] if host: create_args += ["--hostname", host] if port: create_args += ["--pgport", port] if candidatePriority is not None: create_args += ["--candidate-priority", str(candidatePriority)] if replicationQuorum is not None: create_args += ["--replication-quorum", str(replicationQuorum)] if citusSecondary is True: create_args += ["--citus-secondary"] if citusClusterName is not None and citusClusterName != "default": create_args += ["--citus-cluster", citusClusterName] if self.monitorDisabled: assert nodeId is not None create_args += ["--disable-monitor"] create_args += ["--node-id", str(nodeId)] if run: create_args += ["--run"] # when run is requested pg_autoctl does not terminate # therefore we do not wait for process to complete # we just record the process self.pg_autoctl = PGAutoCtl(self, create_args) if run: self.pg_autoctl.run() else: self.pg_autoctl.execute("pg_autoctl create") # sometimes we might have holes in the nodeid sequence # grab 
the current nodeid, if it's already available nodeid = self.get_nodeid() if nodeid > 0: self.nodeid = nodeid def get_nodeid(self): """ Fetch the nodeid from the pg_autoctl state file. """ command = PGAutoCtl(self) out, err, ret = command.execute("get node id", "do", "fsm", "state") self.state = json.loads(out) return self.state["state"]["nodeId"] def jsDict(self, lsn="0/1", isPrimary=False): """ Returns a python dict with the information to fill-in a JSON representation of the node. """ return { "node_id": self.get_nodeid(), "node_name": self.name, "node_host": str(self.vnode.address), "node_port": self.port, "node_lsn": lsn, "node_is_primary": isPrimary, } def get_local_state(self): """ Fetch the assigned_state from the pg_autoctl state file. """ command = PGAutoCtl(self) out, err, ret = command.execute( "get node id", "-vv", "do", "fsm", "state" ) self.state = json.loads(out) return ( self.state["state"]["current_role"], self.state["state"]["assigned_role"], ) def get_nodename(self, nodeId=None): """ Fetch the node name from the monitor, given its nodeid """ if nodeId is None: nodeId = self.get_nodeid() self.name = self.cluster.monitor.run_sql_query( "select nodename from pgautofailover.node where nodeid = %s", nodeId )[0][0] return self.name def destroy( self, force=False, ignore_failure=False, timeout=COMMAND_TIMEOUT ): """ Cleans up processes and files created for this data node. 
""" self.stop_pg_autoctl() flags = ["--destroy"] if force: flags.append("--force") try: destroy = PGAutoCtl(self) destroy.execute( "pg_autoctl drop node --destroy", "drop", "node", *flags, timeout=timeout, ) except Exception as e: if ignore_failure: print(str(e)) else: raise try: os.remove(self.config_file_path()) except FileNotFoundError: pass try: os.remove(self.state_file_path()) except FileNotFoundError: pass # Remove self from the cluster if present so that future calls to # cluster.destroy() will not emit errors for the already destroyed node try: self.cluster.datanodes.remove(self) except ValueError: pass def wait_until_state( self, target_state, timeout=STATE_CHANGE_TIMEOUT, sleep_time=POLLING_INTERVAL, ): """ Waits until this data node reaches the target state, and then returns True. If this doesn't happen until "timeout" seconds, returns False. """ prev_state = None wait_until = dt.datetime.now() + dt.timedelta(seconds=timeout) while wait_until > dt.datetime.now(): self.cluster.sleep(sleep_time) current_state, assigned_state = self.get_state() # only log the state if it has changed if current_state != prev_state: if current_state == target_state: print( "state of %s is '%s', done waiting" % (self.datadir, current_state) ) else: print( "state of %s is '%s', waiting for '%s' ..." % (self.datadir, current_state, target_state) ) if current_state == target_state: return True prev_state = current_state print( "%s didn't reach %s after %d seconds" % (self.datadir, target_state, timeout) ) error_msg = ( f"{self.datadir} failed to reach {target_state} " f"after {timeout} seconds\n" ) self.print_debug_logs() raise Exception(error_msg) def wait_until_assigned_state( self, target_state, timeout=STATE_CHANGE_TIMEOUT, sleep_time=POLLING_INTERVAL, ): """ Waits until this data node is assigned the target state. Typically used when the node has been stopped or failed and we want to check the monitor FSM. 
""" prev_state = None wait_until = dt.datetime.now() + dt.timedelta(seconds=timeout) while wait_until > dt.datetime.now(): self.cluster.sleep(sleep_time) current_state, assigned_state = self.get_state() # only log the state if it has changed if assigned_state != prev_state: if assigned_state == target_state: print( "assigned state of %s is '%s', done waiting" % (self.datadir, assigned_state) ) else: print( "assigned state of %s is '%s', waiting for '%s' ..." % (self.datadir, assigned_state, target_state) ) if assigned_state == target_state: return True prev_state = assigned_state print( "%s didn't reach %s after %d seconds" % (self.datadir, target_state, timeout) ) error_msg = ( f"{self.datadir} failed to reach {target_state} " f"after {timeout} seconds\n" ) self.print_debug_logs() raise Exception(error_msg) def get_state(self): """ Returns the current state of the data node. This is done by querying the monitor node. """ results = self.monitor.run_sql_query( """ SELECT reportedstate, goalstate FROM pgautofailover.node WHERE nodeid=%s and groupid=%s """, self.nodeid, self.group, ) if len(results) == 0: raise Exception( "node %s in group %s not found on the monitor" % (self.nodeid, self.group) ) else: res = NodeState(results[0][0], results[0][1]) return res # default case, unclean when reached return NodeState(None, None) def get_events(self): """ Returns the current list of events from the monitor. 
""" if self.monitor: last_events_query = ( "select eventtime, nodename, " "reportedstate, goalstate, " "reportedrepstate, reportedtli, reportedlsn, description " "from pgautofailover.last_events('default', count => 20)" ) return self.monitor.get_events() def enable_maintenance(self, allowFailover=False): """ Enables maintenance on a pg_autoctl standby node :return: """ command = PGAutoCtl(self) if allowFailover: command.execute( "enable maintenance", "enable", "maintenance", "--allow-failover", ) else: command.execute("enable maintenance", "enable", "maintenance") def disable_maintenance(self): """ Disables maintenance on a pg_autoctl standby node :return: """ command = PGAutoCtl(self) command.execute("disable maintenance", "disable", "maintenance") def perform_promotion(self): """ Calls pg_autoctl perform promotion on a Postgres node """ command = PGAutoCtl(self) command.execute("perform promotion", "perform", "promotion") def enable_monitor(self, monitor): """ Disables the monitor on a pg_autoctl node :return: """ command = PGAutoCtl(self) command.execute( "enable monitor", "enable", "monitor", monitor.connection_string(), ) self.monitor = monitor self.monitorDisabled = False def disable_monitor(self): """ Disables the monitor on a pg_autoctl node :return: """ command = PGAutoCtl(self) command.execute("disable monitor", "disable", "monitor", "--force") self.monitor = None self.monitorDisabled = True def drop(self): """ Drops a pg_autoctl node from its formation :return: """ command = PGAutoCtl(self) command.execute("drop node", "drop", "node") return True def do_fsm_assign(self, target_state): """ Runs `pg_autoctl do fsm assign` on a node :return: """ command = PGAutoCtl(self) command.execute( "do fsm assign", "-vv", "do", "fsm", "assign", target_state ) return True def do_fsm_nodes_set(self, nodesArray): """ Runs `pg_autoctl do fsm nodes set` on a node :return: """ filename = "/tmp/nodes.json" with open(filename, "w") as nodesFile: 
nodesFile.write(json.dumps(nodesArray)) command = PGAutoCtl(self) out, err, ret = command.execute( "do fsm nodes set", "do", "fsm", "nodes", "set", filename ) return True def do_fsm_step(self): """ Runs `pg_autoctl do fsm step` on a node :return: """ command = PGAutoCtl(self) command.execute("do fsm step", "do", "fsm", "step") return True def set_metadata(self, name=None, host=None, port=None): """ Sets node metadata via pg_autoctl """ args = ["set node metadata", "set", "node", "metadata"] if name: args += ["--name", name] if host: args += ["--hostname", host] if port: args += ["--pgport", port] command = PGAutoCtl(self) command.execute(*args) def set_candidate_priority(self, candidatePriority): """ Sets candidate priority via pg_autoctl """ command = PGAutoCtl(self) try: command.execute( "set canditate priority", "set", "node", "candidate-priority", "--", str(candidatePriority), ) except Exception as e: if command.last_returncode == 1: return False raise e return True def get_candidate_priority(self): """ Gets candidate priority via pg_autoctl """ command = PGAutoCtl(self) out, err, ret = command.execute( "get canditate priority", "get", "node", "candidate-priority" ) return int(out) def set_replication_quorum(self, replicationQuorum): """ Sets replication quorum via pg_autoctl """ command = PGAutoCtl(self) try: command.execute( "set replication quorum", "set", "node", "replication-quorum", replicationQuorum, ) except Exception as e: if command.last_returncode == 1: return False raise e return True def get_replication_quorum(self): """ Gets replication quorum via pg_autoctl """ command = PGAutoCtl(self) out, err, ret = command.execute( "get replication quorum", "get", "node", "replication-quorum" ) value = out.strip() if value not in ["true", "false"]: raise Exception("Unknown replication quorum value %s" % value) return value == "true" def set_number_sync_standbys(self, numberSyncStandbys): """ Sets number sync standbys via pg_autoctl """ command = 
PGAutoCtl(self) try: command.execute( "set number sync standbys", "set", "formation", "number-sync-standbys", str(numberSyncStandbys), ) except Exception as e: if command.last_returncode == 1: return False raise e return True def get_number_sync_standbys(self): """ Gets number sync standbys via pg_autoctl """ command = PGAutoCtl(self) out, err, ret = command.execute( "get number sync standbys", "get", "formation", "number-sync-standbys", ) return int(out) def get_synchronous_standby_names(self): """ Gets synchronous standby names via pg_autoctl """ command = PGAutoCtl(self) out, err, ret = command.execute( "get synchronous_standby_names", "show", "standby-names" ) # strip spaces and single-quotes from the output return out.strip("' \n\r\t") def get_synchronous_standby_names_local(self): """ Gets synchronous standby names via sql query on data node """ query = "select current_setting('synchronous_standby_names')" result = self.run_sql_query(query) return result[0][0] def check_synchronous_standby_names(self, ssn): """ Checks both monitor a local synchronous_standby_names do match ssn. """ eq_(self.get_synchronous_standby_names_local(), ssn) eq_(self.get_synchronous_standby_names(), ssn) def print_synchronous_standby_names(self): monitorStandbyNames = self.get_synchronous_standby_names() localStandbyNames = self.get_synchronous_standby_names_local() print("synchronous_standby_names = '%s'" % monitorStandbyNames) print("synchronous_standby_names_local = '%s'" % localStandbyNames) return def list_replication_slot_names(self): """ Returns a list of the replication slot names on the local Postgres. 
""" query = ( "select slot_name from pg_replication_slots " + "where slot_name ~ '^pgautofailover_standby_' " + " and slot_type = 'physical'" ) try: result = self.run_sql_query(query) return [row[0] for row in result] except Exception as e: self.print_debug_logs() raise e def has_needed_replication_slots(self): """ Each node is expected to maintain a slot for each of the other nodes the primary through streaming replication, the secondary(s) manually through calls to pg_replication_slot_advance() on the local Postgres. Postgres 10 lacks the function pg_replication_slot_advance() so when the local Postgres is version 10 we don't create any replication slot on the standby servers. """ if self.pgmajor() == 10: return True hostname = str(self.vnode.address) other_nodes = self.monitor.get_other_nodes(self.nodeid) expected_slots = [ "pgautofailover_standby_%s" % n[0] for n in other_nodes ] current_slots = self.list_replication_slot_names() # just to make it easier to read through the print()ed list expected_slots.sort() current_slots.sort() if set(expected_slots) == set(current_slots): # print("slots list on %s is %s, as expected" % # (self.datadir, current_slots)) return True self.print_debug_logs() print() print( "slots list on %s is %s, expected %s" % (self.datadir, current_slots, expected_slots) ) return False class MonitorNode(PGNode): def __init__( self, cluster, datadir, vnode, port, hostname, authMethod, sslMode=None, sslSelfSigned=None, sslCAFile=None, sslServerKey=None, sslServerCert=None, ): super().__init__( cluster, datadir, vnode, port, "autoctl_node", authMethod, "pg_auto_failover", Role.Monitor, sslMode, sslSelfSigned, sslCAFile, sslServerKey, sslServerCert, ) # set the hostname, default to the ip address of the node if hostname: self.hostname = hostname else: self.hostname = str(self.vnode.address) def create(self, level="-v", run=False): """ Initializes and runs the monitor process. 
""" create_args = [ "create", self.role.command(), level, "--pgdata", self.datadir, "--pgport", str(self.port), "--auth", self.authMethod, "--hostname", self.hostname, ] if self.sslMode: create_args += ["--ssl-mode", self.sslMode] if self.sslSelfSigned: create_args += ["--ssl-self-signed"] if self.sslCAFile: create_args += ["--ssl-ca-file", self.sslCAFile] if self.sslServerKey: create_args += ["--server-key", self.sslServerKey] if self.sslServerCert: create_args += ["--server-cert", self.sslServerCert] if not self.sslSelfSigned and not self.sslCAFile: create_args += ["--no-ssl"] if run: create_args += ["--run"] # when run is requested pg_autoctl does not terminate # therefore we do not wait for process to complete # we just record the process self.pg_autoctl = PGAutoCtl(self, create_args) if run: self.pg_autoctl.run() else: self.pg_autoctl.execute("create monitor") def run(self, env={}, name=None, host=None, port=None): """ Runs "pg_autoctl run" """ self.pg_autoctl = PGAutoCtl(self) self.pg_autoctl.run(level="-v") # when on the monitor we always want Postgres to be running to continue self.wait_until_pg_is_running() def destroy(self): """ Cleans up processes and files created for this monitor node. 
""" if self.pg_autoctl: out, err, ret = self.pg_autoctl.stop() if ret != 0: print() print("Monitor logs:\n%s\n%s\n" % (out, err)) try: destroy = PGAutoCtl(self) destroy.execute( "pg_autoctl destroy monitor", "drop", "monitor", "--destroy" ) except Exception as e: print(str(e)) raise try: os.remove(self.config_file_path()) except FileNotFoundError: pass try: os.remove(self.state_file_path()) except FileNotFoundError: pass # Set self to None in cluster to avoid errors in future calls to # cluster.destroy() self.cluster.monitor = None def create_formation( self, formation_name, kind="pgsql", secondary=None, dbname=None ): """ Create a formation that the monitor controls :param formation_name: identifier used to address the formation :param ha: boolean whether or not to run the formation with high availability :param kind: identifier to signal what kind of formation to run :param dbname: name of the database to use in the formation :return: None """ formation_command = [ shutil.which("pg_autoctl"), "create", "formation", "--pgdata", self.datadir, "--formation", formation_name, "--kind", kind, ] if dbname is not None: formation_command += ["--dbname", dbname] # pass true or false to --enable-secondary or --disable-secondary, # only when ha is actually set by the user if secondary is not None: if secondary: formation_command += ["--enable-secondary"] else: formation_command += ["--disable-secondary"] self.vnode.run_and_wait(formation_command, name="create formation") def enable(self, feature, formation="default"): """ Enable a feature on a formation :param feature: instance of Feature enum indicating which feature to enable :param formation: name of the formation to enable the feature on :return: None """ command = PGAutoCtl(self) command.execute( "enable %s" % feature.command(), "enable", feature.command(), "--formation", formation, ) def disable(self, feature, formation="default"): """ Disable a feature on a formation :param feature: instance of Feature enum indicating 
which feature to disable :param formation: name of the formation to disable the feature on :return: None """ command = PGAutoCtl(self) command.execute( "disable %s" % feature.command(), "disable", feature.command(), "--formation", formation, ) def failover(self, formation="default", group=0): """ performs manual failover for given formation and group id """ failover_commmand_text = ( "select * from pgautofailover.perform_failover('%s', %s)" % (formation, group) ) failover_command = [ shutil.which("psql"), "-d", self.database, "-c", failover_commmand_text, ] self.vnode.run_and_wait(failover_command, name="manual failover") def print_state(self, formation="default"): print("pg_autoctl show state --pgdata %s" % self.datadir) command = PGAutoCtl(self) out, err, ret = command.execute( "show state", "show", "state", "--formation", formation ) print("%s" % out) def get_other_nodes(self, nodeid): """ Returns the list of the other nodes in the same formation/group. """ query = "select * from pgautofailover.get_other_nodes(%s)" return self.run_sql_query(query, nodeid) def check_ssl(self, ssl, sslmode): """ Checks if ssl settings match how the node is set up """ return super().check_ssl(ssl, sslmode, monitor=True) def get_events(self): """ Returns the current list of events from the monitor. """ last_events_query = ( "select eventtime, nodename, " "reportedstate, goalstate, " "reportedrepstate, reportedtli, reportedlsn, description " "from pgautofailover.last_events('default', count => 20)" ) if self.pg_is_running(): return self.run_sql_query(last_events_query) def run_sql_query(self, query, *args): """ Run a SQL query on the monitor. When exception OperationalError is raised, it might be a SEGFAULT on the Postgres side of things, within the pgautofailover extension. To help debug, then print the Postgres logs. """ try: return super().run_sql_query(query, *args) except psycopg2.OperationalError: # Did we SEGFAULT? let's see the Postgres logs. 
pglogs = self.get_postgres_logs() print(f"POSTGRES LOGS FOR {self.datadir}:\n{pglogs}\n") raise class PGAutoCtl: def __init__(self, pgnode, argv=None): self.vnode = pgnode.vnode self.datadir = pgnode.datadir self.pgnode = pgnode self.command = None self.program = shutil.which("pg_autoctl") if self.program is None: pg_config = shutil.which("pg_config") if pg_config is None: raise Exception( "Failed to find pg_config in %s" % os.environ["PATH"] ) else: # run pg_config --bindir p = subprocess.run( [pg_config, "--bindir"], text=True, capture_output=True ) bindir = p.stdout.splitlines()[0] self.program = os.path.join(bindir, "pg_autoctl") self.run_proc = None self.last_returncode = None self.out = "" self.err = "" self.cmd = "" if argv: self.command = [self.program] + argv def run(self, level="-vv", name=None, host=None, port=None): """ Runs our command in the background, returns immediately. The command could be `pg_autoctl run`, or another command. We could be given a full `pg_autoctl create postgres --run` command. """ if not self.command: self.command = [ self.program, "run", "--pgdata", self.datadir, level, ] if name: self.command += ["--name", name] if host: self.command += ["--hostname", host] if port: self.command += ["--pgport", port] self.cmd = " ".join(self.command) if self.run_proc: self.run_proc.release() self.run_proc = self.vnode.run_unmanaged(self.command) def execute(self, name, *args, timeout=COMMAND_TIMEOUT): """ Execute a single pg_autoctl command, wait for its completion. 
""" self.set_command(*args) self.cmd = " ".join(self.command) with self.vnode.run(self.command) as proc: try: out, err = self.pgnode.cluster.communicate(proc, timeout) except subprocess.TimeoutExpired: string_command = " ".join(self.command) self.pgnode.print_debug_logs() raise Exception( f"{name} timed out after {timeout} seconds.\n{string_command}\n", ) self.last_returncode = proc.returncode if proc.returncode > 0: raise CalledProcessError(proc.returncode, self.cmd, out, err) return out, err, proc.returncode def stop(self): """ Kills the keeper by sending a SIGTERM to keeper's process group. """ if self.run_proc and self.run_proc.pid: try: os.kill(self.run_proc.pid, signal.SIGTERM) return self.pgnode.cluster.communicate(self, COMMAND_TIMEOUT) except ProcessLookupError as e: self.run_proc = None print( "Failed to terminate pg_autoctl for %s: %s" % (self.datadir, e) ) return None, None, -1 else: return None, None, 0 def communicate(self, timeout=COMMAND_TIMEOUT): """ Read all data from the Unix PIPE This call is idempotent. If it is called a second time after an earlier successful call, then it returns the results from when the process exited originally. """ if not self.run_proc: return self.out, self.err self.out, self.err = self.run_proc.communicate(timeout=timeout) # The process exited, so let's clean this process up. Calling # communicate again would otherwise cause an "Invalid file object" # error. ret = self.run_proc.returncode self.run_proc.release() self.run_proc = None return self.out, self.err, ret def consume_output(self, secs): """ Read available lines from the process for some given seconds """ try: self.out, self.err, ret = self.communicate(timeout=secs) except subprocess.TimeoutExpired: # all good, we'll comme back pass return self.out, self.err def set_command(self, *args): """ Build the process command line, or use the one given at init time. 
""" if self.command: return self.command pgdata = ["--pgdata", self.datadir] self.command = [self.program] # add pgdata in the command BEFORE any -- arguments for arg in args: if arg == "--": self.command += pgdata self.command += [arg] # when no -- argument is used, append --pgdata option at the end if "--pgdata" not in self.command: self.command += pgdata return self.command def sighup(self): """ Send a SIGHUP signal to the pg_autoctl process """ if self.run_proc and self.run_proc.pid: os.kill(self.run_proc.pid, signal.SIGHUP) else: print("pg_autoctl process for %s is not running" % self.datadir) def sudo_mkdir_p(directory): """ Runs the command: sudo mkdir -p directory """ p = subprocess.Popen( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + os.getenv("PATH"), "mkdir", "-p", directory, ] ) assert p.wait(timeout=COMMAND_TIMEOUT) == 0 pg_auto_failover-1.6.3/tests/ssl_cert_utils.py000066400000000000000000000122731414244367200216340ustar00rootroot00000000000000# # SSL Certificate creation in the test environment # import subprocess import os, os.path, time, shutil class SSLCert: """ Calls openssl to generate SSL certificates and sign them. 
""" def __init__(self, directory, basename, CN): self.directory = directory self.basename = basename self.CN = CN self.csr = None self.key = None self.crt = None self.rootKey = None self.rootCert = None self.crl = None self.sudo_mkdir_p() def sudo_mkdir_p(self): if os.path.isdir(self.directory): return True p = subprocess.Popen( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + os.getenv("PATH"), "mkdir", "-p", self.directory, ] ) assert p.wait() == 0 def create_root_cert(self): # avoid bugs where we overwrite certificates in a given directory assert self.csr is None assert self.crt is None assert self.key is None self.csr = os.path.join(self.directory, "%s.csr" % self.basename) self.key = os.path.join(self.directory, "%s.key" % self.basename) self.crt = os.path.join(self.directory, "%s.crt" % self.basename) # first create a certificate signing request (CSR) and a public/private # key file print() p = subprocess.Popen( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + os.getenv("PATH"), "openssl", "req", "-new", "-nodes", "-text", "-out", self.csr, "-keyout", self.key, "-subj", self.CN, ] ) assert p.wait() == 0 p = subprocess.Popen(["chmod", "og-rwx", self.key]) assert p.wait() == 0 # Then, sign the request with the key to create a root certificate # authority p = subprocess.Popen( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + os.getenv("PATH"), "openssl", "x509", "-req", "-in", self.csr, "-text", "-days", "3650", "-extfile", "/etc/ssl/openssl.cnf", "-extensions", "v3_ca", "-signkey", self.key, "-out", self.crt, ] ) assert p.wait() == 0 def create_signed_certificate(self, rootSSLCert): # avoid bugs where we overwrite certificates in a given directory assert self.csr is None assert self.crt is None assert self.key is None self.rootKey = rootSSLCert.key self.rootCert = rootSSLCert.crt self.crl = rootSSLCert.crl self.crt = os.path.join(self.directory, "%s.crt" % self.basename) self.csr = os.path.join(self.directory, "%s.csr" % 
self.basename) self.key = os.path.join(self.directory, "%s.key" % self.basename) p = subprocess.Popen( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + os.getenv("PATH"), "openssl", "req", "-new", "-nodes", "-text", "-out", self.csr, "-keyout", self.key, "-subj", self.CN, ] ) assert p.wait() == 0 p = subprocess.Popen(["chmod", "og-rwx", self.key]) assert p.wait() == 0 p = subprocess.Popen( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + os.getenv("PATH"), "openssl", "x509", "-req", "-in", self.csr, "-text", "-days", "365", "-CA", self.rootCert, "-CAkey", self.rootKey, "-CAcreateserial", "-out", self.crt, ] ) assert p.wait() == 0 print("openssl verify -CAfile %s %s" % (self.rootCert, self.crt)) p = subprocess.Popen( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + os.getenv("PATH"), "openssl", "verify", "-show_chain", "-CAfile", self.rootCert, self.crt, ] ) assert p.wait() == 0 pg_auto_failover-1.6.3/tests/test_auth.py000066400000000000000000000050651414244367200205770ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover from nose.tools import * import os import re cluster = None node1 = None node2 = None replication_password = "streaming_password" monitor_password = "monitor_password" def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_000_create_monitor(): monitor = cluster.create_monitor("/tmp/auth/monitor", authMethod="md5") monitor.run() monitor.wait_until_pg_is_running() monitor.set_user_password("autoctl_node", "autoctl_node_password") monitor.create_formation("auth", kind="pgsql", secondary=True) def test_001_init_primary(): global node1 node1 = cluster.create_datanode( "/tmp/auth/node1", authMethod="md5", formation="auth" ) node1.create() node1.config_set("replication.password", replication_password) node1.run() node1.wait_until_pg_is_running() node1.set_user_password("pgautofailover_monitor", monitor_password) 
node1.set_user_password("pgautofailover_replicator", replication_password) assert node1.wait_until_state(target_state="single") def test_002_create_t1(): node1.run_sql_query("CREATE TABLE t1(a int)") node1.run_sql_query("INSERT INTO t1 VALUES (1), (2)") def test_003_init_secondary(): global node2 node2 = cluster.create_datanode( "/tmp/auth/node2", authMethod="md5", formation="auth" ) os.putenv("PGPASSWORD", replication_password) node2.create() node2.config_set("replication.password", replication_password) node2.run() assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") eq_( node1.get_synchronous_standby_names_local(), "ANY 1 (pgautofailover_standby_2)", ) def test_004_failover(): print() print("Calling pgautofailover.failover() on the monitor") cluster.monitor.failover(formation="auth") assert node2.wait_until_state(target_state="primary") eq_( node2.get_synchronous_standby_names_local(), "ANY 1 (pgautofailover_standby_1)", ) assert node1.wait_until_state(target_state="secondary") def test_005_logging_of_passwords(): logs = node2.logs() assert monitor_password not in logs assert "password=****" in logs # We are still logging passwords when the pguri is incomplete and when printing settings, # so assert that it's not there in other cases: assert not re.match( "^(?!primary_conninfo|Failed to find).*%s.*$" % replication_password, logs, ) pg_auto_failover-1.6.3/tests/test_basic_operation.py000066400000000000000000000231571414244367200230010ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover from nose.tools import raises, eq_ import time cluster = None monitor = None node1 = None node2 = None node3 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_000_create_monitor(): global monitor monitor = cluster.create_monitor("/tmp/basic/monitor") monitor.run() def test_001_init_primary(): global node1 node1 = 
cluster.create_datanode("/tmp/basic/node1") node1.create() # the name of the node should be "%s_%d" % ("node", node1.nodeid) eq_(node1.get_nodename(), "node_%d" % node1.get_nodeid()) # we can change the name on the monitor with pg_autoctl set node metadata node1.set_metadata(name="node a") eq_(node1.get_nodename(), "node a") node1.run() assert node1.wait_until_state(target_state="single") # we can also change the name directly in the configuration file node1.config_set("pg_autoctl.name", "a") # wait until the reload signal has been processed before checking time.sleep(2) eq_(node1.get_nodename(), "a") def test_002_stop_postgres(): node1.stop_postgres() assert node1.wait_until_pg_is_running() def test_003_create_t1(): node1.run_sql_query("CREATE TABLE t1(a int)") node1.run_sql_query("INSERT INTO t1 VALUES (1), (2)") def test_004_init_secondary(): global node2 node2 = cluster.create_datanode("/tmp/basic/node2") # register the node on the monitor with a first name for tests node2.create(name="node_b") eq_(node2.get_nodename(), "node_b") # now run the node and change its name again node2.run(name="b") time.sleep(1) eq_(node2.get_nodename(), "b") assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") eq_( node1.get_synchronous_standby_names_local(), "ANY 1 (pgautofailover_standby_2)", ) assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() def test_005_read_from_secondary(): results = node2.run_sql_query("SELECT * FROM t1") eq_(results, [(1,), (2,)]) @raises(Exception) def test_006_001_writes_to_node2_fail(): node2.run_sql_query("INSERT INTO t1 VALUES (3)") def test_006_002_read_from_secondary(): results = node2.run_sql_query("SELECT * FROM t1") assert results == [(1,), (2,)] def test_007_001_wait_until_primary(): assert node1.wait_until_state(target_state="primary") @raises(Exception) def test_007_002_maintenance_primary(): node1.enable_maintenance() # without --allow-failover, 
that fails def test_007_003_maintenance_primary(): assert node1.wait_until_state(target_state="primary") def test_007_004_maintenance_primary_allow_failover(): print() print("Enabling maintenance on node1, allowing failover") node1.enable_maintenance(allowFailover=True) assert node1.wait_until_state(target_state="maintenance") assert node2.wait_until_state(target_state="wait_primary") node2.check_synchronous_standby_names(ssn="") def test_007_005_disable_maintenance(): print() print("Disabling maintenance on node1") node1.disable_maintenance() assert node1.wait_until_pg_is_running() assert node1.wait_until_state(target_state="secondary") assert node2.wait_until_state(target_state="primary") node2.check_synchronous_standby_names( ssn="ANY 1 (pgautofailover_standby_1)" ) def test_008_001_enable_maintenance_secondary(): print() print("Enabling maintenance on node2") assert node2.wait_until_state(target_state="primary") node1.enable_maintenance() assert node1.wait_until_state(target_state="maintenance") node1.stop_postgres() node2.run_sql_query("INSERT INTO t1 VALUES (3)") def test_008_002_disable_maintenance_secondary(): print() print("Disabling maintenance on node2") node1.disable_maintenance() assert node1.wait_until_pg_is_running() assert node1.wait_until_state(target_state="secondary") assert node2.wait_until_state(target_state="primary") node2.check_synchronous_standby_names( ssn="ANY 1 (pgautofailover_standby_1)" ) # the rest of the tests expect node1 to be primary, make it so def test_009_failback(): print() monitor.failover() assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") eq_( node1.get_synchronous_standby_names_local(), "ANY 1 (pgautofailover_standby_2)", ) def test_010_fail_primary(): print() print("Injecting failure of node1") node1.fail() assert node2.wait_until_state(target_state="wait_primary") def test_011_writes_to_node2_succeed(): node2.run_sql_query("INSERT INTO t1 VALUES (4)") results = 
node2.run_sql_query("SELECT * FROM t1 ORDER BY a") eq_(results, [(1,), (2,), (3,), (4,)]) def test_012_start_node1_again(): node1.run() assert node2.wait_until_state(target_state="primary") eq_( node2.get_synchronous_standby_names_local(), "ANY 1 (pgautofailover_standby_1)", ) assert node1.wait_until_state(target_state="secondary") def test_013_read_from_new_secondary(): results = node1.run_sql_query("SELECT * FROM t1 ORDER BY a") eq_(results, [(1,), (2,), (3,), (4,)]) @raises(Exception) def test_014_writes_to_node1_fail(): node1.run_sql_query("INSERT INTO t1 VALUES (3)") def test_015_fail_secondary(): node1.fail() assert node2.wait_until_state(target_state="wait_primary") def test_016_drop_secondary(): node1.run() assert node1.wait_until_state(target_state="secondary") node1.drop() assert not node1.pg_is_running() assert node2.wait_until_state(target_state="single") # replication slot list should be empty now assert node2.has_needed_replication_slots() def test_017_add_new_secondary(): global node3 node3 = cluster.create_datanode("/tmp/basic/node3") node3.create() @raises(Exception) def test_018_cant_failover_yet(): monitor.failover() def test_019_run_secondary(): node3.run() assert node3.wait_until_state(target_state="secondary") assert node2.wait_until_state(target_state="primary") assert node2.has_needed_replication_slots() assert node3.has_needed_replication_slots() eq_( node2.get_synchronous_standby_names_local(), "ANY 1 (pgautofailover_standby_3)", ) # In previous versions of pg_auto_failover we removed the replication slot # on the secondary after failover. Now, we instead maintain the replication # slot's replay_lsn thanks for the monitor tracking of the nodes' LSN # positions. # # So rather than checking that we want to zero replication slots after # replication, we check that we still have a replication slot for the other # node. 
# def test_020_multiple_manual_failover_verify_replication_slots(): print() print("Calling pgautofailover.failover() on the monitor") monitor.failover() assert node2.wait_until_state(target_state="secondary") assert node3.wait_until_state(target_state="primary") assert node2.has_needed_replication_slots() assert node3.has_needed_replication_slots() eq_( node3.get_synchronous_standby_names_local(), "ANY 1 (pgautofailover_standby_2)", ) print("Calling pg_autoctl perform promotion on node 2") node2.perform_promotion() assert node2.wait_until_state(target_state="primary") eq_( node2.get_synchronous_standby_names_local(), "ANY 1 (pgautofailover_standby_3)", ) assert node3.wait_until_state(target_state="secondary") assert node2.has_needed_replication_slots() assert node3.has_needed_replication_slots() # # Now test network partition detection. Cut the primary out of the network # by means of `ifconfig down` on its virtual network interface, and then # after 30s the primary should demote itself, and the monitor should # failover to the secondary. 
# def test_021_ifdown_primary(): print() assert node2.wait_until_state(target_state="primary") eq_( node2.get_synchronous_standby_names_local(), "ANY 1 (pgautofailover_standby_3)", ) node2.ifdown() def test_022_detect_network_partition(): # wait for network partition detection to kick-in, allow some head-room timeout = 90 demoted = False while not demoted and timeout > 0: states = node2.get_local_state() demoted = states == ("demote_timeout", "demote_timeout") if demoted: break time.sleep(1) timeout -= 1 if node2.pg_is_running() or timeout <= 0: node2.print_debug_logs() raise Exception("test failed: node2 didn't stop running in 90s") print() assert not node2.pg_is_running() assert node3.wait_until_state(target_state="wait_primary") eq_(node3.get_synchronous_standby_names_local(), "") def test_023_ifup_old_primary(): print() node2.ifup() assert node2.wait_until_pg_is_running() assert node2.wait_until_state("secondary") assert node3.wait_until_state("primary") eq_( node3.get_synchronous_standby_names_local(), "ANY 1 (pgautofailover_standby_2)", ) def test_024_stop_postgres_monitor(): original_state = node3.get_state().reported monitor.stop_postgres() # allow trying twice to make Travis CI stable if not monitor.wait_until_pg_is_running(): assert monitor.wait_until_pg_is_running() print() assert node3.wait_until_state(target_state=original_state) def test_025_drop_primary(): node3.drop() assert not node3.pg_is_running() assert node2.wait_until_state(target_state="single") pg_auto_failover-1.6.3/tests/test_basic_operation_listen_flag.py000066400000000000000000000040471414244367200253450ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover from nose.tools import * cluster = None node1 = None node2 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_000_create_monitor(): monitor = cluster.create_monitor("/tmp/listen/monitor") monitor.run() def test_001_init_primary(): global 
node1 node1 = cluster.create_datanode("/tmp/listen/node1", listen_flag=True) node1.create() node1.run() assert node1.wait_until_state(target_state="single") node1.wait_until_pg_is_running() def test_002_create_t1(): node1.run_sql_query("CREATE TABLE t1(a int)") node1.run_sql_query("INSERT INTO t1 VALUES (1), (2)") def test_003_init_secondary(): global node2 node2 = cluster.create_datanode("/tmp/listen/node2", listen_flag=True) node2.create() node2.run() assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") def test_004_read_from_secondary(): results = node2.run_sql_query("SELECT * FROM t1") assert results == [(1,), (2,)] @raises(Exception) def test_005_writes_to_node2_fail(): node2.run_sql_query("INSERT INTO t1 VALUES (3)") def test_006_fail_primary(): node1.fail() assert node2.wait_until_state(target_state="wait_primary") def test_007_writes_to_node2_succeed(): node2.run_sql_query("INSERT INTO t1 VALUES (3)") results = node2.run_sql_query("SELECT * FROM t1") assert results == [(1,), (2,), (3,)] def test_008_start_node1_again(): node1.run() assert node2.wait_until_state(target_state="primary") assert node1.wait_until_state(target_state="secondary") def test_009_read_from_new_secondary(): results = node1.run_sql_query("SELECT * FROM t1") assert results == [(1,), (2,), (3,)] @raises(Exception) def test_010_writes_to_node1_fail(): node1.run_sql_query("INSERT INTO t1 VALUES (3)") def test_011_fail_secondary(): node1.fail() assert node2.wait_until_state(target_state="wait_primary") pg_auto_failover-1.6.3/tests/test_config_get_set.py000066400000000000000000000045441414244367200226160ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover from nose.tools import assert_raises, raises, eq_ import os import shutil import subprocess import time cluster = None monitor = None node1 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def 
test_000_create_monitor(): global monitor monitor = cluster.create_monitor("/tmp/config_test/monitor") monitor.run() def test_001_init_primary(): global node1 node1 = cluster.create_datanode("/tmp/config_test/node1") node1.create() # the name of the node should be "%s_%d" % ("node", node1.nodeid) eq_(node1.get_nodename(), "node_%d" % node1.get_nodeid()) # we can change the name on the monitor with pg_autoctl set node metadata node1.set_metadata(name="node a") eq_(node1.get_nodename(), "node a") node1.run() assert node1.wait_until_state(target_state="single") # we can also change the name directly in the configuration file node1.config_set("pg_autoctl.name", "a") # wait until the reload signal has been processed before checking time.sleep(2) eq_(node1.get_nodename(), "a") def test_002_config_set_monitor(): pg_ctl = monitor.config_get("postgresql.pg_ctl") # set something non-default to assert no side-effects later sslmode = "prefer" monitor.config_set("ssl.sslmode", sslmode) # set monitor config postgresql.pg_ctl to something invalid with assert_raises(subprocess.CalledProcessError): monitor.config_set("postgresql.pg_ctl", "invalid") # it should not get changed eq_(monitor.config_get("postgresql.pg_ctl"), pg_ctl) # try again with a keeper pg_ctl = node1.config_get("postgresql.pg_ctl") # set the keeper to something invalid with assert_raises(subprocess.CalledProcessError): node1.config_set("postgresql.pg_ctl", "invalid") # it should not get changed eq_(node1.config_get("postgresql.pg_ctl"), pg_ctl) # pg_ctl can be moved and `config set` will still operate. 
shutil.copy(pg_ctl, "/tmp/pg_ctl") monitor.config_set("postgresql.pg_ctl", "/tmp/pg_ctl") # "move" pg_ctl os.remove("/tmp/pg_ctl") monitor.config_set("postgresql.pg_ctl", pg_ctl) eq_(monitor.config_get("postgresql.pg_ctl"), pg_ctl) # no side effects eq_(monitor.config_get("ssl.sslmode"), sslmode) pg_auto_failover-1.6.3/tests/test_create_run.py000066400000000000000000000045061414244367200217640ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover import time cluster = None monitor = None node1 = None node2 = None node3 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_000_create_monitor(): global monitor monitor = cluster.create_monitor("/tmp/create-run/monitor") monitor.run() def test_001_init_primary(): global node1 node1 = cluster.create_datanode("/tmp/create-run/node1") node1.create(run=True) assert node1.wait_until_state(target_state="single") def test_002_create_t1(): node1.run_sql_query("CREATE TABLE t1(a int)") node1.run_sql_query("INSERT INTO t1 VALUES (1), (2)") def test_003_init_secondary(): global node2 node2 = cluster.create_datanode("/tmp/create-run/node2") node2.create(run=True) assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") def test_004_read_from_secondary(): results = node2.run_sql_query("SELECT * FROM t1") assert results == [(1,), (2,)] def test_005_maintenance(): node2.enable_maintenance() assert node2.wait_until_state(target_state="maintenance") node2.fail() node1.run_sql_query("INSERT INTO t1 VALUES (3)") node2.run() node2.disable_maintenance() assert node2.wait_until_pg_is_running() assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") def test_006_fail_primary(): node1.fail() assert node2.wait_until_state(target_state="wait_primary", timeout=180) def test_007_start_node1_again(): node1.create(run=True) assert 
node2.wait_until_state(target_state="primary") assert node1.wait_until_state(target_state="secondary") def test_008_read_from_new_secondary(): results = node1.run_sql_query("SELECT * FROM t1 ORDER BY a") assert results == [(1,), (2,), (3,)] def test_009_fail_secondary(): node1.fail() assert node2.wait_until_state(target_state="wait_primary") def test_010_drop_secondary(): node1.run() assert node1.wait_until_state(target_state="secondary") node1.drop() time.sleep(2) # avoid timing issue assert not node1.pg_is_running() assert node2.wait_until_pg_is_running() assert node2.wait_until_state(target_state="single") pg_auto_failover-1.6.3/tests/test_create_standby_with_pgdata.py000066400000000000000000000061541414244367200252000ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover from nose.tools import * import os import subprocess cluster = None node1 = None node2 = None node3 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_000_create_monitor(): monitor = cluster.create_monitor("/tmp/sb-from-pgdata/monitor") monitor.run() def test_001_init_primary(): global node1 node1 = cluster.create_datanode("/tmp/sb-from-pgdata/node1") node1.create() node1.run() assert node1.wait_until_state(target_state="single") node1.wait_until_pg_is_running() def test_002_create_t1(): node1.run_sql_query("CREATE TABLE t1(a int)") node1.run_sql_query("INSERT INTO t1 VALUES (1), (2)") def test_003_init_secondary(): global node2 # fail the registration of a node2 by using a PGDATA directory that has # already been created, with another system_identifier (initdb creates a # new one each time) p = subprocess.Popen( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + os.getenv("PATH"), "pg_ctl", "initdb", "-s", "-D", "/tmp/sb-from-pgdata/node2", ] ) assert p.wait() == 0 node2 = cluster.create_datanode("/tmp/sb-from-pgdata/node2") @raises(Exception) def test_004_create_raises_error(): try: 
node2.create() except Exception as e: # we want to see the failure here print(e) raise def test_005_cleanup_after_failure(): print("Failed as expected, cleaning up") print("rm -rf /tmp/sb-from-pgdata/node2") p = subprocess.Popen( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + os.getenv("PATH"), "rm", "-rf", "/tmp/sb-from-pgdata/node2", ] ) assert p.wait() == 0 def test_006_init_secondary(): global node3 # create node3 from a manual copy of node1 to test creating a standby # from an existing PGDATA (typically PGDATA would be deployed from a # backup and recovery mechanism) p = subprocess.Popen( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + os.getenv("PATH"), "cp", "-a", "/tmp/sb-from-pgdata/node1", "/tmp/sb-from-pgdata/node3", ] ) assert p.wait() == 0 os.remove("/tmp/sb-from-pgdata/node3/postmaster.pid") node3 = cluster.create_datanode("/tmp/sb-from-pgdata/node3") node3.create() node3.run() cluster.monitor.print_state() assert node3.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") def test_007_failover(): print() print("Calling pgautofailover.failover() on the monitor") cluster.monitor.failover() assert node3.wait_until_state(target_state="primary") assert node1.wait_until_state(target_state="secondary") pg_auto_failover-1.6.3/tests/test_debian_clusters.py000066400000000000000000000060411414244367200227770ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover import os.path import subprocess cluster = None monitor = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_000_create_monitor(): global monitor print() monitor_path = cluster.pg_createcluster("monitor", port=6000) postgres_conf_path = os.path.join(monitor_path, "postgresql.conf") # verify postgresql.conf is not in data directory assert not os.path.exists(postgres_conf_path) monitor = cluster.create_monitor(monitor_path, port=6000) 
monitor.create(level="-vv") monitor.run(port=6000) monitor.wait_until_pg_is_running() # verify postgresql.conf is in data directory now assert os.path.exists(postgres_conf_path) pgversion = os.getenv("PGVERSION") p = subprocess.run( [ "ls", "-ld", monitor_path, "/var/lib/postgresql/%s" % pgversion, "/etc/postgresql/%s" % pgversion, "/etc/postgresql/%s/monitor" % pgversion, "/etc/postgresql/%s/monitor/postgresql.conf" % pgversion, "/etc/postgresql/%s/monitor/pg_hba.conf" % pgversion, "/etc/postgresql/%s/monitor/pg_ident.conf" % pgversion, ], text=True, capture_output=True, ) print("%s" % p.stdout) def test_001_custom_single(): global node1 print() node1_path = cluster.pg_createcluster("debian_node1", port=6001) postgres_conf_path = os.path.join(node1_path, "postgresql.conf") # verify postgresql.conf is not in data directory assert not os.path.exists(postgres_conf_path) node1 = cluster.create_datanode(node1_path, port=6001, listen_flag=True) node1.create(level="-vv") # verify postgresql.conf is in data directory now assert os.path.exists(postgres_conf_path) pgversion = os.getenv("PGVERSION") p = subprocess.run( [ "ls", "-ld", node1_path, "/var/lib/postgresql/%s" % pgversion, "/etc/postgresql/%s" % pgversion, "/etc/postgresql/%s/debian_node1" % pgversion, "/etc/postgresql/%s/debian_node1/postgresql.conf" % pgversion, "/etc/postgresql/%s/debian_node1/pg_hba.conf" % pgversion, "/etc/postgresql/%s/debian_node1/pg_ident.conf" % pgversion, ], text=True, capture_output=True, ) print("%s" % p.stdout) monitor.print_state() def test_002_chmod_debian_data_directory(): # debian installs the following ownership and permissions: # # drwxr-xr-x 5 postgres postgres ... /var/lib/postgresql/11 # drwx------ 20 docker postgres ... /var/lib/postgresql/11/monitor # drwx------ 20 docker postgres ... 
/var/lib/postgresql/11/debian_node1 # # we need to give the postgres group the w on the top-level directory pgversion = os.getenv("PGVERSION") p = subprocess.Popen( ["chmod", "go+w", "/var/lib/postgresql/%s" % pgversion] ) assert p.wait() == 0 pg_auto_failover-1.6.3/tests/test_enable_ssl.py000066400000000000000000000173311414244367200217440ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover import ssl_cert_utils as cert import subprocess import os import time cluster = None monitor = None node1 = None node2 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() # remove client side setup for certificates too client_top_directory = os.path.join(os.getenv("HOME"), ".postgresql") p = subprocess.Popen( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + os.getenv("PATH"), "rm", "-rf", client_top_directory, ] ) assert p.wait() == 0 # also remove certificates we created for the servers p = subprocess.run( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + os.getenv("PATH"), "rm", "-rf", "/tmp/certs", ] ) assert p.returncode == 0 def test_000_create_monitor(): global monitor monitor = cluster.create_monitor("/tmp/enable/monitor") monitor.run() monitor.wait_until_pg_is_running() monitor.check_ssl("off", "prefer") def test_001_init_primary(): global node1 node1 = cluster.create_datanode("/tmp/enable/node1") node1.create() node1.run() assert node1.wait_until_state(target_state="single") node1.wait_until_pg_is_running() node1.check_ssl("off", "prefer", primary=True) def test_002_create_t1(): node1.run_sql_query("CREATE TABLE t1(a int)") node1.run_sql_query("INSERT INTO t1 VALUES (1), (2)") def test_003_init_secondary(): global node2 node2 = cluster.create_datanode("/tmp/enable/node2") node2.create() node2.run() assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") node2.check_ssl("off", "prefer") def test_004_maintenance(): 
print() print("Enabling maintenance on node2") node2.enable_maintenance() assert node2.wait_until_state(target_state="maintenance") def test_005_enable_ssl_monitor(): monitor.enable_ssl(sslSelfSigned=True, sslMode="require") monitor.sleep(2) # we signaled, wait some time monitor.check_ssl("on", "require") def test_006_enable_ssl_primary(): # we stop pg_autoctl to make it easier for the test to be reliable # without too much delay/sleep hacking; when doing the `pg_autoctl # enable ssl` online we need to make sure the signal made it to the # running process and then was acted upon node1.stop_pg_autoctl() node1.enable_ssl(sslSelfSigned=True, sslMode="require") node1.run() node1.wait_until_pg_is_running() node1.check_ssl("on", "require", primary=True) def test_007_enable_ssl_secondary(): node2.stop_pg_autoctl() node2.enable_ssl(sslSelfSigned=True, sslMode="require") node2.run() node2.wait_until_pg_is_running() node2.check_ssl("on", "require") def test_008_disable_maintenance(): print("Disabling maintenance on node2") node2.disable_maintenance() assert node2.wait_until_pg_is_running() assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") # upgrade to verify full def test_009_enable_maintenance(): print() print("Enabling maintenance on node2") node2.enable_maintenance() assert node2.wait_until_state(target_state="maintenance") def test_010_enable_ssl_verify_ca_monitor(): client_top_directory = os.path.join(os.getenv("HOME"), ".postgresql") print() print("Creating cluster root certificate") cluster.create_root_cert( client_top_directory, basename="root", CN="/CN=root.pgautofailover.ca" ) p = subprocess.run( [ "ls", "-ld", client_top_directory, cluster.cert.crt, cluster.cert.csr, cluster.cert.key, ], text=True, capture_output=True, ) print("%s" % p.stdout) # now create and sign the CLIENT certificate print("Creating cluster client certificate") clientCert = cert.SSLCert( client_top_directory, basename="postgresql", 
CN="/CN=autoctl_node" ) clientCert.create_signed_certificate(cluster.cert) p = subprocess.run( [ "ls", "-ld", client_top_directory, clientCert.crt, clientCert.csr, clientCert.key, ], text=True, capture_output=True, ) print("%s" % p.stdout) # the root user also needs the certificates, tests are connecting with it subprocess.run(["ln", "-s", client_top_directory, "/root/.postgresql"]) assert p.returncode == 0 p = subprocess.run( ["ls", "-l", "/root/.postgresql"], text=True, capture_output=True ) print("%s" % p.stdout) # now create and sign the SERVER certificate for the monitor print("Creating monitor server certificate") monitorCert = cert.SSLCert( "/tmp/certs/monitor", "server", "/CN=monitor.pgautofailover.ca" ) monitorCert.create_signed_certificate(cluster.cert) p = subprocess.run( [ "ls", "-ld", client_top_directory, cluster.cert.crt, cluster.cert.csr, cluster.cert.key, clientCert.crt, clientCert.csr, clientCert.key, monitorCert.crt, monitorCert.csr, monitorCert.key, ], text=True, capture_output=True, ) print("%s" % p.stdout) monitor.enable_ssl( sslCAFile=cluster.cert.crt, sslServerKey=monitorCert.key, sslServerCert=monitorCert.crt, sslMode="verify-ca", ) monitor.sleep(2) # we signaled, wait some time monitor.check_ssl("on", "verify-ca") def test_011_enable_ssl_verify_ca_primary(): node1Cert = cert.SSLCert( "/tmp/certs/node1", "server", "/CN=node1.pgautofailover.ca" ) node1Cert.create_signed_certificate(cluster.cert) node1.stop_pg_autoctl() node1.enable_ssl( sslCAFile=cluster.cert.crt, sslServerKey=node1Cert.key, sslServerCert=node1Cert.crt, sslMode="verify-ca", ) node1.run() node1.wait_until_pg_is_running() node1.check_ssl("on", "verify-ca", primary=True) def test_012_enable_ssl_verify_ca_secondary(): node2Cert = cert.SSLCert( "/tmp/certs/node2", "server", "/CN=node2.pgautofailover.ca" ) node2Cert.create_signed_certificate(cluster.cert) node2.stop_pg_autoctl() node2.enable_ssl( sslCAFile=cluster.cert.crt, sslServerKey=node2Cert.key, sslServerCert=node2Cert.crt, 
sslMode="verify-ca", ) node2.run() node2.wait_until_pg_is_running() node2.check_ssl("on", "verify-ca") def test_013_disable_maintenance(): print("Disabling maintenance on node2") node2.disable_maintenance() assert node2.wait_until_pg_is_running() assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") def test_014_enable_ssl_require_primary(): node1Cert = cert.SSLCert( "/tmp/certs/node1", "server", "/CN=node1.pgautofailover.ca" ) node1Cert.create_signed_certificate(cluster.cert) node1.enable_ssl( sslServerKey=node1Cert.key, sslServerCert=node1Cert.crt, sslMode="require", ) node1.pg_autoctl.sighup() time.sleep(6) # to avoid flackyness here, we allow a second run/timeout of waiting if not node1.wait_until_pg_is_running(): assert node1.wait_until_pg_is_running() node1.check_ssl("on", "require", primary=True) pg_auto_failover-1.6.3/tests/test_ensure.py000066400000000000000000000054431414244367200211370ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover from nose.tools import * import time import os.path cluster = None node1 = None node2 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_000_create_monitor(): monitor = cluster.create_monitor("/tmp/ensure/monitor") monitor.run() monitor.wait_until_pg_is_running() def test_001_init_primary(): global node1 node1 = cluster.create_datanode("/tmp/ensure/node1") print() print("create node1") node1.create() print("stop postgres") node1.stop_postgres() print("run node1") node1.run() print("wait until Postgres is running") node1.wait_until_pg_is_running() assert node1.wait_until_state(target_state="single") def test_002_create_t1(): node1.run_sql_query("CREATE TABLE t1(a int)") node1.run_sql_query("INSERT INTO t1 VALUES (1), (2)") def test_003_init_secondary(): global node2 node2 = cluster.create_datanode("/tmp/ensure/node2") node2.create() node2.stop_postgres() node2.run() 
assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") def test_004_demoted(): print() node1.stop_postgres() node1.stop_pg_autoctl() # we need the pg_autoctl process to run to reach the state demoted, # otherwise the monitor assigns that state to node1 but we never reach # it print("stopped pg_autoctl and postgres, now waiting for 30s") node2.sleep(30) node1.run() # We must not wait for PG to run, since otherwise we might miss the demoted # state assert node1.wait_until_state(target_state="demoted") # ideally we should be able to check that we refrain from starting # postgres again before calling the transition function print("re-starting pg_autoctl on node1") assert node1.wait_until_state(target_state="secondary") def test_005_inject_error_in_node2(): assert node2.wait_until_state(target_state="primary") # break Postgres setup on the primary, and restart Postgres: then # Postgres keeps failing to start, and pg_autoctl still communicates # with the monitor, which should still orchestrate a failover. 
pgconf = os.path.join(node2.datadir, "postgresql.conf") with open(pgconf, "a+") as f: f.write("\n") f.write("shared_preload_libraries='wrong_extension'\n") node2.restart_postgres() # the first step is the promotion of the other node as the new primary: assert node1.wait_until_state("wait_primary") # then when the failover happens, the new primary postgresql.conf gets # copied over, and we get the nodes back to primary/secondary assert node2.wait_until_state("secondary") assert node1.wait_until_state("primary") pg_auto_failover-1.6.3/tests/test_extension_update.py000066400000000000000000000020221414244367200232020ustar00rootroot00000000000000import os import time import pgautofailover_utils as pgautofailover from nose.tools import eq_ cluster = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.monitor.stop_pg_autoctl() cluster.destroy() def test_000_create_monitor(): monitor = cluster.create_monitor("/tmp/update/monitor") def test_001_update_extension(): os.environ["PG_AUTOCTL_DEBUG"] = "1" os.environ["PG_AUTOCTL_EXTENSION_VERSION"] = "dummy" cluster.monitor.run() cluster.monitor.wait_until_pg_is_running() # Wait until extension is installed time.sleep(1) results = cluster.monitor.run_sql_query( """SELECT installed_version FROM pg_available_extensions WHERE name = 'pgautofailover' """ ) if results[0][0] != "dummy": cluster.monitor.print_debug_logs() eq_(results, [("dummy",)]) del os.environ["PG_AUTOCTL_EXTENSION_VERSION"] assert "PG_AUTOCTL_EXTENSION_VERSION" not in os.environ pg_auto_failover-1.6.3/tests/test_installcheck.py000066400000000000000000000036131414244367200222770ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover from nose.tools import * import subprocess import shutil import os import os.path cluster = None node1 = None node2 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_000_create_monitor(): 
monitor = cluster.create_monitor("/tmp/check/monitor") monitor.run() monitor.wait_until_pg_is_running() def test_001_add_hba_entry(): with open(os.path.join("/tmp/check/monitor", "pg_hba.conf"), "a") as hba: hba.write("host all all %s trust\n" % cluster.networkSubnet) # print() # with open(os.path.join("/tmp/check/monitor", "pg_hba.conf"), "r") as hba: # lines = hba.readlines() # for line in lines[-10:]: # print("%s" % line[:-1]) cluster.monitor.reload_postgres() def test_002_make_installcheck(): # support both the local Dockerfile and also Travis build environments if "TRAVIS_BUILD_DIR" in os.environ: topdir = os.environ["TRAVIS_BUILD_DIR"] else: topdir = "/usr/src/pg_auto_failover" p = subprocess.Popen( [ "sudo", shutil.which("chmod"), "-R", "go+w", os.path.join(topdir, "src/monitor"), ] ) assert p.wait() == 0 p = subprocess.Popen( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + os.getenv("PATH"), "PGHOST=" + str(cluster.monitor.vnode.address), "make", "-C", os.path.join(topdir, "src/monitor"), "installcheck", ] ) if p.wait() != 0: diff = os.path.join(topdir, "src/monitor/regression.diffs") with open(diff, "r") as d: print("%s" % d.read()) raise Exception("make installcheck failed") pg_auto_failover-1.6.3/tests/test_monitor_disabled.py000066400000000000000000000050241414244367200231470ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover from nose.tools import * import os import json cluster = None node1 = None node2 = None node3 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_001_init_primary(): global node1 node1 = cluster.create_datanode("/tmp/no-monitor/node1") node1.create(monitorDisabled=True, host=str(node1.vnode.address), nodeId=1) node1.run(name="a") def test_002_init_to_single(): node1.do_fsm_assign("single") def test_003_create_t1(): node1.run_sql_query("CREATE TABLE t1(a int)") node1.run_sql_query("INSERT INTO t1 VALUES (1), (2)") def 
test_004_init_secondary(): global node2 node2 = cluster.create_datanode("/tmp/no-monitor/node2") node2.create(monitorDisabled=True, host=str(node2.vnode.address), nodeId=2) node2.run(name="b") def test_005_fsm_nodes_set(): nodesArray = [node1.jsDict("0/1", True), node2.jsDict("0/1", False)] node1.do_fsm_nodes_set(nodesArray) node2.do_fsm_nodes_set(nodesArray) def test_006_init_to_wait_standby(): node2.do_fsm_assign("wait_standby") def test_007_catchingup(): node1.do_fsm_assign("wait_primary") node2.do_fsm_assign("catchingup") def test_008_secondary(): node1.do_fsm_assign("primary") node2.do_fsm_assign("secondary") eq_(node1.get_synchronous_standby_names_local(), "*") def test_009_init_secondary(): global node3 node3 = cluster.create_datanode("/tmp/no-monitor/node3") node3.create(monitorDisabled=True, host=str(node3.vnode.address), nodeId=3) node3.run(name="c") def test_010_fsm_nodes_set(): LSN1 = node1.run_sql_query("select pg_current_wal_flush_lsn()")[0][0] LSN2 = node2.run_sql_query("select pg_last_wal_receive_lsn()")[0][0] nodesArray = [ node1.jsDict(LSN1, True), node2.jsDict(LSN2, False), node3.jsDict("0/1", False), ] node1.do_fsm_nodes_set(nodesArray) node2.do_fsm_nodes_set(nodesArray) node3.do_fsm_nodes_set(nodesArray) def test_011_init_to_wait_standby(): node1.do_fsm_assign("primary") node3.do_fsm_assign("wait_standby") eq_(node1.get_synchronous_standby_names_local(), "*") def test_012_catchingup(): node3.do_fsm_assign("catchingup") eq_(node1.get_synchronous_standby_names_local(), "*") def test_013_secondary(): node3.do_fsm_assign("secondary") node1.do_fsm_assign("primary") # no monitor: use the generic value '*' eq_(node1.get_synchronous_standby_names_local(), "*") pg_auto_failover-1.6.3/tests/test_multi_alternate_primary_failures.py000066400000000000000000000160521414244367200264620ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover from nose.tools import raises, eq_ import time import os.path cluster = None monitor = None node1 = 
None node2 = None node3 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_000_create_monitor(): global monitor monitor = cluster.create_monitor( "/tmp/multi_alternate_primary_failures/monitor" ) monitor.run() monitor.wait_until_pg_is_running() def test_001_init_primary(): global node1 node1 = cluster.create_datanode( "/tmp/multi_alternate_primary_failures/node1" ) node1.create() node1.run() print() assert node1.wait_until_state(target_state="single") def test_002_001_add_two_standbys(): global node2 node2 = cluster.create_datanode( "/tmp/multi_alternate_primary_failures/node2" ) node2.create() node2.run() node2.wait_until_pg_is_running() print() assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() # with one standby, we have number_sync_standbys set to 0 still assert node1.get_number_sync_standbys() == 0 def test_002_002_add_two_standbys(): global node3 node3 = cluster.create_datanode( "/tmp/multi_alternate_primary_failures/node3" ) node3.create() node3.run() node3.wait_until_pg_is_running() print() assert node3.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() assert node3.has_needed_replication_slots() # with two standbys, we have number_sync_standbys set to 1 assert node1.get_number_sync_standbys() == 1 # # In this test series, we have # # node1 node2 node3 # primary secondary secondary # # demoted primary secondary # # demoted draining report_lsn # # demoted primary secondary # # secondary primary secondary # def test_003_001_stop_primary(): # verify that node1 is primary and stop it assert node1.get_state().assigned == "primary" node1.fail() # wait for node2 to become the new primary print() assert 
node1.wait_until_assigned_state(target_state="demoted") assert node2.wait_until_state(target_state="primary") def test_003_002_stop_primary(): # verify that node2 is primary and stop it assert node2.get_state().assigned == "primary" node2.fail() # node3 can't be promoted when it's the only one reporting its LSN print() assert node2.wait_until_assigned_state(target_state="draining") assert node3.wait_until_state(target_state="report_lsn") # check that node3 stays at report_lsn and doesn't go to wait_primary node3.sleep(5) assert node3.wait_until_state(target_state="report_lsn") def test_003_003_bringup_last_failed_primary(): # Restart node2 node2.run() # Now node 2 should become primary print() assert node2.wait_until_state(target_state="primary") assert node3.wait_until_state(target_state="secondary") def test_003_004_bringup_first_failed_primary(): # Restart node1 node1.run() node3.wait_until_pg_is_running() # Now node 1 should become secondary print() assert node1.wait_until_state(target_state="secondary") assert node2.get_state().assigned == "primary" assert node3.get_state().assigned == "secondary" assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() assert node3.has_needed_replication_slots() # # In this test series , we have # # node1 node2 node3 # secondary primary secondary # # primary demoted secondary # # draining demoted report_lsn # # draining secondary primary # # secondary secondary primary # def test_005_001_fail_primary_again(): # verify that node2 is primary and stop it assert node2.get_state().assigned == "primary" node2.fail() print() assert node2.wait_until_assigned_state( target_state="demote_timeout", timeout=120 ) assert node2.wait_until_assigned_state(target_state="demoted", timeout=120) assert node1.wait_until_assigned_state(target_state="primary", timeout=120) assert node1.wait_until_state(target_state="primary", timeout=120) assert node3.wait_until_state(target_state="secondary", timeout=120) def 
test_005_002_fail_primary_again(): # verify that node1 is primary and stop it assert node1.get_state().assigned == "primary" node1.fail() print() assert node1.wait_until_assigned_state(target_state="draining") assert node3.wait_until_assigned_state(target_state="report_lsn") def test_005_003_bring_up_first_failed_primary(): # Restart node2 node2.run() print() assert node2.wait_until_state(target_state="demoted") # Now node 2 should become secondary assert node2.wait_until_state(target_state="secondary") assert node3.wait_until_state(target_state="primary") def test_005_004_bring_up_last_failed_primary(): # Restart node1 node1.run() node1.wait_until_pg_is_running() # Now node 3 should become secondary print() assert node1.wait_until_state(target_state="secondary") assert node3.get_state().assigned == "primary" assert node2.get_state().assigned == "secondary" # # In this test series , we have # # node1 node2 node3 # secondary secondary primary # # primary secondary demoted # # # demoted primary secondary # # secondary primary secondary # def test_006_001_fail_primary(): assert node3.get_state().assigned == "primary" node3.fail() print() assert node3.wait_until_assigned_state( target_state="demote_timeout", timeout=120 ) assert node3.wait_until_assigned_state(target_state="demoted", timeout=120) assert node1.wait_until_assigned_state(target_state="primary", timeout=120) assert node1.wait_until_state(target_state="primary", timeout=120) assert node2.wait_until_state(target_state="secondary", timeout=120) def test_006_002_fail_new_primary(): assert node1.get_state().assigned == "primary" node1.fail() node3.run() print() assert node2.wait_until_state(target_state="primary", timeout=120) assert node3.wait_until_state(target_state="secondary", timeout=120) def test_006_003_bringup_last_failed_primary(): node1.run() print() assert node1.wait_until_state(target_state="secondary", timeout=120) assert node2.wait_until_state(target_state="primary", timeout=120) assert 
node3.wait_until_state(target_state="secondary", timeout=120) pg_auto_failover-1.6.3/tests/test_multi_async.py000066400000000000000000000270241414244367200221640ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover from nose.tools import raises, eq_ import time import subprocess import os.path cluster = None monitor = None node1 = None node2 = None node3 = None node4 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_000_create_monitor(): global monitor # test creating the monitor in an existing empty directory p = subprocess.Popen( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + os.getenv("PATH"), "mkdir", "-p", "/tmp/multi_async/monitor", ] ) assert p.wait() == 0 monitor = cluster.create_monitor("/tmp/multi_async/monitor") monitor.run() monitor.wait_until_pg_is_running() def test_001_init_primary(): global node1 node1 = cluster.create_datanode("/tmp/multi_async/node1") node1.create() node1.run() assert node1.wait_until_state(target_state="single") def test_002_add_standby(): global node2 node2 = cluster.create_datanode("/tmp/multi_async/node2") node2.create() node2.run() assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() # make sure we reached primary on node1 before next tests assert node1.wait_until_state(target_state="primary") def test_003_add_standby(): global node3 node3 = cluster.create_datanode("/tmp/multi_async/node3") node3.create(level="-vv", replicationQuorum=False) node3.run() assert node3.wait_until_state(target_state="secondary") assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() assert node3.has_needed_replication_slots() # the formation 
number_sync_standbys is expected to still be zero now eq_(node1.get_number_sync_standbys(), 0) # make sure we reached primary on node1 before next tests assert node1.wait_until_state(target_state="primary") def test_004_set_async(): # now we set the whole formation to async node1.set_number_sync_standbys(0) eq_(node1.get_number_sync_standbys(), 0) print() assert node1.wait_until_state(target_state="primary") assert node1.set_replication_quorum("false") # primary assert node2.set_replication_quorum("false") # secondary assert node3.set_replication_quorum("false") # secondary def test_005_write_into_primary(): node1.run_sql_query("CREATE TABLE t1(a int)") node1.run_sql_query("INSERT INTO t1 VALUES (1), (2), (3), (4)") node1.run_sql_query("CHECKPOINT") results = node1.run_sql_query("SELECT * FROM t1") assert results == [(1,), (2,), (3,), (4,)] def test_006_async_failover(): print() print("Calling pgautofailover.failover() on the monitor") node2.perform_promotion() assert node1.wait_until_state(target_state="secondary") # secondary assert node3.wait_until_state(target_state="secondary") # secondary assert node2.wait_until_state(target_state="primary") # primary def test_007_read_from_new_primary(): results = node2.run_sql_query("SELECT * FROM t1") assert results == [(1,), (2,), (3,), (4,)] # # The next tests prepare a test-case where at promotion time an async # standby is first driven to SECONDARY, and then other sync standby nodes in # REPORT_LSN move forward. We had a bug where the REPORT_LSN nodes would be # stuck with the primary node being in the WAIT_PRIMARY/PRIMARY state. 
# def test_008_set_sync_async(): print() assert node1.set_replication_quorum("true") # secondary assert node2.set_replication_quorum("true") # primary assert node3.set_replication_quorum("false") # secondary assert node3.set_candidate_priority(0) assert node2.wait_until_state(target_state="primary") def test_009_add_sync_standby(): global node4 node4 = cluster.create_datanode("/tmp/multi_async/node4") node4.create() node4.run() assert node1.wait_until_state(target_state="secondary") assert node2.wait_until_state(target_state="primary") assert node3.wait_until_state(target_state="secondary") assert node4.wait_until_state(target_state="secondary") assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() assert node3.has_needed_replication_slots() assert node4.has_needed_replication_slots() # the formation number_sync_standbys is expected to be incremented, we # now have two standby nodes that participate in the replication quorum # (node1 and node4) eq_(node2.get_number_sync_standbys(), 1) # make sure we reached primary on node1 before next tests assert node2.wait_until_state(target_state="primary") def test_010_promote_node1(): print() print("Calling pgautofailover.perform_promotion(node1) on the monitor") # we don't use node1.perform_promotion() here because using the # pg_autoctl client means we would listen to notification and get back # to the rest of the code when the promotion is all over with # # we need to take control way before that, so just trigger the failover # and get back to controling our test case. 
q = "select pgautofailover.perform_promotion('default', 'node_1')" monitor.run_sql_query(q) def test_011_ifdown_node4_at_reportlsn(): print() assert node4.wait_until_state(target_state="report_lsn") node4.ifdown() assert node3.wait_until_state(target_state="secondary") def test_012_ifup_node4(): node4.ifup() print() assert node3.wait_until_state(target_state="secondary") assert node4.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") assert node2.wait_until_state(target_state="secondary") def test_013_drop_node4(): node4.destroy() print() assert node1.wait_until_state(target_state="primary") assert node2.wait_until_state(target_state="secondary") assert node3.wait_until_state(target_state="secondary") # we have only one standby participating in the quorum now eq_(node1.get_number_sync_standbys(), 0) # # A series of test where we fail primary and candidate secondary node and # all is left is a secondary that is not a candidate for failover. # # In the first series 014_0xx the demoted primary comes back first and needs # to fetch missing LSNs from node3 because it might have missed some # transactions. 
# # node1 node2 node3 # primary secondary secondary # # demoted wait_primary secondary # # demoted demoted report_lsn # # wait_primary demoted secondary (fast_forward → primary) # # primary secondary secondary # def test_014_001_fail_node1(): node1.fail() # first we have a 30s timeout for the monitor to decide that node1 is # down; then we have another 30s timeout at stop_replication waiting for # demote_timeout, so let's give it 120s there and step in the middle first assert node2.wait_until_state(target_state="stop_replication", timeout=120) assert node2.wait_until_state(target_state="wait_primary", timeout=120) assert node3.wait_until_state(target_state="secondary") node2.check_synchronous_standby_names(ssn="") def test_014_002_stop_new_primary_node2(): node2.fail() print() assert node3.wait_until_state(target_state="report_lsn") def test_014_003_restart_node1(): node1.run() # node1 used to be primary, now demoted, and meanwhile node2 was primary # node1 is assigned report_lsn and then is selected (only node with # candidate priority > 0) ; and thus needs to go through fast_forward assert node1.wait_until_assigned_state(target_state="fast_forward") assert node1.wait_until_state(target_state="stop_replication") assert node1.wait_until_state(target_state="wait_primary") assert node3.wait_until_state(target_state="secondary") def test_014_004_restart_node2(): node2.run() assert node2.wait_until_state(target_state="secondary") assert node3.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") # # Test 15 is like test 14, though we inverse the restarting of the failed # nodes, first the new primary and then very old primary. 
# # node1 node2 node3 # primary secondary secondary # # demoted wait_primary secondary # # demoted demoted report_lsn # # demoted wait_primary secondary # # secondary primary secondary # def test_015_001_fail_primary_node1(): node1.fail() # first we have a 30s timeout for the monitor to decide that node1 is # down; then we have another 30s timeout at stop_replication waiting for # demote_timout, so let's give it 120s there assert node2.wait_until_state(target_state="wait_primary", timeout=120) assert node3.wait_until_state(target_state="secondary") node2.check_synchronous_standby_names(ssn="") def test_015_002_fail_new_primary_node2(): node2.fail() print() assert node3.wait_until_state(target_state="report_lsn") def test_015_003_restart_node2(): node2.run() # restart the previous primary, it re-joins as a (wannabe) primary # because the only secondary has candidatePriority = 0, it's wait_primary assert node2.wait_until_state(target_state="wait_primary") assert node3.wait_until_state(target_state="secondary") time.sleep(5) assert not node2.get_state().assigned == "primary" def test_015_004_restart_node1(): node1.run() assert node3.wait_until_state(target_state="secondary") assert node2.wait_until_state(target_state="primary") assert node1.wait_until_state(target_state="secondary") # # When after loosing both secondary nodes, the one that's back online has # candidate priority set to zero, then we should remain in wait_primary # state. 
# def test_016_001_fail_node3(): node3.fail() assert node3.wait_until_assigned_state(target_state="catchingup") def test_016_002_fail_node1(): node1.fail() assert node2.wait_until_state(target_state="wait_primary") def test_016_003_restart_node3(): node3.run() assert node3.wait_until_assigned_state(target_state="secondary") assert node2.wait_until_state(target_state="wait_primary") time.sleep(5) assert not node2.get_state().assigned == "primary" def test_016_004_restart_node1(): node1.run() assert node3.wait_until_state(target_state="secondary") assert node2.wait_until_state(target_state="primary") assert node1.wait_until_state(target_state="secondary") # # When a node with candidate-priority zero (here, node3) fails while the # primary node (here, node2) is already in wait_primary, the non-candidate # node (here, node3) should still be assigned catchingup. # def test_017_001_fail_node1(): node1.fail() assert node2.wait_until_state(target_state="wait_primary") def test_017_002_fail_node3(): node3.fail() assert node3.wait_until_assigned_state(target_state="catchingup") def test_017_003_restart_nodes(): node3.run() node1.run() assert node3.wait_until_state(target_state="secondary") assert node2.wait_until_state(target_state="primary") assert node1.wait_until_state(target_state="secondary") pg_auto_failover-1.6.3/tests/test_multi_ifdown.py000066400000000000000000000211461414244367200223340ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover from nose.tools import raises, eq_ import time import os.path cluster = None monitor = None node1 = None node2 = None node3 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_000_create_monitor(): global monitor monitor = cluster.create_monitor("/tmp/multi_ifdown/monitor") monitor.run() monitor.wait_until_pg_is_running() def test_001_init_primary(): global node1 node1 = cluster.create_datanode("/tmp/multi_ifdown/node1") node1.create() 
node1.run() assert node1.wait_until_state(target_state="single") def test_002_add_standby(): global node2 node2 = cluster.create_datanode("/tmp/multi_ifdown/node2") node2.create() node2.run() assert node2.wait_until_pg_is_running() assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() # make sure we reached primary on node1 before next tests assert node1.wait_until_state(target_state="primary") def test_003_add_standby(): global node3 node3 = cluster.create_datanode("/tmp/multi_ifdown/node3") node3.create() node3.run() assert node3.wait_until_state(target_state="secondary") assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() assert node3.has_needed_replication_slots() # the formation number_sync_standbys is expected to be set to 1 now assert node1.get_number_sync_standbys() == 1 # make sure we reached primary on node1 before next tests assert node1.wait_until_state(target_state="primary") def test_004_write_into_primary(): node1.run_sql_query("CREATE TABLE t1(a int)") node1.run_sql_query("INSERT INTO t1 VALUES (1), (2), (3), (4)") node1.run_sql_query("CHECKPOINT") results = node1.run_sql_query("SELECT * FROM t1") assert results == [(1,), (2,), (3,), (4,)] def test_005_set_candidate_priorities(): print() assert node1.wait_until_state(target_state="primary") # set priorities in a way that we know the candidate: node3 node1.set_candidate_priority(90) # current primary node2.set_candidate_priority(0) # not a candidate anymore node3.set_candidate_priority(90) # when we set candidate priority we go to apply_settings then primary print() assert node1.wait_until_state(target_state="primary") assert node2.wait_until_state(target_state="secondary") assert 
node3.wait_until_state(target_state="secondary") # node1 should still be "sync" assert node1.get_number_sync_standbys() == 1 assert node2.get_replication_quorum() # also let's see synchronous_standby_names here # remember to sort by candidate priority then name ssn = "ANY 1 (pgautofailover_standby_3, pgautofailover_standby_2)" node1.check_synchronous_standby_names(ssn) def test_006_ifdown_node3(): node3.ifdown() def test_007_insert_rows(): node1.run_sql_query( "INSERT INTO t1 SELECT x+10 FROM generate_series(1, 10000) as gs(x)" ) node1.run_sql_query("CHECKPOINT") lsn1 = node1.run_sql_query("select pg_current_wal_lsn()")[0][0] print("%s " % lsn1, end="", flush=True) # node2 is sync and should get the WAL lsn2 = node2.run_sql_query("select pg_last_wal_receive_lsn()")[0][0] print("%s " % lsn2, end="", flush=True) while lsn2 != lsn1: time.sleep(1) lsn2 = node2.run_sql_query("select pg_last_wal_receive_lsn()")[0][0] print("%s " % lsn2, end="", flush=True) eq_(lsn1, lsn2) def test_008_failover(): print() print("Injecting failure of node1") node1.fail() # have node2 re-join the network and hopefully reconnect etc print("Reconnecting node3 (ifconfig up)") node3.ifup() # now we should be able to continue with the failover, and fetch missing # WAL bits from node2 assert node3.wait_until_pg_is_running() assert node3.wait_until_state(target_state="wait_primary", timeout=120) assert node2.wait_until_state(target_state="secondary") # node 2 has candidate priority of 0, can still be used to reach primary assert node3.wait_until_state(target_state="primary") assert node3.has_needed_replication_slots() assert node2.has_needed_replication_slots() # when in wait_primary state we should not block writes when: assert node3.get_number_sync_standbys() == 1 ssn = "ANY 1 (pgautofailover_standby_1, pgautofailover_standby_2)" node3.check_synchronous_standby_names(ssn=ssn) def test_009_read_from_new_primary(): results = node3.run_sql_query("SELECT count(*) FROM t1") assert results == 
[(10004,)] def test_010_start_node1_again(): node1.run() assert node1.wait_until_state(target_state="secondary") assert node2.wait_until_state(target_state="secondary") assert node3.wait_until_state(target_state="primary") assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() assert node3.has_needed_replication_slots() # now that we're back to primary, check we have sync rep again ssn = "ANY 1 (pgautofailover_standby_1, pgautofailover_standby_2)" node3.check_synchronous_standby_names(ssn) # test_011_XXX, test_012_XXX, test_013_XXX, test_014_XXX and test_015_XXX # are meant to test the scenario when the most advanced secondary # becomes inaccessible at the same time when the primary is inaccessible def test_011_prepare_candidate_priorities(): # we are aiming to promote node2 assert node2.set_candidate_priority(100) # the next primary # other nodes are already candidates for primary, but with less # priority assert node1.get_candidate_priority() == 90 assert node3.get_candidate_priority() == 90 def test_012_prepare_replication_quorums(): # for the purpose of this test, we need one node # async, to allow that we should decrement the sync stanbys node3.set_number_sync_standbys(0) # to emulate one node is behind, it is easier to make it async # we want node2 to be behind others assert node2.set_replication_quorum("false") # others should be sync assert node1.get_replication_quorum() assert node3.get_replication_quorum() def test_013_secondary_gets_behind_primary(): # make sure that node2 gets behind of the primary node2.ifdown() # primary ingests some data node3.run_sql_query("INSERT INTO t1 VALUES (5), (6)") node3.run_sql_query("CHECKPOINT") # ensure that the healthy secondary gets the change results = node1.run_sql_query("SELECT count(*) FROM t1") assert results == [(10006,)] lsn1 = node1.run_sql_query("select pg_last_wal_receive_lsn()")[0][0] print("%s " % lsn1, end="", flush=True) # ensure the monitor received this lsn 
node1.pg_autoctl.sighup() # wake up from the 10s node_active delay time.sleep(1) q = "select reportedlsn from pgautofailover.node where nodeid = 1" lsn1m = monitor.run_sql_query(q)[0][0] print("%s " % lsn1m, end="", flush=True) retry = 0 while lsn1 != lsn1m and retry < 3: time.sleep(1) lsn1m = monitor.run_sql_query(q)[0][0] print("%s " % lsn1m, end="", flush=True) eq_(lsn1, lsn1m) def test_014_secondary_reports_lsn(): # make the primary and mostAdvanced secondary inaccessible # and the candidate for failover as accessible # which means that node2 will not be able to fetch wal # and blocked until the other secondary is up assert node1.wait_until_state(target_state="secondary") assert node3.wait_until_state(target_state="primary") node3.ifdown() # primary node1.ifdown() # most advanced standby node2.ifup() # failover candidate print() print("Calling pgautofailover.failover() on the monitor") monitor.failover() # node2 reports its LSN while others are inaccessible assert node2.wait_until_state(target_state="report_lsn") def test_015_finalize_failover_after_most_advanced_secondary_gets_back(): # when they are accessible again, both should become # secondaries node1.ifup() # old most advanced secondary, now secondary node3.ifup() # old primary, now secondary assert node1.wait_until_state(target_state="secondary") assert node3.wait_until_state(target_state="secondary") assert node2.wait_until_state(target_state="primary") results = node2.run_sql_query("SELECT count(*) FROM t1") eq_(results, [(10006,)]) pg_auto_failover-1.6.3/tests/test_multi_maintenance.py000066400000000000000000000302001414244367200233170ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover from nose.tools import raises, eq_ import time import os.path cluster = None monitor = None node1 = None node2 = None node3 = None node4 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_000_create_monitor(): global 
monitor monitor = cluster.create_monitor("/tmp/multi_maintenance/monitor") monitor.run() monitor.wait_until_pg_is_running() def test_001_init_primary(): global node1 node1 = cluster.create_datanode("/tmp/multi_maintenance/node1") node1.create() node1.run() assert node1.wait_until_state(target_state="single") def test_002_add_standby(): global node2 node2 = cluster.create_datanode("/tmp/multi_maintenance/node2") node2.create() node2.run() assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() # make sure we reached primary on node1 before next tests assert node1.wait_until_state(target_state="primary") node1.check_synchronous_standby_names( ssn="ANY 1 (pgautofailover_standby_2)" ) def test_003_add_standby(): global node3 node3 = cluster.create_datanode("/tmp/multi_maintenance/node3") node3.create() node3.run() assert node3.wait_until_state(target_state="secondary") assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() assert node3.has_needed_replication_slots() # the formation number_sync_standbys is expected to be set to 1 now eq_(node1.get_number_sync_standbys(), 1) ssn = "ANY 1 (pgautofailover_standby_2, pgautofailover_standby_3)" node1.check_synchronous_standby_names(ssn) # make sure we reached primary on node1 before next tests assert node1.wait_until_state(target_state="primary") def test_004_write_into_primary(): node1.run_sql_query("CREATE TABLE t1(a int)") node1.run_sql_query("INSERT INTO t1 VALUES (1), (2), (3), (4)") node1.run_sql_query("CHECKPOINT") results = node1.run_sql_query("SELECT * FROM t1") assert results == [(1,), (2,), (3,), (4,)] def test_005_set_candidate_priorities(): print() assert node1.wait_until_state(target_state="primary") # set priorities in a way that 
we know the candidate: node3 node1.set_candidate_priority(80) # current primary node2.set_candidate_priority(70) # remain secondary node3.set_candidate_priority(90) # favorite for failover # when we set candidate priority we go to apply_settings then primary print() assert node1.wait_until_state(target_state="primary") assert node2.wait_until_state(target_state="secondary") assert node3.wait_until_state(target_state="secondary") # node2 should still be "sync" eq_(node2.get_replication_quorum(), True) # other replication settings should still be the same as before eq_(node1.get_number_sync_standbys(), 1) ssn = "ANY 1 (pgautofailover_standby_3, pgautofailover_standby_2)" node1.check_synchronous_standby_names(ssn) def test_006a_maintenance_and_failover(): print() print("Enabling maintenance on node2") node2.enable_maintenance() assert node2.wait_until_state(target_state="maintenance") node2.stop_postgres() # assigned and goal state must be the same assert node1.wait_until_state(target_state="primary") # ssn is not changed during maintenance operations ssn = "ANY 1 (pgautofailover_standby_3, pgautofailover_standby_2)" eq_(node1.get_synchronous_standby_names(), ssn) eq_(node1.get_synchronous_standby_names_local(), ssn) print("Calling pgautofailover.failover() on the monitor") monitor.failover() assert node3.wait_until_state(target_state="primary") assert node1.wait_until_state(target_state="secondary") # now that node3 is primary, synchronous_standby_names has changed ssn = "ANY 1 (pgautofailover_standby_1, pgautofailover_standby_2)" node3.check_synchronous_standby_names(ssn) print("Disabling maintenance on node2, should connect to the new primary") node2.disable_maintenance() # allow manual checking of primary_conninfo primary_conninfo_ipaddr = str(node3.vnode.address) print("current primary is node3 at %s" % primary_conninfo_ipaddr) if node2.pgmajor() < 12: fn = "recovery.conf" else: fn = "postgresql-auto-failover-standby.conf" fn = os.path.join(node2.datadir, fn) 
conf = open(fn).read() if primary_conninfo_ipaddr not in conf: raise Exception( "Primary ip address %s not found in %s:\n%s" % (primary_conninfo_ipaddr, fn, conf) ) assert node1.wait_until_state(target_state="secondary") assert node2.wait_until_state(target_state="secondary") assert node3.wait_until_state(target_state="primary") # ssn is not changed during maintenance operations node3.check_synchronous_standby_names(ssn) assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() assert node3.has_needed_replication_slots() def test_006b_read_from_new_primary(): results = node3.run_sql_query("SELECT * FROM t1") assert results == [(1,), (2,), (3,), (4,)] def test_007a_node1_to_maintenance(): print() assert node3.wait_until_state(target_state="primary") print("Enabling maintenance on node1") node1.enable_maintenance() assert node3.wait_until_state(target_state="primary") def test_007b_node2_to_maintenance(): # node3 is the current primary assert node3.get_number_sync_standbys() == 1 print("Enabling maintenance on node2") node2.enable_maintenance() assert node3.wait_until_state(target_state="primary") # when both secondaries are put to maintenance, writes are blocked on # the primary ssn = "ANY 1 (pgautofailover_standby_1, pgautofailover_standby_2)" node3.check_synchronous_standby_names(ssn) def test_008a_stop_primary(): # node3 is the current primary assert node3.get_state().assigned == "primary" node3.fail() # check that even after 30s node3 is still not set to draining node3.sleep(30) assert not node3.get_state().assigned == "draining" assert node3.get_state().assigned == "primary" def test_008b_start_primary(): node3.run() assert node3.wait_until_state(target_state="primary") @raises(Exception) def test_009a_enable_maintenance_on_primary_should_fail(): node3.enable_maintenance(allowFailover=True) def test_009b_disable_maintenance(): print("Disabling maintenance on node1 and node2") node1.disable_maintenance() node2.disable_maintenance() 
assert node1.wait_until_state(target_state="secondary") assert node2.wait_until_state(target_state="secondary") assert node3.wait_until_state(target_state="primary") def test_010_set_number_sync_standby_to_zero(): assert node3.set_number_sync_standbys(0) eq_(node3.get_number_sync_standbys(), 0) def test_011_all_to_maintenance(): print() assert node3.wait_until_state(target_state="primary") print("Enabling maintenance on node1") node1.enable_maintenance() assert node3.wait_until_state(target_state="primary") # now we can, because we don't care about having any standbys print("Enabling maintenance on node2") node2.enable_maintenance() assert node3.wait_until_state(target_state="wait_primary") # also let's see synchronous_standby_names here node3.check_synchronous_standby_names(ssn="") def test_012_can_write_during_maintenance(): node3.run_sql_query("INSERT INTO t1 VALUES (5), (6)") node3.run_sql_query("CHECKPOINT") def test_013_add_standby(): global node4 node4 = cluster.create_datanode("/tmp/multi_maintenance/node4") node4.create() node4.run() assert node4.wait_until_state(target_state="secondary") assert node3.wait_until_state(target_state="primary") assert node2.wait_until_state(target_state="maintenance") assert node1.wait_until_state(target_state="maintenance") assert node3.has_needed_replication_slots() assert node4.has_needed_replication_slots() # the formation number_sync_standbys is expected to not be changed eq_(node3.get_number_sync_standbys(), 0) # make sure we reached primary on node1 before next tests assert node3.wait_until_state(target_state="primary") def test_014_disable_maintenance(): print() # make sure node2 is still in maintenance, then disable maintenance print("Disabling maintenance on node2") assert node2.wait_until_state(target_state="maintenance") node2.disable_maintenance() assert node3.wait_until_state(target_state="primary") print("Disabling maintenance on node1") # make sure node1 is still in maintenance, then disable maintenance assert 
node1.wait_until_state(target_state="maintenance") node1.disable_maintenance() assert node3.wait_until_state(target_state="primary") # also let's see synchronous_standby_names here print("Monitor: %s" % node3.get_synchronous_standby_names()) print( "Node 3: %s" % node3.run_sql_query("show synchronous_standby_names")[0][0] ) def test_015_set_number_sync_standby_to_one(): node3.set_number_sync_standbys(1) eq_(node3.get_number_sync_standbys(), 1) assert node3.wait_until_state(target_state="primary") def test_016_two_standbys_in_maintenance(): print() print("Enabling maintenance on node1") node1.enable_maintenance() assert node3.wait_until_state(target_state="primary") print("Enabling maintenance on node2") node2.enable_maintenance() assert node3.wait_until_state(target_state="primary") @raises(Exception) def test_017_primary_to_maintenance(): # this should fail, because we have 4 nodes and number_sync_standbys = 1 # and 2 nodes are already in maintenance # # if we allowed a 3rd node to be in maintenance, we would have no # standby node left and 0 < 1 print() print("Enabling maintenance on node3 (primary)") node3.enable_maintenance() def test_018_disable_maintenance(): print() print("Disabling maintenance on node2") assert node2.wait_until_state(target_state="maintenance") node2.disable_maintenance() assert node3.wait_until_state(target_state="primary") # make sure node1 is still in maintenance, then disable maintenance print("Disabling maintenance on node1") assert node1.wait_until_state(target_state="maintenance") node1.disable_maintenance() assert node3.wait_until_state(target_state="primary") def test_019_set_priorities(): # set priorities in a way that we know the candidate: node1 node1.set_candidate_priority(90) node2.set_candidate_priority(70) node3.set_candidate_priority(70) # current primary node4.set_candidate_priority(70) # when we set candidate priority we go to apply_settings then primary print() assert node1.wait_until_state(target_state="secondary") 
assert node2.wait_until_state(target_state="secondary") assert node3.wait_until_state(target_state="primary") assert node4.wait_until_state(target_state="secondary") def test_020_primary_to_maintenance(): print() assert node3.wait_until_state(target_state="primary") print("Enabling maintenance on node3, allowing failover") node3.enable_maintenance(allowFailover=True) assert node3.wait_until_state(target_state="maintenance") assert node2.wait_until_state(target_state="secondary") assert node4.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") def test_021_stop_maintenance(): print() print("Disabling maintenance on node3") node3.disable_maintenance() assert node3.wait_until_pg_is_running() assert node3.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") assert node2.wait_until_state(target_state="secondary") assert node4.wait_until_state(target_state="secondary") pg_auto_failover-1.6.3/tests/test_multi_standbys.py000066400000000000000000000306561414244367200227030ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover from nose.tools import raises, eq_ import time import os.path cluster = None monitor = None node1 = None node2 = None node3 = None node4 = None node5 = None node6 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_000_create_monitor(): global monitor monitor = cluster.create_monitor("/tmp/multi_standby/monitor") monitor.run() monitor.wait_until_pg_is_running() def test_001_init_primary(): global node1 node1 = cluster.create_datanode("/tmp/multi_standby/node1") node1.create() node1.run() assert node1.wait_until_state(target_state="single") def test_002_candidate_priority(): assert node1.get_candidate_priority() == 50 assert not node1.set_candidate_priority(-1) assert node1.get_candidate_priority() == 50 assert node1.set_candidate_priority(99) assert 
node1.get_candidate_priority() == 99 def test_003_replication_quorum(): assert node1.get_replication_quorum() assert not node1.set_replication_quorum("wrong quorum") assert node1.get_replication_quorum() assert node1.set_replication_quorum("false") assert not node1.get_replication_quorum() assert node1.set_replication_quorum("true") assert node1.get_replication_quorum() def test_004_001_add_three_standbys(): # the next test wants to set number_sync_standbys to 2 # so we need at least 3 standbys to allow that global node2 node2 = cluster.create_datanode("/tmp/multi_standby/node2") node2.create() node2.run() assert node2.wait_until_state(target_state="secondary") assert node2.wait_until_pg_is_running() assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() # with one standby, we have number_sync_standbys set to 0 still assert node1.get_number_sync_standbys() == 0 def test_004_002_add_three_standbys(): global node3 # refrain from waiting for the primary to be ready, to trigger a race # condition that could segfault the monitor (if the code was less # careful than it is now) # assert node1.wait_until_state(target_state="primary") node3 = cluster.create_datanode("/tmp/multi_standby/node3") node3.create() node3.run() assert node3.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") assert node3.wait_until_pg_is_running() assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() assert node3.has_needed_replication_slots() # the formation number_sync_standbys is expected to be set to 1 now assert node1.get_number_sync_standbys() == 1 def test_004_003_add_three_standbys(): global node4 node4 = cluster.create_datanode("/tmp/multi_standby/node4") node4.create() node4.run() assert node4.wait_until_state(target_state="secondary") # make sure we reached primary on node1 before next tests assert node1.wait_until_state(target_state="primary") assert 
node4.wait_until_pg_is_running() assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() assert node3.has_needed_replication_slots() assert node4.has_needed_replication_slots() def test_005_number_sync_standbys(): print() assert node1.get_number_sync_standbys() == 1 node1.set_number_sync_standbys(-1) assert node1.get_number_sync_standbys() == 1 print("set number_sync_standbys = 2") assert node1.set_number_sync_standbys(2) assert node1.get_number_sync_standbys() == 2 ssn = "ANY 2 (pgautofailover_standby_2, pgautofailover_standby_3, pgautofailover_standby_4)" node1.check_synchronous_standby_names(ssn) print("set number_sync_standbys = 0") assert node1.set_number_sync_standbys(0) assert node1.get_number_sync_standbys() == 0 ssn = "ANY 1 (pgautofailover_standby_2, pgautofailover_standby_3, pgautofailover_standby_4)" node1.check_synchronous_standby_names(ssn) print("set number_sync_standbys = 1") assert node1.set_number_sync_standbys(1) assert node1.get_number_sync_standbys() == 1 # same ssn as before eq_(node1.get_synchronous_standby_names(), ssn) eq_(node1.get_synchronous_standby_names_local(), ssn) def test_006_number_sync_standbys_trigger(): assert node1.set_number_sync_standbys(2) assert node1.get_number_sync_standbys() == 2 node4.drop() assert node1.wait_until_state(target_state="primary") ssn = "ANY 1 (pgautofailover_standby_2, pgautofailover_standby_3)" node1.check_synchronous_standby_names(ssn) # there's no state change to instruct us that the replication slot # maintenance is now done, so we have to wait for awhile instead. 
node1.pg_autoctl.sighup() # wake up from the 10s node_active delay node2.pg_autoctl.sighup() # wake up from the 10s node_active delay node3.pg_autoctl.sighup() # wake up from the 10s node_active delay time.sleep(6) assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() assert node3.has_needed_replication_slots() def test_007_create_t1(): node1.run_sql_query("CREATE TABLE t1(a int)") node1.run_sql_query("INSERT INTO t1 VALUES (1), (2)") node1.run_sql_query("CHECKPOINT") def test_008_set_candidate_priorities(): # set priorities in a way that we know the candidate: node2 node1.set_candidate_priority(90) # current primary node2.set_candidate_priority(90) node3.set_candidate_priority(70) print() assert node1.wait_until_state(target_state="primary") def test_009_failover(): print() print("Calling pgautofailover.failover() on the monitor") monitor.failover() assert node2.wait_until_state(target_state="primary") assert node3.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="secondary") ssn = "ANY 1 (pgautofailover_standby_1, pgautofailover_standby_3)" node2.check_synchronous_standby_names(ssn) assert node1.has_needed_replication_slots() assert node2.has_needed_replication_slots() assert node3.has_needed_replication_slots() def test_010_read_from_nodes(): assert node1.run_sql_query("SELECT * FROM t1") == [(1,), (2,)] assert node2.run_sql_query("SELECT * FROM t1") == [(1,), (2,)] assert node3.run_sql_query("SELECT * FROM t1") == [(1,), (2,)] def test_011_write_into_new_primary(): node2.run_sql_query("INSERT INTO t1 VALUES (3), (4)") results = node2.run_sql_query("SELECT * FROM t1") assert results == [(1,), (2,), (3,), (4,)] # generate more WAL trafic for replication node2.run_sql_query("CHECKPOINT") def test_012_fail_primary(): print() print("Failing current primary node 2") node2.fail() # explicitely allow for the 30s timeout in stop_replication assert 
node1.wait_until_state(target_state="stop_replication") assert node1.wait_until_state(target_state="primary") assert node3.wait_until_state(target_state="secondary") ssn = "ANY 1 (pgautofailover_standby_2, pgautofailover_standby_3)" node1.check_synchronous_standby_names(ssn) def test_013_restart_node2(): node2.run() assert node1.wait_until_state(target_state="primary") assert node2.wait_until_state(target_state="secondary") assert node3.wait_until_state(target_state="secondary") ssn = "ANY 1 (pgautofailover_standby_2, pgautofailover_standby_3)" node1.check_synchronous_standby_names(ssn) # # When the two standby nodes are lost and then assigned catchingup, and the # primary is now blocking writes, we can set number-sync-standbys to 0 to # unblock writes, causing the primary to reach wait_primary # def test_014_001_fail_set_properties(): eq_(node1.get_number_sync_standbys(), 1) node1.set_candidate_priority(50) node2.set_candidate_priority(50) node3.set_candidate_priority(50) node1.wait_until_state(target_state="primary") assert node1.get_replication_quorum() assert node2.get_replication_quorum() assert node3.get_replication_quorum() def test_014_002_fail_two_standby_nodes(): node2.fail() node3.fail() node2.wait_until_assigned_state(target_state="catchingup") node3.wait_until_assigned_state(target_state="catchingup") # node1 remains a primary, blocking writes, at this stage node1.wait_until_state(target_state="primary") ssn = "ANY 1 (pgautofailover_standby_2, pgautofailover_standby_3)" node1.check_synchronous_standby_names(ssn) def test_014_003_unblock_writes(): node1.set_number_sync_standbys(0) # node1 unblocks writes because number-sync-standbys is now zero node1.wait_until_state(target_state="wait_primary") eq_(node1.get_number_sync_standbys(), 0) node1.check_synchronous_standby_names(ssn="") def test_014_004_restart_nodes(): node3.run() node2.run() assert node3.wait_until_state(target_state="secondary") assert node2.wait_until_state(target_state="secondary") 
assert node1.wait_until_state(target_state="primary") ssn = "ANY 1 (pgautofailover_standby_2, pgautofailover_standby_3)" node1.check_synchronous_standby_names(ssn) # # Now if number-sync-standbys is zero already, then when we lose all the # standby nodes the primary is assigned wait_primary to unblock writes # def test_015_001_set_properties(): node1.wait_until_state(target_state="primary") eq_(node1.get_number_sync_standbys(), 0) def test_015_002_fail_two_standby_nodes(): node2.fail() node3.fail() # node 1 is assigned wait_primary as soon as we lose all the candidates node1.wait_until_state(target_state="wait_primary") ssn = "" eq_(node1.get_synchronous_standby_names(), ssn) eq_(node1.get_synchronous_standby_names_local(), ssn) def test_015_003_set_properties(): # stop the data leak by re-implementing sync rep # # the primary is now expected to be in apply_settings/primary, and fail # to reach primary, which causes the set number-sync-standbys command to # fail. # # instead of using the command line (which waits for 60s and fail), # let's use the monitor SQL API instead q = "select pgautofailover.set_formation_number_sync_standbys('default', 1)" monitor.run_sql_query(q) assert node1.wait_until_assigned_state(target_state="primary") eq_(node1.get_number_sync_standbys(), 1) ssn = "ANY 1 (pgautofailover_standby_2, pgautofailover_standby_3)" node1.check_synchronous_standby_names(ssn) def test_015_004_restart_nodes(): node3.run() node2.run() assert node3.wait_until_state(target_state="secondary") assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") ssn = "ANY 1 (pgautofailover_standby_2, pgautofailover_standby_3)" node1.check_synchronous_standby_names(ssn) # # Now test a failover when all the nodes have candidate priority set to zero # def test_016_001_set_candidate_priorities_to_zero(): node1.set_candidate_priority(0) node2.set_candidate_priority(0) node3.set_candidate_priority(0) # no candidate for failover, 
we're wait_primary node1.wait_until_state(target_state="primary") def test_016_002_trigger_failover(): print() print("Calling pgautofailover.failover() on the monitor") monitor.failover() assert node3.wait_until_state(target_state="report_lsn") assert node2.wait_until_state(target_state="report_lsn") assert node1.wait_until_state(target_state="report_lsn") def test_016_003_set_candidate_priority_to_one(): node2.set_candidate_priority(1) # no candidate for failover, we're wait_primary node2.wait_until_state(target_state="primary") node1.wait_until_state(target_state="secondary") node3.wait_until_state(target_state="secondary") def test_016_004_reset_candidate_priority(): node2.set_candidate_priority(0) node2.wait_until_state(target_state="primary") node1.wait_until_state(target_state="secondary") node3.wait_until_state(target_state="secondary") def test_016_005_perform_promotion(): print() print("Calling pg_autoctl perform promotion on node 1") node1.perform_promotion() node1.wait_until_state(target_state="primary") node2.wait_until_state(target_state="secondary") node3.wait_until_state(target_state="secondary") def test_017_remove_old_primary(): node2.drop() assert node1.wait_until_state(target_state="primary") assert node3.wait_until_state(target_state="secondary") pg_auto_failover-1.6.3/tests/test_replace_monitor.py000066400000000000000000000052331414244367200230150ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover import time from nose.tools import eq_ cluster = None monitor = None node1 = None node2 = None newmonitor = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_000_create_monitor(): global monitor monitor = cluster.create_monitor("/tmp/replace/monitor") monitor.run() def test_001_init_primary(): global node1 node1 = cluster.create_datanode("/tmp/replace/node1") node1.create(run=True) assert node1.wait_until_state(target_state="single") def 
test_002_create_t1(): node1.run_sql_query("CREATE TABLE t1(a int)") node1.run_sql_query("INSERT INTO t1 VALUES (1), (2)") def test_003_init_secondary(): global node2 node2 = cluster.create_datanode("/tmp/replace/node2") node2.create(run=True) assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") def test_004_read_from_secondary(): results = node2.run_sql_query("SELECT * FROM t1") assert results == [(1,), (2,)] def test_005_drop_monitor(): monitor.destroy() cluster.monitor = None def test_006a_disable_monitor_node2(): node2.disable_monitor() def test_006b_disable_monitor_node1(): node1.disable_monitor() def test_006c_write_to_primary(): node1.run_sql_query("INSERT INTO t1 VALUES (3)") def test_007_read_from_secondary(): results = node2.run_sql_query("SELECT * FROM t1 ORDER BY a") assert results == [(1,), (2,), (3,)] def test_008_create_new_monitor(): global newmonitor newmonitor = cluster.create_monitor("/tmp/replace/newmonitor") newmonitor.run() def test_009a_enable_monitor_node1(): node1.enable_monitor(newmonitor) assert node1.wait_until_state(target_state="single") def test_009b_enable_monitor_node2(): node2.enable_monitor(newmonitor) assert node2.wait_until_state(target_state="catchingup") assert node1.wait_until_state(target_state="wait_primary") def test_010_wait_until_state(): assert node1.wait_until_state(target_state="primary") assert node2.wait_until_state(target_state="secondary") def test_011_failover(): print() print("Calling pgautofailover.failover() on the monitor") newmonitor.failover() assert node2.wait_until_state(target_state="primary") eq_( node2.get_synchronous_standby_names_local(), "ANY 1 (pgautofailover_standby_1)", ) assert node1.wait_until_state(target_state="secondary") def test_012_read_from_secondary(): results = node1.run_sql_query("SELECT * FROM t1 ORDER BY a") assert results == [(1,), (2,), (3,)] 
pg_auto_failover-1.6.3/tests/test_skip_pg_hba.py000066400000000000000000000077131414244367200221060ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover from nose.tools import eq_ import subprocess import os, os.path, time, shutil cluster = None node1 = None node2 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_000_create_monitor(): monitor = cluster.create_monitor("/tmp/skip/monitor", authMethod="skip") monitor.run() monitor.wait_until_pg_is_running() with open(os.path.join("/tmp/skip/monitor", "pg_hba.conf"), "a") as hba: hba.write("host all all %s trust\n" % cluster.networkSubnet) monitor.reload_postgres() def test_001_init_primary(): global node1 print() node1_path = cluster.pg_createcluster("node1") # make a copy of the debian's HBA file hba_path = os.path.join( "/etc", "/".join(node1_path.split("/")[3:]), "pg_hba.conf" ) shutil.copyfile(hba_path, "/tmp/pg_hba.debian.conf") # allow using unix domain sockets pgautofailover.sudo_mkdir_p("/tmp/socks/node1") os.environ["PG_REGRESS_SOCK_DIR"] = "/tmp/socks/node1" # we need to give the hostname here, because our method to find it # automatically will fail in the test environment node1 = cluster.create_datanode(node1_path, authMethod="skip") node1.create(level="-vvv") # # Check that we didn't edit the HBA file, thanks to --skip-pg-hba, here # in the test file spelled the strange way --auth skip. 
# p = subprocess.run( [ "diff", "/tmp/pg_hba.debian.conf", os.path.join(node1_path, "pg_hba.conf"), ], text=True, capture_output=True, ) print("diff %s" % " ".join(p.args)) if p.returncode != 0: print("%s" % p.stdout) assert p.returncode == 0 with open(os.path.join(node1_path, "pg_hba.conf"), "a") as hba: # node1.run_sql_query will need # host "172.27.1.1", user "docker", database "postgres" hba.write("host postgres docker %s trust\n" % cluster.networkSubnet) hba.write("host all all %s trust\n" % cluster.networkSubnet) hba.write("host replication all %s trust\n" % cluster.networkSubnet) node1.reload_postgres() node1.run() assert node1.wait_until_state(target_state="single") node1.wait_until_pg_is_running() def test_002_create_t1(): node1.run_sql_query("CREATE TABLE t1(a int)") node1.run_sql_query("INSERT INTO t1 VALUES (1), (2)") def test_003_init_secondary(): global node2 # allow using unix domain sockets pgautofailover.sudo_mkdir_p("/tmp/socks/node2") os.environ["PG_REGRESS_SOCK_DIR"] = "/tmp/socks/node2" node2 = cluster.create_datanode("/tmp/skip/node2", authMethod="skip") node2.create() with open(os.path.join("/tmp/skip/node2", "pg_hba.conf"), "a") as hba: hba.write("host all all %s trust\n" % cluster.networkSubnet) hba.write("host replication all %s trust\n" % cluster.networkSubnet) node2.reload_postgres() node2.run() assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") def test_004a_hba_have_not_been_edited(): eq_(False, node1.editedHBA()) eq_(False, node2.editedHBA()) def test_004b_pg_autoctl_conf_skip_hba(): eq_("skip", node1.config_get("postgresql.hba_level")) eq_("skip", node2.config_get("postgresql.hba_level")) def test_005_failover(): print() print("Calling pgautofailover.failover() on the monitor") cluster.monitor.failover() assert node2.wait_until_state(target_state="primary") assert node1.wait_until_state(target_state="secondary") def test_006_restart_secondary(): node1.stop_pg_autoctl() 
node1.run() def test_006a_hba_have_not_been_edited(): eq_(False, node1.editedHBA()) eq_(False, node2.editedHBA()) def test_006b_pg_autoctl_conf_skip_hba(): eq_("skip", node1.config_get("postgresql.hba_level")) eq_("skip", node2.config_get("postgresql.hba_level")) pg_auto_failover-1.6.3/tests/test_ssl_cert.py000066400000000000000000000174721414244367200214610ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover import ssl_cert_utils as cert from nose.tools import * import subprocess import os, os.path, time, shutil cluster = None node1 = None node2 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() client_top_directory = os.path.join(os.getenv("HOME"), ".postgresql") cluster.create_root_cert( client_top_directory, basename="root", CN="/CN=root.pgautofailover.ca" ) def teardown_module(): cluster.destroy() # remove client side setup for certificates too client_top_directory = os.path.join(os.getenv("HOME"), ".postgresql") p = subprocess.Popen( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + os.getenv("PATH"), "rm", "-rf", client_top_directory, ] ) assert p.wait() == 0 # also remove certificates we created for the servers p = subprocess.run( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + os.getenv("PATH"), "rm", "-rf", "/tmp/certs", ] ) assert p.returncode == 0 def test_000_create_monitor(): # create SSL certs and keys for this server # # https://www.postgresql.org/docs/11/ssl-tcp.html # # server.crt and server.key should be stored on the server, and root.crt # should be stored on the client so the client can verify that the # server's leaf certificate was signed by its trusted root certificate. # root.key should be stored offline for use in creating future # certificates. 
# # https://www.postgresql.org/docs/current/libpq-ssl.html # # If the server attempts to verify the identity of the client by # requesting the client's leaf certificate, libpq will send the # certificates stored in file ~/.postgresql/postgresql.crt in the user's # home directory # Now, create a server certificate signed by the new root certificate # authority client_top_directory = os.path.join(os.getenv("HOME"), ".postgresql") # now create and sign the CLIENT certificate clientCert = cert.SSLCert( client_top_directory, basename="postgresql", CN="/CN=autoctl_node" ) clientCert.create_signed_certificate(cluster.cert) # now create and sign the SERVER certificate for the monitor serverCert = cert.SSLCert( "/tmp/certs/monitor", "server", "/CN=monitor.pgautofailover.ca" ) serverCert.create_signed_certificate(cluster.cert) p = subprocess.run( [ "ls", "-ld", client_top_directory, cluster.cert.crt, cluster.cert.csr, cluster.cert.key, clientCert.crt, clientCert.csr, clientCert.key, serverCert.crt, serverCert.csr, serverCert.key, ], text=True, capture_output=True, ) print("%s" % p.stdout) # the root user also needs the certificates, tests are connecting with it subprocess.run(["ln", "-s", client_top_directory, "/root/.postgresql"]) assert p.returncode == 0 # # Now create the monitor Postgres instance with the certificates # monitor = cluster.create_monitor( "/tmp/cert/monitor", authMethod="skip", sslMode="verify-ca", sslCAFile=cluster.cert.crt, sslServerKey=serverCert.key, sslServerCert=serverCert.crt, ) monitor.run() monitor.wait_until_pg_is_running() with open(os.path.join("/tmp/cert/monitor", "pg_hba.conf"), "a") as hba: hba.write("hostssl all all %s cert\n" % cluster.networkSubnet) monitor.reload_postgres() # check the SSL settings cmd = [ "openssl", "s_client", "-starttls", "postgres", "-connect", "172.27.1.2:5432", "-showcerts", "-CAfile", cluster.cert.crt, ] print(" ".join(cmd)) p = subprocess.run( [ "sudo", "-E", "-u", os.getenv("USER"), "env", "PATH=" + 
os.getenv("PATH"), ] + cmd, input="", text=True, capture_output=True, ) if p.returncode != 0: print("" % p.stdout) print("" % p.stderr) assert p.returncode == 0 # print connection string print("monitor: %s" % monitor.connection_string()) monitor.check_ssl("on", "verify-ca") def test_001_init_primary(): global node1 # Create a server certificate signed by the root Certificate Authority certs_dir = "/tmp/certs/node1" serverCert = cert.SSLCert( "/tmp/certs/node1", "server", "/CN=node1.pgautofailover.ca" ) serverCert.create_signed_certificate(cluster.cert) # Now create the server with the certificates node1 = cluster.create_datanode( "/tmp/cert/node1", authMethod="skip", sslMode="verify-ca", sslCAFile=cluster.cert.crt, sslServerKey=serverCert.key, sslServerCert=serverCert.crt, ) node1.create(level="-vv") with open(os.path.join("/tmp/cert/node1", "pg_hba.conf"), "a") as hba: # node1.run_sql_query will need # host "172.27.1.1", user "docker", database "postgres" hba.write("hostssl postgres docker %s cert\n" % cluster.networkSubnet) hba.write("hostssl all all %s cert\n" % cluster.networkSubnet) hba.write( "hostssl replication all %s cert map=pgautofailover\n" % cluster.networkSubnet ) with open(os.path.join("/tmp/cert/node1", "pg_ident.conf"), "a") as ident: # use an ident map to allow using the same cert for replication ident.write("pgautofailover autoctl_node pgautofailover_replicator\n") node1.reload_postgres() node1.run() assert node1.wait_until_state(target_state="single") node1.wait_until_pg_is_running() node1.check_ssl("on", "verify-ca", primary=True) def test_002_create_t1(): print() print(node1.connection_string()) node1.run_sql_query("CREATE TABLE t1(a int)") node1.run_sql_query("INSERT INTO t1 VALUES (1), (2)") def test_003_init_secondary(): global node2 # Create a server certificate signed by the root Certificate Authority certs_dir = "/tmp/certs/node2" serverCert = cert.SSLCert( "/tmp/certs/node2", "server", "/CN=node2.pgautofailover.ca" ) 
serverCert.create_signed_certificate(cluster.cert) # Now create the server with the certificates node2 = cluster.create_datanode( "/tmp/cert/node2", authMethod="skip", sslMode="verify-ca", sslCAFile=cluster.cert.crt, sslServerKey=serverCert.key, sslServerCert=serverCert.crt, ) node2.create(level="-vv") with open(os.path.join("/tmp/cert/node2", "pg_hba.conf"), "a") as hba: hba.write("hostssl all all %s cert\n" % cluster.networkSubnet) hba.write( "hostssl replication all %s cert map=pgautofailover\n" % cluster.networkSubnet ) with open(os.path.join("/tmp/cert/node1", "pg_ident.conf"), "a") as ident: # use an ident map to allow using the same cert for replication ident.write("pgautofailover autoctl_node pgautofailover_replicator\n") node2.reload_postgres() node2.run() assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") node2.wait_until_pg_is_running() node2.check_ssl("on", "verify-ca") def test_004_failover(): print() print("Calling pgautofailover.failover() on the monitor") cluster.monitor.failover() assert node2.wait_until_state(target_state="primary") assert node1.wait_until_state(target_state="secondary") pg_auto_failover-1.6.3/tests/test_ssl_self_signed.py000066400000000000000000000031061414244367200227730ustar00rootroot00000000000000import pgautofailover_utils as pgautofailover from nose.tools import eq_ import os.path cluster = None node1 = None node2 = None def setup_module(): global cluster cluster = pgautofailover.Cluster() def teardown_module(): cluster.destroy() def test_000_create_monitor(): monitor = cluster.create_monitor( "/tmp/ssl-self-signed/monitor", sslSelfSigned=True ) monitor.run() monitor.wait_until_pg_is_running() monitor.check_ssl("on", "require") def test_001_init_primary(): global node1 node1 = cluster.create_datanode( "/tmp/ssl-self-signed/node1", sslSelfSigned=True ) node1.create() node1.run() assert node1.wait_until_state(target_state="single") node1.wait_until_pg_is_running() 
node1.check_ssl("on", "require", primary=True) def test_002_create_t1(): node1.run_sql_query("CREATE TABLE t1(a int)") node1.run_sql_query("INSERT INTO t1 VALUES (1), (2)") def test_003_init_secondary(): global node2 node2 = cluster.create_datanode( "/tmp/ssl-self-signed/node2", sslSelfSigned=True, sslMode="require" ) node2.create() node2.run() assert node2.wait_until_state(target_state="secondary") assert node1.wait_until_state(target_state="primary") node2.wait_until_pg_is_running() node2.check_ssl("on", "require") def test_004_failover(): print() print("Calling pgautofailover.failover() on the monitor") cluster.monitor.failover() assert node2.wait_until_state(target_state="primary") assert node1.wait_until_state(target_state="secondary") pg_auto_failover-1.6.3/tests/upgrade/000077500000000000000000000000001414244367200176465ustar00rootroot00000000000000pg_auto_failover-1.6.3/tests/upgrade/Makefile000066400000000000000000000024221414244367200213060ustar00rootroot00000000000000NODES ?= 3 PATCH = tests/upgrade/monitor-upgrade-1.7.patch Q_VERSION = select default_version, installed_version Q_VERSION += from pg_available_extensions where name = 'pgautofailover' build: docker-compose build patch: cd ../.. && git apply $(PATCH) clean: cd ../.. 
&& git apply --reverse $(PATCH) up: create-volumes compose-up tail ; down: compose-down rm-volumes ; compose-down: docker-compose down --volumes --remove-orphans compose-up: docker-compose up -d tail: docker-compose logs -f create-volumes: for v in volm vol1 vol2 vol3; do docker volume create $$v; done rm-volumes: for v in volm vol1 vol2 vol3; do docker volume rm $$v; done upgrade-monitor: patch docker-compose up -d --no-deps --build monitor upgrade-nodes: docker-compose up -d --no-deps --build node3 node2 docker-compose up -d --no-deps --build node1 state: docker-compose exec monitor pg_autoctl show state version: docker-compose exec monitor pg_autoctl version docker-compose exec monitor psql -d pg_auto_failover -c "$(Q_VERSION)" failover: docker-compose exec monitor pg_autoctl perform failover watch: docker-compose exec monitor watch -n 0.2 pg_autoctl show state .PHONY: build patch clean up down upgrade-monitor state watch .PHONY: compose-down compose-up create-volumes rm-volumes pg_auto_failover-1.6.3/tests/upgrade/README.md000066400000000000000000000036541414244367200211350ustar00rootroot00000000000000# Testing monitor upgrades This directory contains some docker-compose based tooling to manually test monitor upgrades. The tooling is built around the idea that we want to test what happens at upgrade from the code in the local branch. It might be possible to also test what happens when we upgrade from a previously released version with some edits in this tooling, though that's not the main use-case here. A typical manual session looks like the following. First, let us prepare a tmux environment with two panes. 
The top pane will show the logs from all the nodes running within the docker-compose orchestration: ```bash $ tmux $ tmux split-window -v ``` Now, in the first pane, build our images and start all our services: ```bash # in the first pane $ make build $ make up ``` Now, in the second pane, watch until the cluster has reached a stable state (primary/secondary/secondary) before we go on to upgrade the monitor. ```bash # in the second pane $ make watch ``` To upgrade the monitor we apply a local patch that provides version 1.7 (with no schema changes, just version number hacking), build an updated docker image using the patch, and restart the monitor with this new version: ```bash # in the second pane $ make version $ make upgrade-monitor ``` To check that the upgrade went well, we can do: ```bash # in the second pane $ make version $ make state ``` We should see the Postgres nodes being verbose about the monitor having been upgraded, but not the nodes. It is possible to now upgrade the nodes to the current version too, though that's not the goal of this work at the moment. ```bash # in the second pane $ make upgrade-nodes # in the first pane, C-c the current logs session, and re-attach $ make tail # in the second page $ make version $ make state ``` To test a failover: ```bash $ make failover $ make state ``` Time to clean-up our local repository: ```bash # in the second pane $ make down $ make clean ``` pg_auto_failover-1.6.3/tests/upgrade/docker-compose.yml000066400000000000000000000032371414244367200233100ustar00rootroot00000000000000version: "3.9" # optional since v1.27.0 services: monitor: build: ../.. hostname: monitor volumes: - monitor_data:/var/lib/postgres:rw environment: PGDATA: /var/lib/postgres/pgaf command: pg_autoctl create monitor --ssl-self-signed --auth trust --run expose: - 5432 node1: build: ../.. 
hostname: node1 volumes: - node1_data:/var/lib/postgres:rw environment: PGDATA: /var/lib/postgres/pgaf PGUSER: ad PGDATABASE: analytics PG_AUTOCTL_MONITOR: "postgresql://autoctl_node@monitor/pg_auto_failover" command: pg_autoctl create postgres --ssl-self-signed --auth trust --pg-hba-lan --run expose: - 5432 node2: build: ../.. hostname: node2 volumes: - node2_data:/var/lib/postgres:rw environment: PGDATA: /var/lib/postgres/pgaf PGUSER: ad PGDATABASE: analytics PG_AUTOCTL_MONITOR: "postgresql://autoctl_node@monitor/pg_auto_failover" command: pg_autoctl create postgres --ssl-self-signed --auth trust --pg-hba-lan --run expose: - 5432 node3: build: ../.. hostname: node3 volumes: - node3_data:/var/lib/postgres:rw environment: PGDATA: /var/lib/postgres/pgaf PGUSER: ad PGDATABASE: analytics PG_AUTOCTL_MONITOR: "postgresql://autoctl_node@monitor/pg_auto_failover" command: pg_autoctl create postgres --ssl-self-signed --auth trust --pg-hba-lan --run expose: - 5432 volumes: monitor_data: external: true name: volm node1_data: external: true name: vol1 node2_data: external: true name: vol2 node3_data: external: true name: vol3 pg_auto_failover-1.6.3/tests/upgrade/monitor-upgrade-1.7.patch000066400000000000000000000041261414244367200243110ustar00rootroot00000000000000diff --git a/src/bin/pg_autoctl/defaults.h b/src/bin/pg_autoctl/defaults.h index 8e7e2eaf..643dc96a 100644 --- a/src/bin/pg_autoctl/defaults.h +++ b/src/bin/pg_autoctl/defaults.h @@ -17,7 +17,7 @@ #define PG_AUTOCTL_VERSION "1.6.2" /* version of the extension that we requite to talk to on the monitor */ -#define PG_AUTOCTL_EXTENSION_VERSION "1.6" +#define PG_AUTOCTL_EXTENSION_VERSION "1.7" /* environment variable to use to make DEBUG facilities available */ #define PG_AUTOCTL_DEBUG "PG_AUTOCTL_DEBUG" diff --git a/src/monitor/Makefile b/src/monitor/Makefile index 8e6bf321..478fb2a7 100644 --- a/src/monitor/Makefile +++ b/src/monitor/Makefile @@ -2,7 +2,7 @@ # Licensed under the PostgreSQL License. 
EXTENSION = pgautofailover -EXTVERSION = 1.6 +EXTVERSION = 1.7 SRC_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) diff --git a/src/monitor/metadata.h b/src/monitor/metadata.h index 9e50ab72..0f8b39a3 100644 --- a/src/monitor/metadata.h +++ b/src/monitor/metadata.h @@ -15,7 +15,7 @@ #include "storage/lockdefs.h" -#define AUTO_FAILOVER_EXTENSION_VERSION "1.6" +#define AUTO_FAILOVER_EXTENSION_VERSION "1.7" #define AUTO_FAILOVER_EXTENSION_NAME "pgautofailover" #define AUTO_FAILOVER_SCHEMA_NAME "pgautofailover" #define AUTO_FAILOVER_FORMATION_TABLE "pgautofailover.formation" diff --git a/src/monitor/pgautofailover--1.6--1.7.sql b/src/monitor/pgautofailover--1.6--1.7.sql new file mode 100644 index 00000000..7167ee17 --- /dev/null +++ b/src/monitor/pgautofailover--1.6--1.7.sql @@ -0,0 +1,6 @@ +-- +-- dummy extension update file that does nothing +-- +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION pgautofailover UPDATE TO dummy" to load this file. \quit + diff --git a/src/monitor/pgautofailover.control b/src/monitor/pgautofailover.control index a649eb76..d0504410 100644 --- a/src/monitor/pgautofailover.control +++ b/src/monitor/pgautofailover.control @@ -1,5 +1,5 @@ comment = 'pg_auto_failover' -default_version = '1.6' +default_version = '1.7' module_pathname = '$libdir/pgautofailover' relocatable = false requires = 'btree_gist' pg_auto_failover-1.6.3/valgrind/000077500000000000000000000000001414244367200166635ustar00rootroot00000000000000pg_auto_failover-1.6.3/valgrind/.gitignore000066400000000000000000000001071414244367200206510ustar00rootroot00000000000000# Ignore everything in this directory * # Except this file !.gitignore