pax_global_header00006660000000000000000000000064144726772220014527gustar00rootroot0000000000000052 comment=176c782300f89917c82618ddfb63221048c38b9e londiste-3.12/000077500000000000000000000000001447267722200132755ustar00rootroot00000000000000londiste-3.12/.github/000077500000000000000000000000001447267722200146355ustar00rootroot00000000000000londiste-3.12/.github/workflows/000077500000000000000000000000001447267722200166725ustar00rootroot00000000000000londiste-3.12/.github/workflows/ci.yml000066400000000000000000000103571447267722200200160ustar00rootroot00000000000000# # https://docs.github.com/en/actions/reference # https://github.com/actions # # uses: https://github.com/actions/checkout @v3 # uses: https://github.com/actions/setup-python @v4 # uses: https://github.com/actions/download-artifact @v3 # uses: https://github.com/actions/upload-artifact @v3 # name: CI on: pull_request: {} push: {} jobs: check: name: "Check" runs-on: ubuntu-latest strategy: matrix: test: - {PY: "3.10", TOXENV: "lint"} steps: - name: "Checkout" uses: actions/checkout@v3 - name: "Setup Python ${{matrix.test.PY}}" uses: actions/setup-python@v4 with: python-version: ${{matrix.test.PY}} - run: python3 -m pip install -r etc/requirements.build.txt --disable-pip-version-check - name: "Test" env: TOXENV: ${{matrix.test.TOXENV}} run: python3 -m tox -r database: name: "Python ${{matrix.test.PY}} + PostgreSQL ${{matrix.test.PG}}" runs-on: ubuntu-latest strategy: matrix: test: - {PY: "3.7", PG: "11", TOXENV: "py37"} - {PY: "3.8", PG: "12", TOXENV: "py38"} - {PY: "3.9", PG: "13", TOXENV: "py39"} - {PY: "3.10", PG: "14", TOXENV: "py310"} - {PY: "3.11", PG: "15", TOXENV: "py311"} steps: - name: "Checkout" uses: actions/checkout@v3 - name: "Setup Python ${{matrix.test.PY}}" uses: actions/setup-python@v4 with: python-version: ${{matrix.test.PY}} - run: python3 -m pip install -r etc/requirements.build.txt --disable-pip-version-check - name: "InstallDB" run: | echo "::group::apt-get-update" sudo -nH apt-get -q update sudo -nH apt-get -q install curl ca-certificates gnupg curl https://www.postgresql.org/media/keys/ACCC4CF8.asc \ | gpg --dearmor \ | sudo -nH tee /etc/apt/trusted.gpg.d/apt.postgresql.org.gpg echo "deb http://apt.postgresql.org/pub/repos/apt/ $(lsb_release -cs)-pgdg main ${{matrix.test.PG}}" \ | sudo -nH tee /etc/apt/sources.list.d/pgdg.list sudo -nH apt-get -q update echo "::endgroup::" echo "::group::apt-get-install" # disable new cluster creation sudo -nH mkdir -p /etc/postgresql-common/createcluster.d echo "create_main_cluster = false" | sudo -nH tee /etc/postgresql-common/createcluster.d/no-main.conf sudo -nH apt-get -qyu install postgresql-${{matrix.test.PG}} postgresql-server-dev-${{matrix.test.PG}} pgqd echo "::endgroup::" # tune environment echo "/usr/lib/postgresql/${{matrix.test.PG}}/bin" >> $GITHUB_PATH echo "PGHOST=/tmp" >> $GITHUB_ENV - name: "Install extensions" run: | echo "::group::install-pgq" git clone -q https://github.com/pgq/pgq pgq-sql; make -C pgq-sql sudo -nH bash -c "PATH='${PATH}' make install -C pgq-sql" echo "::endgroup::" echo "::group::install-pgq-node" git clone -q https://github.com/pgq/pgq-node; make -C pgq-node sudo -nH bash -c "PATH='${PATH}' make install -C pgq-node" echo "::endgroup::" echo "::group::install-londiste" git clone -q https://github.com/pgq/londiste-sql; make -C londiste-sql sudo -nH bash -c "PATH='${PATH}' make install -C londiste-sql" echo "::endgroup::" - name: "StartDB" run: | rm -rf data log mkdir -p log LANG=C initdb data sed -ri -e "s,^[# 
]*(unix_socket_directories).*,\\1='/tmp'," data/postgresql.conf pg_ctl -D data -l log/pg.log start || { cat log/pg.log ; exit 1; } sleep 1 - name: "CreateDB" run: | psql -d postgres -c "create database testdb" psql -d testdb -c "create extension pgq; create extension pgq_node;" psql -d testdb -c "select pgq.create_queue('testq')" - name: "Test" env: TOXENV: ${{matrix.test.TOXENV}} TEST_Q_NAME: testq PGDATABASE: testdb run: | python3 -m tox -r -- --color=yes - name: "StopDB" run: | pg_ctl -D data stop rm -rf data log /tmp/.s.PGSQL* londiste-3.12/.github/workflows/release.yml000066400000000000000000000060141447267722200210360ustar00rootroot00000000000000# # This runs when version tag is pushed # name: REL on: push: tags: ["v[0-9]*"] jobs: sdist: name: "Build source package" runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: {python-version: "3.11"} - run: python3 -m pip install -r etc/requirements.build.txt --disable-pip-version-check - run: python3 setup.py sdist - run: python3 setup.py bdist_wheel - uses: actions/upload-artifact@v3 with: {name: "dist", path: "dist"} publish: name: "Publish" runs-on: ubuntu-latest needs: [sdist] steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: {python-version: "3.11"} - run: python3 -m pip install -r etc/requirements.build.txt --disable-pip-version-check - name: "Get files" uses: actions/download-artifact@v3 with: {name: "dist", path: "dist"} - name: "Install pandoc" run: | sudo -nH apt-get -u -y install pandoc pandoc --version - name: "Prepare" run: | PACKAGE=$(python3 setup.py --name) VERSION=$(python3 setup.py --version) TGZ="${PACKAGE}-${VERSION}.tar.gz" # default - gh:release, pypi # PRERELEASE - gh:prerelease, pypi # DRAFT - gh:draft,prerelease, testpypi PRERELEASE="false"; DRAFT="false" case "${VERSION}" in *[ab]*|*rc*) PRERELEASE="true";; *dev*) PRERELEASE="true"; DRAFT="true";; esac test "${{github.ref}}" = "refs/tags/v${VERSION}" || { echo "ERR: tag mismatch"; exit 1; } test -f "dist/${TGZ}" || { echo "ERR: sdist failed"; exit 1; } echo "PACKAGE=${PACKAGE}" >> $GITHUB_ENV echo "VERSION=${VERSION}" >> $GITHUB_ENV echo "TGZ=${TGZ}" >> $GITHUB_ENV echo "PRERELEASE=${PRERELEASE}" >> $GITHUB_ENV echo "DRAFT=${DRAFT}" >> $GITHUB_ENV mkdir -p tmp make -s shownote > tmp/note.md cat tmp/note.md ls -l dist - name: "Create Github release" env: GH_TOKEN: ${{secrets.GITHUB_TOKEN}} run: | title="${PACKAGE} v${VERSION}" ghf="--notes-file=./tmp/note.md" if test "${DRAFT}" = "true"; then ghf="${ghf} --draft"; fi if test "${PRERELEASE}" = "true"; then ghf="${ghf} --prerelease"; fi gh release create "v${VERSION}" "dist/${TGZ}" --title="${title}" ${ghf} - name: "Upload to PYPI" id: pypi_upload env: PYPI_TOKEN: ${{secrets.PYPI_TOKEN}} PYPI_TEST_TOKEN: ${{secrets.PYPI_TEST_TOKEN}} run: | ls -l dist if test "${DRAFT}" = "false"; then python -m twine upload -u __token__ -p ${PYPI_TOKEN} \ --repository pypi --disable-progress-bar dist/* else python -m twine upload -u __token__ -p ${PYPI_TEST_TOKEN} \ --repository testpypi --disable-progress-bar dist/* fi londiste-3.12/.gitignore000066400000000000000000000003451447267722200152670ustar00rootroot00000000000000__pycache__ *.pyc *.swp *.o *.so *.egg-info *.debhelper *.log *.substvars *-stamp debian/files debian/londiste debian/python-* debian/python3-* .tox .pybuild .pytype .coverage MANIFEST build tmp dist conf log pid *.log *.pid londiste-3.12/MANIFEST.in000066400000000000000000000002721447267722200150340ustar00rootroot00000000000000include etc/note.awk 
etc/*.yml etc/Docker* include MANIFEST.in Makefile tox.ini include README.rst NEWS.rst include tests/*.py tests/*.sh include tests/*/*.py tests/*/*.sh tests/*/*.sql londiste-3.12/Makefile000066400000000000000000000041261447267722200147400ustar00rootroot00000000000000 VERSION = $(shell python3 setup.py --version) RXVERSION = $(shell python3 setup.py --version | sed 's/\./[.]/g') TAG = v$(VERSION) NEWS = NEWS.rst all: lint test test: tox -e py38 lint: tox -q -e lint xlint: tox -q -e xlint clean: rm -rf build *.egg-info */__pycache__ tests/*.pyc rm -rf .pybuild MANIFEST xclean: clean rm -rf .tox dist sdist: python3 setup.py -q sdist checkver: @echo "Checking version" @grep -Eq '^\w+ v$(RXVERSION)\b' $(NEWS) \ || { echo "Version '$(VERSION)' not in $(NEWS)"; exit 1; } @echo "Checking git repo" @git diff --stat --exit-code || { echo "ERROR: Unclean repo"; exit 1; } release: checkver git tag $(TAG) git push github $(TAG):$(TAG) unrelease: git push github :$(TAG) git tag -d $(TAG) shownote: awk -v VER="v$(VERSION)" -f etc/note.awk $(NEWS) \ | pandoc -f rst -t gfm --wrap=none # # docker tests # pull-python: docker pull python:3.11-slim-bookworm docker pull python:3.10-slim-bookworm docker pull python:3.9-slim-bookworm docker pull python:3.8-slim-bookworm docker pull python:3.7-slim-bookworm pull-postgres: docker pull postgres:15-bookworm docker pull postgres:14-bookworm docker pull postgres:13-bookworm docker pull postgres:12-bookworm docker pull postgres:11-bookworm prune: docker image prune -f docker image ls # # test with combined image # COMPOSE_COMBO = docker compose -f etc/compose-combo.yml --project-directory . test-pg15-build: $(COMPOSE_COMBO) build test-pg15 test-pg15-shell: $(COMPOSE_COMBO) run --entrypoint bash test-pg15 test-pg10 test-pg11 test-pg12 test-pg13 test-pg14 test-pg15: $(COMPOSE_COMBO) up --build $@ test-all: $(COMPOSE_COMBO) up --build # # does not work yet # COMPOSE_SPLIT = docker compose -f etc/compose-split.yml --project-directory . dtest-db-build: $(COMPOSE_SPLIT) build db dtest-db-shell: $(COMPOSE_SPLIT) run --entrypoint bash db dtest-db-up: $(COMPOSE_SPLIT) up --build db dtest-worker-build: $(COMPOSE_SPLIT) build test dtest-worker-shell: $(COMPOSE_SPLIT) run --entrypoint bash test dtest-worker-up: $(COMPOSE_SPLIT) up --build test dtest-split-run: $(COMPOSE_SPLIT) up londiste-3.12/NEWS.rst000066400000000000000000000047251447267722200146130ustar00rootroot00000000000000 NEWS ==== Londiste v3.12 -------------- * typing: add full typing * tests/obfuscate: fix pgqd conf * build: convert to pyproject * mk: test-db15 .. test-pg15, test-all targets for docker tests * ci: drop obsolete actions * handlers: drop encoding validator, it has never worked on Py3 Londiste v3.11 -------------- * playback: allow EXECUTE to pass through on merge nodes * playback: make EXECUTE passthrough tunable * tox: update packages * cleanup: improve typings Londiste v3.10 -------------- * shard: add disable_replay parameter. * lint: upgrade linters, fix new warnings Londiste v3.9.2 --------------- * playback: fix weird result check Londiste v3.9.1 --------------- * playback: tolerate weird result from version query Londiste v3.9 ------------- * playback: support multistep fkey restore Londiste v3.8.6 --------------- * playback: fix variable init. Londiste v3.8.5 --------------- * playback: move ``local_only`` setup even earlier. Londiste v3.8.4 --------------- * playback: fix ``local_only`` setup which allowed first batch without filter. 
Londiste v3.8.3 --------------- * status: support --compact option Londiste v3.8.2 --------------- * shard: better error handling on missing shard key * admin: disable pidfile write for wait-sync Londiste v3.8.1 --------------- * Filter tables on registration: register_only_tables/register_skip_tables * Filter seqs on registration: register_only_seqs/register_skip_seqs Londiste v3.8 ------------- * shard handler: support filtered copy, load settings from config file: ``shard_hash_func``, ``shard_info_sql``. * fix: always call handler's ``prepare_batch``. Londiste v3.7.1 --------------- * Fix write_hook in parallel copy. Londiste v3.7 ------------- * Parallel single table copy: - threaded_copy_tables - list of glob patterns for table names - threaded_copy_pool_size - number of threads * Various linter fixes Londiste v3.6.1 --------------- * Fix fkey log message * Upgrade Skytools dependency in tox to get copy_from fix * Various linter fixes Londiste v3.6 ------------- * obfuscate: process copy events * Various linter fixes * Docker tests Londiste v3.5 ------------- * obfuscate: Improved decoding/encoding handling * Setup Github Actions * Code cleanups * Drop Debian packaging Londiste v3.4.1 --------------- * obfuscate: improvements for better usage with inheritance * pip: due to psycopg2/psycopg2-binary duality, drop direct dependency Londiste v3.4 ------------- * Move to separate repo londiste-3.12/README.rst000066400000000000000000000001211447267722200147560ustar00rootroot00000000000000 Londiste ======== Londiste is PgQ-based logical replication for PostgreSQL. londiste-3.12/etc/000077500000000000000000000000001447267722200140505ustar00rootroot00000000000000londiste-3.12/etc/Dockerfile.combo000066400000000000000000000044111447267722200171400ustar00rootroot00000000000000# contains both db and workers # https://docs.docker.com/engine/reference/builder/ ARG PY=3 FROM python:${PY}-slim-bookworm # need to repeat after FROM ARG PY=3 ARG PG=15 # copy to env ENV PY=${PY} ENV PG=${PG} WORKDIR /code RUN set -ex; \ echo "Init: PY=${PY} PG=${PG}"; \ export DEBIAN_FRONTEND="noninteractive"; \ apt="apt-get -qq -y --no-install-recommends"; \ ${apt} update; \ ${apt} install wget gnupg2 lsb-release git make gcc libc6-dev cpio; \ ${apt} dist-upgrade; \ wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor > /etc/apt/trusted.gpg.d/pgdg-archive.gpg; \ echo "deb http://apt.postgresql.org/pub/repos/apt/ $(lsb_release -cs)-pgdg main ${PG}" > /etc/apt/sources.list.d/pgdg.list; \ ${apt} update; \ # disable new cluster creation mkdir -p /etc/postgresql-common/createcluster.d; \ echo "create_main_cluster = false" | tee /etc/postgresql-common/createcluster.d/no-main.conf; \ ${apt} install postgresql-${PG} postgresql-server-dev-${PG} pgqd; \ # build extensions git clone -q https://github.com/pgq/pgq; make -C pgq; \ bash -c "PATH='${PATH}' make install -C pgq"; \ git clone -q https://github.com/pgq/pgq-node; make -C pgq-node; \ bash -c "PATH='${PATH}' make install -C pgq-node"; \ git clone -q https://github.com/pgq/londiste-sql; make -C londiste-sql; \ bash -c "PATH='${PATH}' make install -C londiste-sql"; \ # clean build env rm -rf pgq pgq-node londiste-sql; \ ${apt} remove wget gnupg2 lsb-release git make gcc libc6-dev postgresql-server-dev-${PG}; \ ${apt} autoremove; \ rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin; \ rm -rf /var/lib/apt/lists/*; \ chown postgres:postgres /code; \ echo "OK: PY=${PY} PG=${PG}"; ENV 
PATH="/usr/lib/postgresql/${PG}/bin:/var/lib/postgresql/.local/bin:${PATH}" ENV PGHOST="/tmp" USER postgres RUN set -ex; \ rm -rf data log; \ mkdir -p log; \ LANG=C initdb data; \ sed -ri -e "s,^[# ]*(unix_socket_directories).*,\\1='/tmp'," data/postgresql.conf; COPY etc/requirements.txt etc/docker_run.sh etc/ RUN set -ex; \ pip3 -q --disable-pip-version-check --no-cache-dir install --user -r etc/requirements.txt ENTRYPOINT ["etc/docker_run.sh"] londiste-3.12/etc/Dockerfile.db000066400000000000000000000022701447267722200164270ustar00rootroot00000000000000# only postgres # https://docs.docker.com/engine/reference/builder/ ARG PG=15 FROM postgres:${PG}-bookworm #ARG PG=15 RUN set -ex; \ export PG="${PG_MAJOR}"; \ export DEBIAN_FRONTEND="noninteractive"; \ apt="apt-get -qq -y --no-install-recommends"; \ echo "Installing build env: ${PG}"; \ ${apt} update; \ ${apt} install ca-certificates git make gcc postgresql-server-dev-${PG}; \ echo "Building extensions"; \ export PATH="/usr/lib/postgresql/${PG}/bin:${PATH}"; \ git clone -q https://github.com/pgq/pgq; make -C pgq; \ bash -c "PATH='${PATH}' make install -C pgq"; \ git clone -q https://github.com/pgq/pgq-node; make -C pgq-node; \ bash -c "PATH='${PATH}' make install -C pgq-node"; \ git clone -q https://github.com/pgq/londiste-sql; make -C londiste-sql; \ bash -c "PATH='${PATH}' make install -C londiste-sql"; \ echo "Cleaning build env"; \ rm -rf pgq pgq-node londiste-sql; \ ${apt} remove ca-certificates git make gcc postgresql-server-dev-${PG}; \ ${apt} autoremove; \ rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin; \ rm -rf /var/lib/apt/lists/*; \ echo "OK" londiste-3.12/etc/Dockerfile.worker000066400000000000000000000013561447267722200173570ustar00rootroot00000000000000# only workers # https://docs.docker.com/engine/reference/builder/ ARG PY=3 FROM python:${PY}-slim-bookworm ARG PGHOST="db" ENV PGHOST="${PGHOST}" WORKDIR /code RUN set -ex; \ apt="apt-get -qq -y --no-install-recommends"; \ ${apt} update; \ ${apt} install pgqd; \ adduser --disabled-password --gecos=Replica londiste; \ rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin; \ rm -rf /var/lib/apt/lists/*; \ chown londiste:londiste /code; \ echo "OK: PY=${PY}" USER londiste COPY etc/requirements.txt etc/docker_run.sh etc/ RUN set -ex; \ pip3 -q --disable-pip-version-check --no-cache-dir install --user -r etc/requirements.txt ENTRYPOINT ["etc/docker_run.sh"] londiste-3.12/etc/compose-combo.yml000066400000000000000000000037041447267722200173410ustar00rootroot00000000000000# docker compose v2 file # # https://docs.docker.com/compose/compose-file/ name: londiste-combo services: test-pg15: container_name: "test-pg15" build: context: . dockerfile: etc/Dockerfile.combo args: PG: "15" PY: "3.11" volumes: - type: bind read_only: true source: . target: /repo deploy: mode: global command: ["./run.sh"] test-pg14: container_name: "test-pg14" build: context: . dockerfile: etc/Dockerfile.combo args: PG: "14" PY: "3.10" volumes: - type: bind read_only: true source: . target: /repo deploy: mode: global command: ["./run.sh"] test-pg13: container_name: "test-pg13" build: context: . dockerfile: etc/Dockerfile.combo args: PG: "13" PY: "3.9" volumes: - type: bind read_only: true source: . target: /repo deploy: mode: global command: ["./run.sh"] test-pg12: container_name: "test-pg12" build: context: . dockerfile: etc/Dockerfile.combo args: PG: "12" PY: "3.8" volumes: - type: bind read_only: true source: . 
target: /repo deploy: mode: global command: ["./run.sh"] test-pg11: container_name: "test-pg11" build: context: . dockerfile: etc/Dockerfile.combo args: PG: "11" PY: "3.7" volumes: - type: bind read_only: true source: . target: /repo deploy: mode: global command: ["./run.sh"] test-pg10: container_name: "test-pg10" build: context: . dockerfile: etc/Dockerfile.combo args: PG: "10" PY: "3.8" volumes: - type: bind read_only: true source: . target: /repo deploy: mode: global command: ["./run.sh"] londiste-3.12/etc/compose-split.yml000066400000000000000000000012521447267722200173710ustar00rootroot00000000000000# docker compose v2 file # https://docs.docker.com/compose/compose-file/ # fixme: does not work yet name: londiste-split services: test: build: context: . dockerfile: etc/Dockerfile.worker args: PY: "3.10" volumes: - type: bind read_only: true source: . target: /repo networks: - testnet deploy: mode: global command: ["./run.sh"] db: build: context: . dockerfile: etc/Dockerfile.db args: PG: "15" volumes: - pgdata:/var/lib/postgresql/data networks: - testnet deploy: mode: global volumes: pgdata: networks: testnet: londiste-3.12/etc/docker_run.sh000077500000000000000000000012641447267722200165450ustar00rootroot00000000000000#! /bin/bash # python3 setup.py build -t ../tmp -b ../tmp set -e set -x copy_repo() { mkdir -p "$2" cd "$1" ./etc/showfiles.sh | cpio -p --no-preserve-owner -d -m "$2" chmod -R u+w "$2" cd /code } pg_start() { cd /code pg_ctl -D data -l log/pg.log start || { cat log/pg.log ; exit 1; } } #cd /repo #pip3 -q --disable-pip-version-check --no-cache-dir install --user -r /repo/etc/requirements.txt chmod -R u+w /code/src || true rm -rf /code/src /code/tests copy_repo /repo /code/src cd /code/src python3 setup.py -q sdist pip3 install --disable-pip-version-check --no-cache-dir --no-deps --user dist/* cp -rp tests /code pg_start cd /code/tests exec "$@" londiste-3.12/etc/note.awk000066400000000000000000000004601447267722200155210ustar00rootroot00000000000000# extract version notes for version VER /^[-_0-9a-zA-Z]+ v?[0-9]/ { if ($2 == VER) { good = 1 next } else { good = 0 } } /^(===|---)/ { next } { if (good) { # also remove sphinx syntax print gensub(/:(\w+):`~?([^`]+)`/, "``\\2``", "g") } } londiste-3.12/etc/requirements.build.txt000066400000000000000000000000631447267722200204310ustar00rootroot00000000000000setuptools>=67 wheel>=0.41 twine==4.0.2 tox==4.8.0 londiste-3.12/etc/requirements.txt000066400000000000000000000002001447267722200173240ustar00rootroot00000000000000setuptools>=67 wheel>=0.41 build>=0.10 flit-core>=3.6,<4 pip>=23 psycopg2-binary>=2.9 pyyaml>=6.0,<7 skytools==3.9.2 pgq==3.8 londiste-3.12/etc/showfiles.sh000077500000000000000000000006651447267722200164210ustar00rootroot00000000000000#! /bin/sh find . 
\ -name '.git' -prune -o \ -name 'dist' -prune -o \ -name 'build' -prune -o \ -name '*.egg-info' -prune -o \ -name '__pycache__' -prune -o \ -name '.pytype' -prune -o \ -name '.mypy_cache' -prune -o \ -name '.tox' -prune -o \ -name 'data' -prune -o \ -name '*.log' -prune -o \ -name '*.pid' -prune -o \ -name 'tmp' -prune -o \ -name '*.swp' -prune -o \ -print londiste-3.12/londiste/000077500000000000000000000000001447267722200151165ustar00rootroot00000000000000londiste-3.12/londiste/__init__.py000066400000000000000000000000711447267722200172250ustar00rootroot00000000000000 """Replication on top of PgQ.""" __version__ = "3.12" londiste-3.12/londiste/admin.py000066400000000000000000001014111447267722200165560ustar00rootroot00000000000000"""Londiste setup and sanity checker. """ from typing import Optional, Sequence, Dict, List, Any, cast import os import re import sys import optparse import skytools from skytools.basetypes import Connection, Cursor, DictRow #from pgq import BatchInfo from pgq.cascade.admin import CascadeAdmin from pgq.cascade.nodeinfo import NodeInfo from .exec_attrs import ExecAttrs from .handler import ( create_handler_string, build_handler, show as show_handlers, ) from .handlers import load_handler_modules from .util import find_copy_source __all__ = ['LondisteSetup'] class LondisteSetup(CascadeAdmin): """Londiste-specific admin commands.""" initial_db_name: str = 'node_db' provider_location: Optional[str] = None commands_without_pidfile = CascadeAdmin.commands_without_pidfile + [ 'tables', 'seqs', 'missing', 'show-handlers', 'wait-sync', ] register_only_tables: Optional[Sequence[str]] = None register_only_seqs: Optional[Sequence[str]] = None register_skip_tables: Optional[Sequence[str]] = None register_skip_seqs: Optional[Sequence[str]] = None def install_code(self, db: Connection) -> None: self.extra_objs = [ skytools.DBSchema("londiste", sql='create extension londiste'), #skytools.DBFunction("londiste.global_add_table", 2, sql_file='londiste.upgrade_2.1_to_3.1.sql'), ] super().install_code(db) def __init__(self, args: Sequence[str]) -> None: """Londiste setup init.""" super().__init__('londiste', 'db', args, worker_setup=True) # compat self.queue_name = self.cf.get('pgq_queue_name', '') # real if not self.queue_name: self.queue_name = self.cf.get('queue_name') self.set_name = self.queue_name self.lock_timeout = self.cf.getfloat('lock_timeout', 10) self.register_only_tables = self.cf.getlist("register_only_tables", []) self.register_only_seqs = self.cf.getlist("register_only_seqs", []) self.register_skip_tables = self.cf.getlist("register_skip_tables", []) self.register_skip_seqs = self.cf.getlist("register_skip_seqs", []) load_handler_modules(self.cf) def init_optparse(self, parser: Optional[optparse.OptionParser] = None) -> optparse.OptionParser: """Add londiste switches to CascadeAdmin ones.""" p = super().init_optparse(parser) p.add_option("--expect-sync", action="store_true", dest="expect_sync", help="no copy needed", default=False) p.add_option("--skip-truncate", action="store_true", dest="skip_truncate", help="do not delete old data", default=False) p.add_option("--find-copy-node", action="store_true", dest="find_copy_node", help="add: find table source for copy by walking upwards") p.add_option("--copy-node", metavar="NODE", dest="copy_node", help="add: use NODE as source for initial copy") p.add_option("--force", action="store_true", help="force", default=False) p.add_option("--all", action="store_true", help="include all tables", default=False) 
p.add_option("--wait-sync", action="store_true", help="add: wait until all tables are in sync") p.add_option("--create", action="store_true", help="create, minimal", default=False) p.add_option("--create-full", action="store_true", help="create, full") p.add_option("--trigger-flags", help="set trigger flags (BAIUDLQ)") p.add_option("--trigger-arg", action="append", help="custom trigger arg") p.add_option("--no-triggers", action="store_true", help="no triggers on table") p.add_option("--handler", action="store", help="add: custom handler for table") p.add_option("--handler-arg", action="append", help="add: argument to custom handler") p.add_option("--merge-all", action="store_true", help="merge tables from all source queues", default=False) p.add_option("--no-merge", action="store_true", help="do not merge tables from source queues", default=False) p.add_option("--max-parallel-copy", metavar="NUM", type="int", help="max number of parallel copy processes") p.add_option("--dest-table", metavar="NAME", help="add: name for actual table") p.add_option("--skip-non-existing", action="store_true", help="add: skip object that does not exist") p.add_option("--names-only", action="store_true", help="tables: show only table names (for scripting)", default=False) return p def extra_init(self, node_type: str, node_db: Connection, provider_db: Optional[Connection]) -> None: """Callback from CascadeAdmin init.""" if not provider_db: return pcurs = provider_db.cursor() ncurs = node_db.cursor() # sync tables q = "select table_name from londiste.get_table_list(%s)" pcurs.execute(q, [self.set_name]) for row in pcurs.fetchall(): tbl = row['table_name'] if self.register_only_tables and tbl not in self.register_only_tables: continue if self.register_skip_tables and tbl in self.register_skip_tables: continue q = "select * from londiste.global_add_table(%s, %s)" ncurs.execute(q, [self.set_name, tbl]) # sync seqs q = "select seq_name, last_value from londiste.get_seq_list(%s)" pcurs.execute(q, [self.set_name]) for row in pcurs.fetchall(): seq = row['seq_name'] val = row['last_value'] if self.register_only_seqs and seq not in self.register_only_seqs: continue if self.register_skip_seqs and seq in self.register_skip_seqs: continue q = "select * from londiste.global_update_seq(%s, %s, %s)" ncurs.execute(q, [self.set_name, seq, val]) # done node_db.commit() provider_db.commit() def is_root(self) -> bool: assert self.queue_info return self.queue_info.local_node.type == 'root' def set_lock_timeout(self, curs: Cursor) -> None: ms = int(1000 * self.lock_timeout) if ms > 0: q = "SET LOCAL statement_timeout = %d" % ms self.log.debug(q) curs.execute(q) def cmd_add_table(self, *tables: str) -> None: """Attach table(s) to local node.""" self.load_local_info() src_db = self.get_provider_db() if not self.is_root(): src_curs = src_db.cursor() src_tbls = self.fetch_set_tables(src_curs) src_db.commit() dst_db = self.get_database('db') dst_curs = dst_db.cursor() dst_tbls = self.fetch_set_tables(dst_curs) if self.is_root(): src_tbls = dst_tbls else: self.sync_table_list(dst_curs, src_tbls, dst_tbls) dst_db.commit() needs_tbl = self.handler_needs_table() args = self.expand_arg_list(dst_db, 'r', False, tables, needs_tbl) # pick proper create flags if self.options.create_full: create_flags = skytools.T_ALL elif self.options.create: create_flags = skytools.T_TABLE | skytools.T_PKEY else: create_flags = 0 # search for usable copy node if requested & needed if (self.options.find_copy_node and create_flags != 0 and needs_tbl and not 
self.is_root()): assert self.queue_name assert self.provider_location src_name, _, _ = find_copy_source(self, self.queue_name, args, "?", self.provider_location) self.options.copy_node = src_name self.close_database('provider_db') src_db = self.get_provider_db() src_curs = src_db.cursor() src_tbls = self.fetch_set_tables(src_curs) src_db.commit() # dont check for exist/not here (root handling) if not self.is_root() and not self.options.expect_sync and not self.options.find_copy_node: problems = False for tbl in args: tbl = skytools.fq_name(tbl) if (tbl in src_tbls) and not src_tbls[tbl]['local']: if self.options.skip_non_existing: self.log.warning("Table %s does not exist on provider", tbl) else: self.log.error("Table %s does not exist on provider, need to switch to different provider", tbl) problems = True if problems: self.log.error("Problems, canceling operation") sys.exit(1) # sanity check if self.options.dest_table and len(args) > 1: self.log.error("--dest-table can be given only for single table") sys.exit(1) # seems ok for tbl in args: self.add_table(src_db, dst_db, tbl, create_flags, src_tbls) # wait if self.options.wait_sync: self.wait_for_sync(dst_db) def add_table(self, src_db: Connection, dst_db: Connection, tbl: str, create_flags: int, src_tbls: Dict[str, DictRow]) -> None: # use full names tbl = skytools.fq_name(tbl) dest_table = self.options.dest_table or tbl dest_table = skytools.fq_name(dest_table) src_curs = src_db.cursor() dst_curs = dst_db.cursor() tbl_exists = skytools.exists_table(dst_curs, dest_table) dst_db.commit() self.set_lock_timeout(dst_curs) if dest_table == tbl: desc = tbl else: desc = "%s(%s)" % (tbl, dest_table) if create_flags: if tbl_exists: self.log.info('Table %s already exist, not touching', desc) else: src_dest_table = src_tbls[tbl]['dest_table'] if not skytools.exists_table(src_curs, src_dest_table): # table not present on provider - nowhere to get the DDL from self.log.warning('Table %s missing on provider, cannot create, skipping', desc) return schema = skytools.fq_name_parts(dest_table)[0] if not skytools.exists_schema(dst_curs, schema): q = "create schema %s" % skytools.quote_ident(schema) dst_curs.execute(q) s = skytools.TableStruct(src_curs, src_dest_table) src_db.commit() # create, using rename logic only when necessary newname = None if src_dest_table != dest_table: newname = dest_table s.create(dst_curs, create_flags, log=self.log, new_table_name=newname) elif not tbl_exists and self.options.skip_non_existing: self.log.warning('Table %s does not exist on local node, skipping', desc) return tgargs = self.build_tgargs() attrs: Dict[str, str] = {} if self.options.handler: attrs['handler'] = self.build_handler(tbl, tgargs, self.options.dest_table) if self.options.find_copy_node: attrs['copy_node'] = '?' 
elif self.options.copy_node: attrs['copy_node'] = self.options.copy_node if not self.options.expect_sync: if self.options.skip_truncate: attrs['skip_truncate'] = "1" if self.options.max_parallel_copy: attrs['max_parallel_copy'] = self.options.max_parallel_copy # actual table registration args = [self.set_name, tbl, tgargs, None, None] if attrs: args[3] = skytools.db_urlencode(attrs) if dest_table != tbl: args[4] = dest_table q = "select * from londiste.local_add_table(%s, %s, %s, %s, %s)" self.exec_cmd(dst_curs, q, args) dst_db.commit() def build_tgargs(self) -> List[str]: """Build trigger args""" tgargs: List[str] = [] if self.options.trigger_arg: tgargs = self.options.trigger_arg tgflags = self.options.trigger_flags if tgflags: tgargs.append('tgflags=' + tgflags) if self.options.no_triggers: tgargs.append('no_triggers') if self.options.merge_all: tgargs.append('merge_all') if self.options.no_merge: tgargs.append('no_merge') if self.options.expect_sync: tgargs.append('expect_sync') return tgargs def build_handler(self, tbl: str, tgargs: List[str], dest_table: Optional[str] = None) -> str: """Build handler and return handler string""" hstr = create_handler_string(self.options.handler, self.options.handler_arg) p = build_handler(tbl, hstr, dest_table) p.add(tgargs) return hstr def handler_needs_table(self) -> bool: if self.options.handler: hstr = create_handler_string(self.options.handler, self.options.handler_arg) p = build_handler('unused.string', hstr, None) return p.needs_table() return True def sync_table_list(self, dst_curs: Cursor, src_tbls: Dict[str, DictRow], dst_tbls: Dict[str, DictRow]) -> None: for tbl in src_tbls.keys(): if self.register_only_tables and tbl not in self.register_only_tables: continue if self.register_skip_tables and tbl in self.register_skip_tables: continue q = "select * from londiste.global_add_table(%s, %s)" if tbl not in dst_tbls: self.log.info("Table %s info missing from subscriber, adding", tbl) self.exec_cmd(dst_curs, q, [self.set_name, tbl]) dst_tbls[tbl] = cast(DictRow, {'local': False, 'dest_table': tbl}) for tbl in list(dst_tbls.keys()): q = "select * from londiste.global_remove_table(%s, %s)" if tbl not in src_tbls: self.log.info("Table %s gone but exists on subscriber, removing", tbl) self.exec_cmd(dst_curs, q, [self.set_name, tbl]) del dst_tbls[tbl] def fetch_set_tables(self, curs: Cursor) -> Dict[str, DictRow]: q = "select table_name, local, "\ " coalesce(dest_table, table_name) as dest_table "\ " from londiste.get_table_list(%s)" curs.execute(q, [self.set_name]) res = {} for row in curs.fetchall(): res[row[0]] = row return res def cmd_remove_table(self, *tables: str) -> None: """Detach table(s) from local node.""" db = self.get_database('db') args = self.expand_arg_list(db, 'r', True, tables) q = "select * from londiste.local_remove_table(%s, %s)" self.exec_cmd_many(db, q, [self.set_name], args) def cmd_change_handler(self, tbl: str) -> None: """Change handler (table_attrs) of the replicated table.""" self.load_local_info() tbl = skytools.fq_name(tbl) db = self.get_database('db') curs = db.cursor() q = "select table_attrs, dest_table "\ " from londiste.get_table_list(%s) "\ " where table_name = %s and local" curs.execute(q, [self.set_name, tbl]) if curs.rowcount == 0: self.log.error("Table %s not found on this node", tbl) sys.exit(1) r_attrs, dest_table = curs.fetchone() attrs = skytools.db_urldecode(r_attrs or '') old_handler = attrs.get('handler') tgargs = self.build_tgargs() if self.options.handler: new_handler = self.build_handler(tbl, 
tgargs, dest_table) else: new_handler = None if old_handler == new_handler: self.log.info("Handler is already set to desired value, nothing done") sys.exit(0) if new_handler: attrs['handler'] = new_handler elif 'handler' in attrs: del attrs['handler'] args = [self.set_name, tbl, tgargs, None] if attrs: args[3] = skytools.db_urlencode(attrs) q = "select * from londiste.local_change_handler(%s, %s, %s, %s)" self.exec_cmd(curs, q, args) db.commit() def cmd_add_seq(self, *seqs: str) -> None: """Attach seqs(s) to local node.""" dst_db = self.get_database('db') dst_curs = dst_db.cursor() src_db = self.get_provider_db() src_curs = src_db.cursor() src_seqs = self.fetch_seqs(src_curs) dst_seqs = self.fetch_seqs(dst_curs) src_db.commit() self.sync_seq_list(dst_curs, src_seqs, dst_seqs) dst_db.commit() args = self.expand_arg_list(dst_db, 'S', False, seqs) # pick proper create flags if self.options.create_full: create_flags = skytools.T_SEQUENCE elif self.options.create: create_flags = skytools.T_SEQUENCE else: create_flags = 0 # seems ok for seq in args: seq = skytools.fq_name(seq) self.add_seq(src_db, dst_db, seq, create_flags) dst_db.commit() def add_seq(self, src_db: Connection, dst_db: Connection, seq: str, create_flags: int) -> None: src_curs = src_db.cursor() dst_curs = dst_db.cursor() seq_exists = skytools.exists_sequence(dst_curs, seq) if create_flags: if seq_exists: self.log.info('Sequence %s already exist, not creating', seq) else: if not skytools.exists_sequence(src_curs, seq): # sequence not present on provider - nowhere to get the DDL from self.log.warning('Sequence "%s" missing on provider, skipping', seq) return s = skytools.SeqStruct(src_curs, seq) src_db.commit() s.create(dst_curs, create_flags, log=self.log) elif not seq_exists: if self.options.skip_non_existing: self.log.warning('Sequence "%s" missing on local node, skipping', seq) return else: raise skytools.UsageError("Sequence %r missing on local node" % (seq,)) q = "select * from londiste.local_add_seq(%s, %s)" self.exec_cmd(dst_curs, q, [self.set_name, seq]) def fetch_seqs(self, curs: Cursor) -> Dict[str, DictRow]: q = "select seq_name, last_value, local from londiste.get_seq_list(%s)" curs.execute(q, [self.set_name]) res = {} for row in curs.fetchall(): res[row[0]] = row return res def sync_seq_list(self, dst_curs: Cursor, src_seqs: Dict[str, DictRow], dst_seqs: Dict[str, DictRow]) -> None: for seq in src_seqs.keys(): q = "select * from londiste.global_update_seq(%s, %s, %s)" if self.register_only_seqs and seq not in self.register_only_seqs: continue if self.register_skip_seqs and seq in self.register_skip_seqs: continue if seq not in dst_seqs: self.log.info("Sequence %s info missing from subscriber, adding", seq) self.exec_cmd(dst_curs, q, [self.set_name, seq, src_seqs[seq]['last_value']]) tmp = dict(src_seqs[seq].items()) tmp['local'] = False dst_seqs[seq] = cast(DictRow, tmp) for seq in dst_seqs.keys(): q = "select * from londiste.global_remove_seq(%s, %s)" if seq not in src_seqs: self.log.info("Sequence %s gone but exists on subscriber, removing", seq) self.exec_cmd(dst_curs, q, [self.set_name, seq]) del dst_seqs[seq] def cmd_remove_seq(self, *seqs: str) -> None: """Detach seqs(s) from local node.""" q = "select * from londiste.local_remove_seq(%s, %s)" db = self.get_database('db') args = self.expand_arg_list(db, 'S', True, seqs) self.exec_cmd_many(db, q, [self.set_name], args) def cmd_resync(self, *tables: str) -> None: """Reload data from provider node.""" db = self.get_database('db') args = self.expand_arg_list(db, 
'r', True, tables) if not self.options.find_copy_node: self.load_local_info() src_db = self.get_provider_db() src_curs = src_db.cursor() src_tbls = self.fetch_set_tables(src_curs) src_db.commit() problems = 0 for tbl in args: tbl = skytools.fq_name(tbl) if tbl not in src_tbls or not src_tbls[tbl]['local']: self.log.error("Table %s does not exist on provider, need to switch to different provider", tbl) problems += 1 if problems > 0: self.log.error("Problems, cancelling operation") sys.exit(1) if self.options.find_copy_node or self.options.copy_node: q = "select table_name, table_attrs from londiste.get_table_list(%s) where local" cur = db.cursor() cur.execute(q, [self.set_name]) for row in cur.fetchall(): if row['table_name'] not in args: continue attrs = skytools.db_urldecode(row['table_attrs'] or '') if self.options.find_copy_node: attrs['copy_node'] = '?' elif self.options.copy_node: attrs['copy_node'] = self.options.copy_node s_attrs = skytools.db_urlencode(attrs) q = "select * from londiste.local_set_table_attrs(%s, %s, %s)" self.exec_cmd(db, q, [self.set_name, row['table_name'], s_attrs]) q = "select * from londiste.local_set_table_state(%s, %s, null, null)" self.exec_cmd_many(db, q, [self.set_name], args) def cmd_tables(self) -> None: """Show attached tables.""" db = self.get_database('db') def show_attr(a: str) -> str: if a: return repr(skytools.db_urldecode(a)) return '' if self.options.names_only: sql = """select table_name from londiste.get_table_list(%s) where local order by table_name""" curs = db.cursor() curs.execute(sql, [self.set_name]) rows = curs.fetchall() db.commit() if len(rows) == 0: return for row in rows: print(row['table_name']) else: q = """select table_name, merge_state, table_attrs from londiste.get_table_list(%s) where local order by table_name""" self.display_table(db, "Tables on node", q, [self.set_name], fieldfmt={'table_attrs': show_attr}) def cmd_seqs(self) -> None: """Show attached seqs.""" q = "select seq_name, local, last_value from londiste.get_seq_list(%s)" db = self.get_database('db') self.display_table(db, "Sequences on node", q, [self.set_name]) def cmd_missing(self) -> None: """Show missing tables on local node.""" q = "select * from londiste.local_show_missing(%s)" db = self.get_database('db') self.display_table(db, "Missing objects on node", q, [self.set_name]) def cmd_check(self) -> None: """TODO: check if structs match""" pass def cmd_fkeys(self) -> None: """TODO: show removed fkeys.""" pass def cmd_triggers(self) -> None: """TODO: show removed triggers.""" pass def cmd_show_handlers(self, *args: str) -> None: """Show help about handlers.""" show_handlers(args) def cmd_execute(self, *files: str) -> None: db = self.get_database('db') curs = db.cursor() tables = self.fetch_set_tables(curs) seqs = self.fetch_seqs(curs) # generate local maps local_tables = {} local_seqs = {} for tbl in tables.values(): if tbl['local']: local_tables[tbl['table_name']] = tbl['dest_table'] for seq in seqs.values(): if seq['local']: local_seqs[seq['seq_name']] = seq['seq_name'] # set replica role for EXECUTE transaction curs.execute("select londiste.set_session_replication_role('local', true)") for fn in files: fname = os.path.basename(fn) with open(fn, "r", encoding="utf8") as f: sql = f.read() attrs = ExecAttrs(sql=sql) q = "select * from londiste.execute_start(%s, %s, %s, true, %s)" res = self.exec_cmd(db, q, [self.queue_name, fname, sql, attrs.to_urlenc()], commit=False) ret = res[0]['ret_code'] if ret > 200: self.log.warning("Skipping execution of '%s'", fname) 
continue if attrs.need_execute(curs, local_tables, local_seqs): self.log.info("%s: executing sql", fname) xsql = attrs.process_sql(sql, local_tables, local_seqs) for stmt in skytools.parse_statements(xsql): curs.execute(stmt) else: self.log.info("%s: This SQL does not need to run on this node.", fname) q = "select * from londiste.execute_finish(%s, %s)" self.exec_cmd(db, q, [self.queue_name, fname], commit=False) db.commit() def get_provider_db(self) -> Connection: if self.options.copy_node: # use custom node for copy source_node = self.options.copy_node assert self.queue_info m = self.queue_info.get_member(source_node) if not m: raise skytools.UsageError("Cannot find node <%s>" % (source_node,)) if source_node == self.local_node: raise skytools.UsageError("Cannot use itself as provider") self.provider_location = m.location if not self.provider_location: db = self.get_database('db') q = 'select * from pgq_node.get_node_info(%s)' res = self.exec_cmd(db, q, [self.queue_name], quiet=True) self.provider_location = res[0]['provider_location'] return self.get_database('provider_db', connstr=self.provider_location, profile='remote') def expand_arg_list(self, db: Connection, kind: str, existing: bool, args: Sequence[str], needs_tbl: bool=True) -> List[str]: curs = db.cursor() if kind == 'S': q1 = "select seq_name, local from londiste.get_seq_list(%s) where local" elif kind == 'r': q1 = "select table_name, local from londiste.get_table_list(%s) where local" else: raise Exception("bug") q2 = "select obj_name from londiste.local_show_missing(%%s) where obj_kind = '%s'" % kind lst_exists: List[str] = [] map_exists: Dict[str, int] = {} curs.execute(q1, [self.set_name]) for row in curs.fetchall(): lst_exists.append(row[0]) map_exists[row[0]] = 1 lst_missing: List[str] = [] map_missing: Dict[str, int] = {} curs.execute(q2, [self.set_name]) for row in curs.fetchall(): lst_missing.append(row[0]) map_missing[row[0]] = 1 db.commit() if not args and self.options.all: if existing: return lst_exists else: return lst_missing allow_nonexist = not needs_tbl if existing: res = self.solve_globbing(args, lst_exists, map_exists, map_missing, allow_nonexist) else: res = self.solve_globbing(args, lst_missing, map_missing, map_exists, allow_nonexist) if not res: self.log.info("what to do ?") return res def solve_globbing(self, args: Sequence[str], full_list: Sequence[str], full_map: Dict[str, int], reverse_map: Dict[str, int], allow_nonexist: bool) -> List[str]: def glob2regex(s: str) -> str: s = s.replace('.', '[.]').replace('?', '.').replace('*', '.*') return '^%s$' % s res_map = {} res_list = [] err = 0 for a in args: if a.find('*') >= 0 or a.find('?') >= 0: if a.find('.') < 0: a = 'public.' + a rc = re.compile(glob2regex(a)) for x in full_list: if rc.match(x): if x not in res_map: res_map[x] = 1 res_list.append(x) else: a = skytools.fq_name(a) if a in res_map: continue elif a in full_map: res_list.append(a) res_map[a] = 1 elif a in reverse_map: self.log.info("%s already processed", a) elif allow_nonexist: res_list.append(a) res_map[a] = 1 elif self.options.force: self.log.warning("%s not available, but --force is used", a) res_list.append(a) res_map[a] = 1 else: self.log.warning("%s not available", a) err = 1 if err: raise skytools.UsageError("Cannot proceed") return res_list def load_extra_status(self, curs: Cursor, node: NodeInfo) -> None: """Fetch extra info.""" # must be thread-safe (!) 
super().load_extra_status(curs, node) curs.execute("select * from londiste.get_table_list(%s)", [self.queue_name]) n_ok = n_half = n_ign = 0 for tbl in curs.fetchall(): if not tbl['local']: n_ign += 1 elif tbl['merge_state'] == 'ok': n_ok += 1 else: n_half += 1 node.add_info_line('Tables: %d/%d/%d' % (n_ok, n_half, n_ign)) def cmd_wait_sync(self) -> None: self.load_local_info() dst_db = self.get_database('db') self.wait_for_sync(dst_db) def wait_for_sync(self, dst_db: Connection) -> None: self.log.info("Waiting until all tables are in sync") q = "select table_name, merge_state, local"\ " from londiste.get_table_list(%s) where local" dst_curs = dst_db.cursor() partial = {} startup_info = 0 while True: dst_curs.execute(q, [self.queue_name]) rows = dst_curs.fetchall() dst_db.commit() total_count = 0 cur_count = 0 done_list = [] for row in rows: if not row['local']: continue total_count += 1 tbl = row['table_name'] if row['merge_state'] != 'ok': partial[tbl] = 0 cur_count += 1 elif tbl in partial: if partial[tbl] == 0: partial[tbl] = 1 done_list.append(tbl) done_count = total_count - cur_count if not startup_info: self.log.info("%d/%d table(s) to copy", cur_count, total_count) startup_info = 1 for done in done_list: self.log.info("%s: finished (%d/%d)", done, done_count, total_count) if cur_count == 0: break self.sleep(2) self.log.info("All done") def resurrect_dump_event(self, ev: DictRow, stats: Dict[str, Any], batch_info: DictRow) -> None: """Collect per-table stats.""" super().resurrect_dump_event(ev, stats, batch_info) ROLLBACK = 'can rollback' NO_ROLLBACK = 'cannot rollback' if ev['ev_type'] == 'TRUNCATE': if 'truncated_tables' not in stats: stats['truncated_tables'] = [] tlist = stats['truncated_tables'] tbl = ev['ev_extra1'] if tbl not in tlist: tlist.append(tbl) elif ev['ev_type'][:2] in ('I:', 'U:', 'D:', 'I', 'U', 'D'): op = ev['ev_type'][0] tbl = ev['ev_extra1'] bak = ev['ev_extra3'] tblkey = 'table: %s' % tbl if tblkey not in stats: stats[tblkey] = [0, 0, 0, ROLLBACK] tinfo = stats[tblkey] if op == 'I': tinfo[0] += 1 elif op == 'U': tinfo[1] += 1 if not bak: tinfo[3] = NO_ROLLBACK elif op == 'D': tinfo[2] += 1 if not bak and ev['ev_type'] == 'D': tinfo[3] = NO_ROLLBACK londiste-3.12/londiste/cli.py000066400000000000000000000177041447267722200162500ustar00rootroot00000000000000"""Londiste launcher. """ from typing import Sequence, Optional import optparse import sys import skytools import pgq.cascade.admin from londiste.admin import LondisteSetup from londiste.compare import Comparator from londiste.playback import Replicator from londiste.repair import Repairer from londiste.table_copy import CopyTable command_usage = pgq.cascade.admin.command_usage + """ Replication Daemon: worker replay events to subscriber Replication Administration: add-table TBL ... add table to queue remove-table TBL ... remove table from queue change-handler TBL change handler for the table add-seq SEQ ... add sequence to provider remove-seq SEQ ... remove sequence from provider tables show all tables on provider seqs show all sequences on provider missing list tables subscriber has not yet attached to resync TBL ... do full copy again wait-sync wait until all tables are in sync Replication Extra: check compare table structure on both sides fkeys print out fkey drop/create commands compare [TBL ...] compare table contents on both sides repair [TBL ...] repair data on subscriber execute [FILE ...] execute SQL files on set show-handlers [..] 
show info about all or specific handler Internal Commands: copy copy table logic """ cmd_handlers = ( (('create-root', 'create-branch', 'create-leaf', 'members', 'tag-dead', 'tag-alive', 'change-provider', 'rename-node', 'status', 'node-status', 'pause', 'resume', 'node-info', 'drop-node', 'takeover', 'resurrect'), LondisteSetup), (('add-table', 'remove-table', 'change-handler', 'add-seq', 'remove-seq', 'tables', 'seqs', 'missing', 'resync', 'wait-sync', 'wait-root', 'wait-provider', 'check', 'fkeys', 'execute'), LondisteSetup), (('show-handlers',), LondisteSetup), (('worker',), Replicator), (('compare',), Comparator), (('repair',), Repairer), (('copy',), CopyTable), ) class Londiste(skytools.DBScript): script: skytools.DBScript def __init__(self, args: Sequence[str]) -> None: self.full_args = args super().__init__('londiste', args) if len(self.args) < 2: print("need command") sys.exit(1) cmd = self.args[1] script = None for names, cls in cmd_handlers: if cmd in names: script = cls(args) break if not script: print("Unknown command '%s', use --help for help" % cmd) sys.exit(1) self.script = script def start(self) -> None: self.script.start() def print_ini(self) -> None: """Let the Replicator print the default config.""" Replicator(self.full_args) def init_optparse(self, parser: Optional[optparse.OptionParser] = None) -> optparse.OptionParser: p = super().init_optparse(parser) p.set_usage(command_usage.strip()) g = optparse.OptionGroup(p, "options for cascading") g.add_option("--provider", help="init: upstream node temp connect string") g.add_option("--target", metavar="NODE", help="switchover: target node") g.add_option("--merge", metavar="QUEUE", help="create-leaf: combined queue name") g.add_option("--dead", metavar="NODE", action='append', help="cascade: assume node is dead") g.add_option("--dead-root", action='store_true', help="takeover: old node was root") g.add_option("--nocheck", action='store_true', help="create: skip public connect string check") g.add_option("--dead-branch", action='store_true', help="takeover: old node was branch") g.add_option("--sync-watermark", metavar="NODES", help="create-branch: list of node names to sync wm with") p.add_option_group(g) g = optparse.OptionGroup(p, "repair queue position") g.add_option("--rewind", action="store_true", help="change queue position according to destination") g.add_option("--reset", action="store_true", help="reset queue position on destination side") p.add_option_group(g) g = optparse.OptionGroup(p, "options for add") g.add_option("--all", action="store_true", help="add: include all possible tables") g.add_option("--wait-sync", action="store_true", help="add: wait until all tables are in sync") g.add_option("--dest-table", metavar="NAME", help="add: redirect changes to different table") g.add_option("--expect-sync", action="store_true", dest="expect_sync", help="add: no copy needed", default=False) g.add_option("--skip-truncate", action="store_true", dest="skip_truncate", help="add: keep old data", default=False) g.add_option("--create", action="store_true", help="add: create table/seq if not exist, with minimal schema") g.add_option("--create-full", action="store_true", help="add: create table/seq if not exist, with full schema") g.add_option("--trigger-flags", help="add: set trigger flags (BAIUDLQ)") g.add_option("--trigger-arg", action="append", help="add: custom trigger arg (can be specified multiple times)") g.add_option("--no-triggers", action="store_true", help="add: do not put triggers on table (makes sense on leaf)") 
g.add_option("--handler", action="store", help="add: custom handler for table") g.add_option("--handler-arg", action="append", help="add: argument to custom handler") g.add_option("--find-copy-node", dest="find_copy_node", action="store_true", help="add: walk upstream to find node to copy from") g.add_option("--copy-node", metavar="NODE", dest="copy_node", help="add: use NODE as source for initial COPY") g.add_option("--merge-all", action="store_true", help="merge tables from all source queues", default=False) g.add_option("--no-merge", action="store_true", help="don't merge tables from source queues", default=False) g.add_option("--max-parallel-copy", metavar="NUM", type="int", help="max number of parallel copy processes") g.add_option("--skip-non-existing", action="store_true", help="add: skip object that does not exist") p.add_option_group(g) g = optparse.OptionGroup(p, "options for tables") g.add_option("--names-only", action="store_true", help="tables: show only table names (for scripting)") p.add_option_group(g) g = optparse.OptionGroup(p, "other options") g.add_option("--force", action="store_true", help="add: ignore table differences, repair: ignore lag") g.add_option("--apply", action="store_true", help="repair: apply fixes automatically") g.add_option("--count-only", action="store_true", help="compare: just count rows, do not compare data") g.add_option("--sort-bufsize", action="store", help="repair: set coreutils sort bufsize (default: 30%)") g.add_option("--repair-where", action="store", help="repair: use where condition to filter rows for repair") g.add_option("--compact", action="store_true", help="status: use shorter output format.") p.add_option_group(g) return p def main() -> None: script = Londiste(sys.argv[1:]) script.start() if __name__ == '__main__': main() londiste-3.12/londiste/compare.py000066400000000000000000000075271447267722200171310ustar00rootroot00000000000000"""Compares tables in replication set. Currently just does count(1) on both sides. """ from typing import Dict, List, Optional import sys import optparse import skytools from skytools.basetypes import Cursor, Connection from londiste.syncer import Syncer, ATable __all__ = ['Comparator'] class Comparator(Syncer): """Simple checker based on Syncer. When tables are in sync runs simple SQL query on them. 
""" def process_sync(self, t1: ATable, t2: ATable, src_db: Connection, dst_db: Connection) -> int: """Actual comparison.""" src_tbl = t1.dest_table dst_tbl = t2.dest_table src_curs = src_db.cursor() dst_curs = dst_db.cursor() dst_where = t2.plugin.get_copy_condition(src_curs, dst_curs) src_where = dst_where self.log.info('Counting %s', dst_tbl) # get common cols cols = self.calc_cols(src_curs, src_tbl, dst_curs, dst_tbl) # get sane query if self.options.count_only: q = "select count(1) as cnt from only _TABLE_" else: # this way is much faster than the above q = "select count(1) as cnt, sum(hashtext(_COLS_::text)::bigint) as chksum from only _TABLE_" q = self.cf.get('compare_sql', q) q = q.replace("_COLS_", cols) src_q = q.replace('_TABLE_', skytools.quote_fqident(src_tbl)) if src_where: src_q = src_q + " WHERE " + src_where dst_q = q.replace('_TABLE_', skytools.quote_fqident(dst_tbl)) if dst_where: dst_q = dst_q + " WHERE " + dst_where f = "%(cnt)d rows" if not self.options.count_only: f += ", checksum=%(chksum)s" f = self.cf.get('compare_fmt', f) self.log.debug("srcdb: %s", src_q) src_curs.execute(src_q) src_row = src_curs.fetchone() src_str = f % src_row self.log.info("srcdb: %s", src_str) src_db.commit() self.log.debug("dstdb: %s", dst_q) dst_curs.execute(dst_q) dst_row = dst_curs.fetchone() dst_str = f % dst_row self.log.info("dstdb: %s", dst_str) dst_db.commit() if src_str != dst_str: self.log.warning("%s: Results do not match!", dst_tbl) return 1 return 0 def calc_cols(self, src_curs: Cursor, src_tbl: str, dst_curs: Cursor, dst_tbl: str) -> str: cols1 = self.load_cols(src_curs, src_tbl) cols2 = self.load_cols(dst_curs, dst_tbl) qcols = [] for c in self.calc_common(cols1, cols2): qcols.append(skytools.quote_ident(c)) return "(%s)" % ",".join(qcols) def load_cols(self, curs: Cursor, tbl: str) -> List[str]: schema, table = skytools.fq_name_parts(tbl) q = "select column_name from information_schema.columns"\ " where table_schema = %s and table_name = %s" curs.execute(q, [schema, table]) cols = [] for row in curs.fetchall(): cols.append(row[0]) return cols def calc_common(self, cols1: List[str], cols2: List[str]) -> List[str]: common = [] map2: Dict[str, int] = {} for c in cols2: map2[c] = 1 for c in cols1: if c in map2: common.append(c) if len(common) == 0: raise Exception("no common columns found") if len(common) != len(cols1) or len(cols2) != len(cols1): self.log.warning("Ignoring some columns") return common def init_optparse(self, p: Optional[optparse.OptionParser] = None) -> optparse.OptionParser: """Initialize cmdline switches.""" p = super().init_optparse(p) p.add_option("--count-only", action="store_true", help="just count rows, do not compare data") return p if __name__ == '__main__': script = Comparator(sys.argv[1:]) script.start() londiste-3.12/londiste/exec_attrs.py000066400000000000000000000265521447267722200176430ustar00rootroot00000000000000"""Custom parser for EXECUTE attributes. The values are parsed from SQL file given to EXECUTE. Format rules: * Only lines starting with meta-comment prefix will be parsed: --*-- * Empty or regular SQL comment lines are ignored. * Parsing stops on first SQL statement. * Meta-line format: "--*-- Key: value1, value2" * If line ends with ',' then next line is taken as continuation. 
Supported keys: * Local-Table: * Local-Sequence: * Local-Destination: * Need-Table * Need-Sequence * Need-Function * Need-Schema * Need-View Sample file:: --*-- Local-Sequence: myseq --*-- --*-- Local-Table: table1, --*-- table2, table3 --*-- Tests: >>> a = ExecAttrs() >>> a.add_value("Local-Table", "mytable") >>> a.add_value("Local-Sequence", "seq1") >>> a.add_value("Local-Sequence", "seq2") >>> a.to_urlenc() in ( ... 'local-table=mytable&local-sequence=seq1%2cseq2', ... 'local-sequence=seq1%2cseq2&local-table=mytable') True >>> a.add_value("Local-Destination", "mytable-longname-more1") >>> a.add_value("Local-Destination", "mytable-longname-more2") >>> a.add_value("Local-Destination", "mytable-longname-more3") >>> a.add_value("Local-Destination", "mytable-longname-more4") >>> a.add_value("Local-Destination", "mytable-longname-more5") >>> a.add_value("Local-Destination", "mytable-longname-more6") >>> a.add_value("Local-Destination", "mytable-longname-more7") >>> print(a.to_sql()) --*-- Local-Table: mytable --*-- Local-Sequence: seq1, seq2 --*-- Local-Destination: mytable-longname-more1, mytable-longname-more2, --*-- mytable-longname-more3, mytable-longname-more4, mytable-longname-more5, --*-- mytable-longname-more6, mytable-longname-more7 >>> a = ExecAttrs(sql = ''' ... ... -- ... ... --*-- Local-Table: foo , ... -- ... --*-- bar , ... --*-- ... --*-- zoo ... --*-- ... --*-- Local-Sequence: goo ... --*-- ... -- ... ... create fooza; ... ''') >>> print(a.to_sql()) --*-- Local-Table: foo, bar, zoo --*-- Local-Sequence: goo >>> seqs = {'public.goo': 'public.goo'} >>> tables = {} >>> tables['public.foo'] = 'public.foo' >>> tables['public.bar'] = 'other.Bar' >>> tables['public.zoo'] = 'Other.Foo' >>> a.need_execute(None, tables, seqs) True >>> a.need_execute(None, [], []) False >>> sql = '''alter table @foo@; ... alter table @bar@; ... 
alter table @zoo@;''' >>> print(a.process_sql(sql, tables, seqs)) alter table public.foo; alter table other."Bar"; alter table "Other"."Foo"; """ from typing import Dict, List, Optional import skytools from skytools.basetypes import Cursor META_PREFIX = "--*--" class Matcher: nice_name: str = '' def match(self, objname: str, curs: Cursor, tables: Dict[str, str], seqs: Dict[str, str]) -> bool: return False def get_key(self) -> str: return self.nice_name.lower() def local_rename(self) -> bool: return False class LocalTable(Matcher): nice_name: str = "Local-Table" def match(self, objname: str, curs: Cursor, tables: Dict[str, str], seqs: Dict[str, str]) -> bool: return objname in tables def local_rename(self) -> bool: return True class LocalSequence(Matcher): nice_name = "Local-Sequence" def match(self, objname: str, curs: Cursor, tables: Dict[str, str], seqs: Dict[str, str]) -> bool: return objname in seqs def local_rename(self) -> bool: return True class LocalDestination(Matcher): nice_name = "Local-Destination" def match(self, objname: str, curs: Cursor, tables: Dict[str, str], seqs: Dict[str, str]) -> bool: if objname not in tables: return False dest_name = tables[objname] return skytools.exists_table(curs, dest_name) def local_rename(self) -> bool: return True class NeedTable(Matcher): nice_name = "Need-Table" def match(self, objname: str, curs: Cursor, tables: Dict[str, str], seqs: Dict[str, str]) -> bool: return skytools.exists_table(curs, objname) class NeedSequence(Matcher): nice_name = "Need-Sequence" def match(self, objname: str, curs: Cursor, tables: Dict[str, str], seqs: Dict[str, str]) -> bool: return skytools.exists_sequence(curs, objname) class NeedSchema(Matcher): nice_name = "Need-Schema" def match(self, objname: str, curs: Cursor, tables: Dict[str, str], seqs: Dict[str, str]) -> bool: return skytools.exists_schema(curs, objname) class NeedFunction(Matcher): nice_name = "Need-Function" def match(self, objname: str, curs: Cursor, tables: Dict[str, str], seqs: Dict[str, str]) -> bool: nargs = 0 pos1 = objname.find('(') if pos1 > 0: pos2 = objname.find(')') if pos2 > 0: s = objname[pos1 + 1: pos2] objname = objname[:pos1] nargs = int(s) return skytools.exists_function(curs, objname, nargs) class NeedView(Matcher): nice_name = "Need-View" def match(self, objname: str, curs: Cursor, tables: Dict[str, str], seqs: Dict[str, str]) -> bool: return skytools.exists_view(curs, objname) META_SPLITLINE = 70 # list of matches, in order they need to be probed META_MATCHERS = [ LocalTable(), LocalSequence(), LocalDestination(), NeedTable(), NeedSequence(), NeedFunction(), NeedSchema(), NeedView() ] # key to nice key META_KEYS = {m.nice_name.lower(): m for m in META_MATCHERS} class ExecAttrsException(skytools.UsageError): """Some parsing problem.""" class ExecAttrs: """Container and parser for EXECUTE attributes.""" attrs: Dict[str, List[str]] def __init__(self, sql: Optional[str] = None, urlenc: Optional[str] = None) -> None: """Create container and parse either sql or urlenc string.""" self.attrs = {} if sql and urlenc: raise Exception("Both sql and urlenc set.") if urlenc: self.parse_urlenc(urlenc) elif sql: self.parse_sql(sql) def add_value(self, k: str, v: str) -> None: """Add single value to key.""" xk = k.lower().strip() if xk not in META_KEYS: raise ExecAttrsException("Invalid key: %s" % k) if xk not in self.attrs: self.attrs[xk] = [] xv = v.strip() self.attrs[xk].append(xv) def to_urlenc(self) -> str: """Convert container to urlencoded string.""" sdict = {} for k, v in 
self.attrs.items(): sdict[k] = ','.join(v) return skytools.db_urlencode(sdict) def parse_urlenc(self, ustr: str) -> None: """Parse urlencoded string adding values to current container.""" sdict = skytools.db_urldecode(ustr) for k, v in sdict.items(): if v: for v1 in v.split(','): self.add_value(k, v1) def to_sql(self) -> str: """Convert container to SQL meta-comments.""" lines = [] for m in META_MATCHERS: k = m.get_key() if k not in self.attrs: continue vlist = self.attrs[k] ln = "%s %s: " % (META_PREFIX, m.nice_name) start = 0 for nr, v in enumerate(vlist): if nr > start: ln = ln + ", " + v else: ln = ln + v if len(ln) >= META_SPLITLINE and nr < len(vlist) - 1: ln += ',' lines.append(ln) ln = META_PREFIX + " " start = nr + 1 lines.append(ln) return '\n'.join(lines) def parse_sql(self, sql: str) -> None: """Parse SQL meta-comments.""" cur_key: Optional[str] = None cur_continued = False for ln in sql.splitlines(): # skip empty lines ln = ln.strip() if not ln: continue # stop at non-comment if ln[:2] != '--': break # parse only meta-comments if ln[:len(META_PREFIX)] != META_PREFIX: continue # cut prefix, skip empty comments ln = ln[len(META_PREFIX):].strip() if not ln: continue # continuation of previous key if cur_continued: # collect values for v in ln.split(','): v = v.strip() if v and cur_key: self.add_value(cur_key, v) # does this key continue? if ln[-1] != ',': cur_key = None cur_continued = False # go to next line continue # parse key pos = ln.find(':') if pos < 0: continue k = ln[:pos].strip() # collect values for v in ln[pos + 1:].split(','): v = v.strip() if not v: continue self.add_value(k, v) # check if current key values will continue if ln[-1] == ',': cur_key = k cur_continued = True else: cur_key = None cur_continued = False def need_execute(self, curs: Cursor, local_tables: Dict[str, str], local_seqs: Dict[str, str]) -> bool: # if no attrs, always execute if not self.attrs: return True matched = 0 missed = 0 good_list = [] miss_list = [] for m in META_MATCHERS: k = m.get_key() if k not in self.attrs: continue for v in self.attrs[k]: fqname = skytools.fq_name(v) if m.match(fqname, curs, local_tables, local_seqs): matched += 1 good_list.append(v) else: missed += 1 miss_list.append(v) # should be drop out early? if matched > 0 and missed == 0: return True elif missed > 0 and matched == 0: return False elif missed == 0 and matched == 0: # should not happen, but lets restore old behaviour? return True else: raise Exception("SQL only partially matches local setup: matches=%r misses=%r" % (good_list, miss_list)) def get_attr(self, k: str) -> List[str]: k = k.lower().strip() if k not in META_KEYS: raise Exception("Bug: invalid key requested: " + k) if k not in self.attrs: return [] return self.attrs[k] def process_sql(self, sql: str, local_tables: Dict[str, str], local_seqs: Dict[str, str]) -> str: """Replace replacement tags in sql with actual local names.""" for k, vlist in self.attrs.items(): m = META_KEYS[k] if not m.local_rename(): continue for v in vlist: repname = '@%s@' % v fqname = skytools.fq_name(v) if fqname in local_tables: localname = local_tables[fqname] elif fqname in local_seqs: localname = local_seqs[fqname] else: # should not happen raise Exception("bug: lost table: " + v) qdest = skytools.quote_fqident(localname) sql = sql.replace(repname, qdest) return sql londiste-3.12/londiste/handler.py000066400000000000000000000330341447267722200171100ustar00rootroot00000000000000 """Table handler. Per-table decision how to create trigger, copy data and apply events. 
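Handlers are selected by their handler_name.  Additional handler modules are
imported from the handler_modules config setting and must expose a
__londiste_handlers__ list.  A minimal custom handler module sketch (module,
class and handler names here are made up for illustration):

    # myhandlers.py -- hypothetical module listed in handler_modules
    from londiste.handler import TableHandler

    class MyHandler(TableHandler):
        # behaves like the default handler, only registered under another name
        handler_name = 'my_handler'

    __londiste_handlers__ = [MyHandler]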
""" from typing import List, Dict, Any, Sequence, Tuple, Optional, Union, Callable, Type import json import logging import sys import skytools from skytools.basetypes import Cursor, Connection from skytools import dbdict from pgq import Event from pgq.baseconsumer import BatchInfo import londiste.util ApplyFunc = Callable[[str, Cursor], None] _ = """ -- redirect & create table partition by batch_time partition by date field -- sql handling: cube1 - I/U/D -> partition, insert cube2 - I/U/D -> partition, del/insert field remap name remap bublin filter - replay: filter events - copy: additional where - add: add trigger args multimaster - replay: conflict handling, add fncall to sql queue? - add: add 'backup' arg to trigger plain londiste: - replay: add to sql queue """ __all__ = ['RowCache', 'BaseHandler', 'build_handler', 'create_handler_string', 'BatchInfo', 'Event', 'Cursor', 'Connection'] class RowCache: table_name: str keys: Dict[str, int] rows: List[Tuple[Any, ...]] def __init__(self, table_name: str) -> None: self.table_name = table_name self.keys = {} self.rows = [] def add_row(self, d: Dict[str, Any]) -> None: row = [None] * len(self.keys) for k, v in d.items(): try: row[self.keys[k]] = v except KeyError: i = len(row) self.keys[k] = i row.append(v) self.rows.append(tuple(row)) def get_fields(self) -> Sequence[str]: row: List[str] = [""] * len(self.keys) for k, i in self.keys.items(): row[i] = k return tuple(row) def apply_rows(self, curs: Cursor) -> None: fields = self.get_fields() skytools.magic_insert(curs, self.table_name, self.rows, fields) class BaseHandler: """Defines base API, does nothing. """ handler_name = 'nop' log = logging.getLogger('basehandler') table_name: str dest_table: str fq_table_name: str fq_dest_table: str args: Dict[str, str] conf: skytools.dbdict _doc_: str = '' def __init__(self, table_name: str, args: Optional[Dict[str,str]], dest_table: Optional[str]) -> None: self.table_name = table_name self.dest_table = dest_table or table_name self.fq_table_name = skytools.quote_fqident(self.table_name) self.fq_dest_table = skytools.quote_fqident(self.dest_table) self.args = args if args else {} self._check_args(self.args) self.conf = self.get_config() def _parse_args_from_doc(self) -> List[Tuple[str, str, str]]: doc = self.__doc__ or "" params_descr: List[Tuple[str, str, str]] = [] params_found = False for line in doc.splitlines(): ln = line.strip() if params_found: if ln == "": break descr = ln.split(None, 1) name, sep, ___rest = descr[0].partition('=') if sep: expr = descr[0].rstrip(":") text = descr[1].lstrip(":- \t") else: name, expr, text = params_descr.pop() text += "\n" + ln params_descr.append((name, expr, text)) elif ln == "Parameters:": params_found = True return params_descr def _check_args(self, args: Dict[str, str]) -> None: self.valid_arg_names = [] passed_arg_names = args.keys() if args else [] args_from_doc = self._parse_args_from_doc() if args_from_doc: self.valid_arg_names = [arg[0] for arg in args_from_doc] invalid = set(passed_arg_names) - set(self.valid_arg_names) if invalid: raise ValueError("Invalid handler argument: %s" % list(invalid)) def get_arg(self, name: str, value_list: Union[List[str], List[int]], default: Optional[Union[str, int]]=None) -> Union[str, int]: """ Return arg value or default; also check if value allowed. 
""" default = default or value_list[0] val = type(default)(self.args.get(name, default)) if val not in value_list: raise Exception('Bad argument %s value %r' % (name, val)) return val def get_config(self) -> dbdict: """ Process args dict (into handler config). """ conf = skytools.dbdict() return conf def add(self, trigger_arg_list: List[str]) -> None: """Called when table is added. Can modify trigger args. """ pass def reset(self) -> None: """Called before starting to process a batch. Should clean any pending data. """ pass def prepare_batch(self, batch_info: Optional[BatchInfo], dst_curs: Cursor) -> None: """Called on first event for this table in current batch.""" pass def process_event(self, ev: Event, sql_queue_func: ApplyFunc, dst_curs: Cursor) -> None: """Process a event. Event should be added to sql_queue or executed directly. """ pass def finish_batch(self, batch_info: BatchInfo, dst_curs: Cursor) -> None: """Called when batch finishes.""" pass def get_copy_condition(self, src_curs: Cursor, dst_curs: Cursor) -> str: """ Use if you want to filter data """ return '' def real_copy(self, src_tablename: str, src_curs: Cursor, dst_curs: Cursor, column_list: List[str]) -> Tuple[int, int]: """do actual table copy and return tuple with number of bytes and rows copied """ condition = self.get_copy_condition(src_curs, dst_curs) return skytools.full_copy(src_tablename, src_curs, dst_curs, column_list, condition, dst_tablename=self.dest_table) def real_copy_threaded( self, src_real_table: str, src_curs: Cursor, dst_db_connstr: str, column_list: Sequence[str], config_file: str, config_section: str, parallel: int = 1, ) -> Tuple[int, int]: with skytools.connect_database(dst_db_connstr) as dst_db: with dst_db.cursor() as dst_curs: condition = self.get_copy_condition(src_curs, dst_curs) dst_db.commit() return londiste.util.full_copy_parallel( src_real_table, src_curs, dst_db_connstr=dst_db_connstr, dst_tablename=self.dest_table, column_list=column_list, condition=condition, parallel=parallel, ) def needs_table(self) -> bool: """Does the handler need the table to exist on destination.""" return True @classmethod def load_conf(cls, cf: skytools.Config) -> None: """Load conf.""" pass def get_copy_event(self, ev: Event, queue_name: str) -> Optional[Event]: """Get event copy for destination queue.""" return ev class TableHandler(BaseHandler): """Default Londiste handler, inserts events into tables with plain SQL. Parameters: encoding=ENC - Validate and fix incoming data from encoding. Only 'utf8' is supported at the moment. ignore_truncate=BOOL - Ignore truncate event. Default: 0; Values: 0,1. 
""" handler_name = 'londiste' sql_command = { 'I': "insert into %s %s;", 'U': "update only %s set %s;", 'D': "delete from only %s where %s;", } allow_sql_event = 1 def __init__(self, table_name: str, args: Dict[str, str], dest_table: Optional[str]) -> None: super().__init__(table_name, args, dest_table) enc = args.get('encoding') if enc: raise ValueError("encoding validator not supported") def get_config(self) -> dbdict: conf = super().get_config() conf.ignore_truncate = self.get_arg('ignore_truncate', [0, 1], 0) return conf def process_event(self, ev: Event, sql_queue_func: ApplyFunc, dst_curs: Cursor) -> None: row = self.parse_row_data(ev) if len(ev.type) == 1: # sql event fqname = self.fq_dest_table fmt = self.sql_command[ev.type] sql = fmt % (fqname, row) else: if ev.type[0] == '{': jtype = json.loads(ev.type) pklist = jtype['pkey'] op = jtype['op'][0] else: # urlenc event pklist = ev.type[2:].split(',') op = ev.type[0] tbl = self.dest_table if op == 'I': sql = skytools.mk_insert_sql(row, tbl, pklist) elif op == 'U': sql = skytools.mk_update_sql(row, tbl, pklist) elif op == 'D': sql = skytools.mk_delete_sql(row, tbl, pklist) sql_queue_func(sql, dst_curs) def parse_row_data(self, ev: Event) -> Dict[str, Any]: """Extract row data from event, with optional encoding fixes. Returns either string (sql event) or dict (urlenc event). """ if len(ev.type) == 1: if not self.allow_sql_event: raise Exception('SQL events not supported by this handler') return ev.data elif ev.data[0] == '{': row = json.loads(ev.data) return row else: row = skytools.db_urldecode(ev.data) return row def real_copy(self, src_tablename: str, src_curs: Cursor, dst_curs: Cursor, column_list: List[str]) -> Tuple[int, int]: """do actual table copy and return tuple with number of bytes and rows copied """ condition = self.get_copy_condition(src_curs, dst_curs) return skytools.full_copy(src_tablename, src_curs, dst_curs, column_list, condition, dst_tablename=self.dest_table) def real_copy_threaded( self, src_real_table: str, src_curs: Cursor, dst_db_connstr: str, column_list: Sequence[str], config_file: str, config_section: str, parallel: int = 1, ) -> Tuple[int, int]: with skytools.connect_database(dst_db_connstr) as dst_db: with dst_db.cursor() as dst_curs: condition = self.get_copy_condition(src_curs, dst_curs) dst_db.commit() return londiste.util.full_copy_parallel( src_real_table, src_curs, dst_db_connstr=dst_db_connstr, column_list=column_list, condition=condition, dst_tablename=self.dest_table, parallel=parallel, ) # # handler management # _handler_map: Dict[str, Type[BaseHandler]] = { 'londiste': TableHandler, } def register_handler_module(modname: str, cf: skytools.Config) -> None: """Import and module and register handlers.""" try: __import__(modname) except ImportError: print("Failed to load handler module: %s" % (modname,)) return m = sys.modules[modname] for h in getattr(m, "__londiste_handlers__"): h.load_conf(cf) _handler_map[h.handler_name] = h def _parse_arglist(arglist: Sequence[str]) -> Dict[str, str]: args = {} for arg in arglist or []: key, _, val = arg.partition('=') key = key.strip() if key in args: raise Exception('multiple handler arguments: %s' % key) args[key] = val.strip() return args def create_handler_string(name: str, arglist: Sequence[str]) -> str: handler = name if name.find('(') >= 0: raise Exception('invalid handler name: %s' % name) if arglist: args = _parse_arglist(arglist) astr = skytools.db_urlencode(args) handler = '%s(%s)' % (handler, astr) return handler def _parse_handler(hstr: str) -> 
Tuple[str, Dict[str, str]]: """Parse result of create_handler_string().""" args = {} name = hstr pos = hstr.find('(') if pos > 0: name = hstr[: pos] if hstr[-1] != ')': raise Exception('invalid handler format: %s' % hstr) astr = hstr[pos + 1: -1] if astr: astr = astr.replace(',', '&') args = { k: v for k, v in skytools.db_urldecode(astr).items() if v is not None } return (name, args) def build_handler(tblname: str, hstr: str, dest_table: Optional[str] = None) -> BaseHandler: """Parse and initialize handler. hstr is result of create_handler_string().""" hname, args = _parse_handler(hstr) # when no handler specified, use londiste hname = hname or 'londiste' klass = _handler_map[hname] if not dest_table: dest_table = tblname return klass(tblname, args, dest_table) #def load_handler_modules(cf: skytools.Config) -> None: # """Load and register modules from config.""" # from londiste.handlers import DEFAULT_HANDLERS # for m in DEFAULT_HANDLERS: # register_handler_module(m, cf) # # for m in cf.getlist('handler_modules', []): # register_handler_module(m, cf) def show(mods: Sequence[str]) -> None: if not mods: for n, kls in _handler_map.items(): desc = kls.__doc__ or '' if desc: desc = desc.strip().split('\n', 1)[0] print("%s - %s" % (n, desc)) else: for n in mods: kls = _handler_map[n] desc = kls.__doc__ or '' if desc: desc = desc.strip() print("%s - %s" % (n, desc)) londiste-3.12/londiste/handlers/000077500000000000000000000000001447267722200167165ustar00rootroot00000000000000londiste-3.12/londiste/handlers/__init__.py000066400000000000000000000034271447267722200210350ustar00rootroot00000000000000"""Handlers module """ import functools import sys from typing import List, Callable, Dict, Type, Optional import skytools from londiste.handler import BaseHandler, register_handler_module DEFAULT_HANDLERS: List[str] = [] ArgHandler = Callable[[Dict[str, str]], Dict[str,str]] ArgWrapper = Callable[[ArgHandler], ArgHandler] def handler_args(name: str, cls: Type[BaseHandler]) -> ArgWrapper: """Handler arguments initialization decorator Define successor for handler class cls with func as argument generator """ def wrapper(func: ArgHandler) -> ArgHandler: # pylint: disable=unnecessary-dunder-call def _init_override(self: BaseHandler, table_name: str, args: Dict[str, str], dest_table: Optional[str]) -> None: cls.__init__(self, table_name, func(args.copy()), dest_table) dct = {'__init__': _init_override, 'handler_name': name} module = sys.modules[cls.__module__] newname = '%s_%s' % (cls.__name__, name.replace('.', '_')) newcls = type(newname, (cls,), dct) setattr(module, newname, newcls) getattr(module, "__londiste_handlers__").append(newcls) getattr(module, "__all__").append(newname) return func return wrapper def update(*p: Dict[str, str]) -> Dict[str, str]: """ Update dicts given in params with its predecessor param dict in reverse order """ return functools.reduce(lambda x, y: x.update(y) or x, (p[i] for i in range(len(p) - 1, -1, -1)), {}) def load_handler_modules(cf: skytools.Config) -> None: """Load and register modules from config.""" for m in DEFAULT_HANDLERS: register_handler_module(m, cf) for m in cf.getlist('handler_modules', []): register_handler_module(m, cf) londiste-3.12/londiste/handlers/applyfn.py000066400000000000000000000031451447267722200207440ustar00rootroot00000000000000"""Send all events to a DB function. 
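Rows are not written to a table directly; the configured function is called
once per event.  Sketch of the statement generated for each event (the
function name is whatever func_name is set to; the argument order follows
the handler code below):

    select my_func(func_conf, tick_id,
                   ev_id, ev_time, ev_txid, ev_retry, ev_type,
                   ev_data, ev_extra1, ev_extra2, ev_extra3, ev_extra4);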
""" from typing import Optional, List, Type import skytools from londiste.handler import BaseHandler, BatchInfo, Cursor, Event, ApplyFunc __all__ = ['ApplyFuncHandler'] class ApplyFuncHandler(BaseHandler): """Call DB function to apply event. Parameters: func_name=NAME - database function name func_conf=CONF - database function conf """ handler_name: str = 'applyfn' cur_tick: Optional[int] = None def prepare_batch(self, batch_info: Optional[BatchInfo], dst_curs: Cursor) -> None: if batch_info is not None: self.cur_tick = batch_info['tick_id'] def process_event(self, ev: Event, sql_queue_func: ApplyFunc, qfunc_arg: Cursor) -> None: """Ignore events for this table""" fn = self.args.get('func_name') or 'undefined' fnconf = self.args.get('func_conf', '') args = [fnconf, self.cur_tick, ev.ev_id, ev.ev_time, ev.ev_txid, ev.ev_retry, ev.ev_type, ev.ev_data, ev.ev_extra1, ev.ev_extra2, ev.ev_extra3, ev.ev_extra4] qfn = skytools.quote_fqident(fn) qargs = [skytools.quote_literal(a) for a in args] sql = "select %s(%s);" % (qfn, ', '.join(qargs)) self.log.debug('applyfn.sql: %s', sql) sql_queue_func(sql, qfunc_arg) #------------------------------------------------------------------------------ # register handler class #------------------------------------------------------------------------------ __londiste_handlers__: List[Type[BaseHandler]] = [ApplyFuncHandler] londiste-3.12/londiste/handlers/bulk.py000066400000000000000000000335651447267722200202410ustar00rootroot00000000000000"""Bulk loading into OLAP database. To use set in londiste.ini: handler_modules = londiste.handlers.bulk then add table with: londiste add-table xx --handler="bulk" or: londiste add-table xx --handler="bulk(method=X)" Methods: 0 (correct) - inserts as COPY into table, update as COPY into temp table and single UPDATE from there delete as COPY into temp table and single DELETE from there 1 (delete) - as 'correct', but do update as DELETE + COPY 2 (merged) - as 'delete', but merge insert rows with update rows Default is 0. """ from typing import List, Optional, Dict, Any, Tuple import skytools from skytools import quote_fqident, quote_ident from pgq import Event from londiste.handler import BaseHandler, BatchInfo, ApplyFunc, Cursor __all__ = ['BulkLoader'] # BulkLoader load method METH_CORRECT = 0 METH_DELETE = 1 METH_MERGED = 2 DEFAULT_METHOD = METH_CORRECT # BulkLoader hacks AVOID_BIZGRES_BUG = 0 USE_LONGLIVED_TEMP_TABLES = True USE_REAL_TABLE = False class BulkEvent: """Helper class for BulkLoader to store relevant data.""" __slots__ = ('op', 'data', 'pk_data') def __init__(self, op: str, data: Dict[str, Any], pk_data: Tuple[str, ...]) -> None: self.op = op self.data = data self.pk_data = pk_data class BulkLoader(BaseHandler): """Bulk loading into OLAP database. Instead of statement-per-event, load all data with one big COPY, UPDATE or DELETE statement. 
Parameters: method=TYPE - method to use for copying [0..2] (default: 0) Methods: 0 (correct) - inserts as COPY into table, update as COPY into temp table and single UPDATE from there delete as COPY into temp table and single DELETE from there 1 (delete) - as 'correct', but do update as DELETE + COPY 2 (merged) - as 'delete', but merge insert rows with update rows """ handler_name = 'bulk' fake_seq = 0 pkey_list: Optional[List[str]] dist_fields: Optional[List[str]] col_list: Optional[List[str]] pkey_ev_map: Dict[Tuple[str, ...], List[BulkEvent]] method: int def __init__(self, table_name: str, args: Dict[str, str], dest_table: str) -> None: """Init per-batch table data cache.""" super().__init__(table_name, args, dest_table) self.pkey_list = None self.dist_fields = None self.col_list = None self.pkey_ev_map = {} self.method = int(args.get('method', DEFAULT_METHOD)) if self.method not in (0, 1, 2): raise Exception('unknown method: %s' % self.method) self.log.debug('bulk_init(%r), method=%d', args, self.method) def reset(self) -> None: self.pkey_ev_map = {} super().reset() def finish_batch(self, batch_info: BatchInfo, dst_curs: Cursor) -> None: self.bulk_flush(dst_curs) def process_event(self, ev: Event, sql_queue_func: ApplyFunc, arg: Cursor) -> None: if len(ev.ev_type) < 2 or ev.ev_type[1] != ':': raise Exception('Unsupported event type: %s/extra1=%s/data=%s' % ( ev.ev_type, ev.ev_extra1, ev.ev_data)) op = ev.ev_type[0] if op not in 'IUD': raise Exception('Unknown event type: ' + ev.ev_type) # pkey_list = ev.ev_type[2:].split(',') data = skytools.db_urldecode(ev.ev_data) # get pkey value if self.pkey_list is None: #self.pkey_list = pkey_list self.pkey_list = ev.ev_type[2:].split(',') pk_data: Tuple[str, ...] if len(self.pkey_list) > 0: pk_data = tuple(data[k] or '' for k in self.pkey_list) elif op == 'I': # fake pkey, just to get them spread out pk_data = (str(self.fake_seq),) self.fake_seq += 1 else: raise Exception('non-pk tables not supported: %s' % self.table_name) # get full column list, detect added columns data_keys = list(data.keys()) if not self.col_list: self.col_list = data_keys elif self.col_list != data_keys: # ^ supposedly python guarantees same order in keys() self.col_list = data_keys # keep all versions of row data bev = BulkEvent(op, data, pk_data) if bev.pk_data in self.pkey_ev_map: self.pkey_ev_map[bev.pk_data].append(bev) else: self.pkey_ev_map[bev.pk_data] = [bev] def prepare_data(self) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]: """Got all data, prepare for insertion.""" del_list = [] ins_list = [] upd_list = [] for ev_list in self.pkey_ev_map.values(): # rewrite list of I/U/D events to # optional DELETE and optional INSERT/COPY command exists_before = -1 exists_after = 1 for ev in ev_list: if ev.op == "I": if exists_before < 0: exists_before = 0 exists_after = 1 elif ev.op == "U": if exists_before < 0: exists_before = 1 #exists_after = 1 # this shouldnt be needed elif ev.op == "D": if exists_before < 0: exists_before = 1 exists_after = 0 else: raise Exception('unknown event type: %s' % ev.op) # skip short-lived rows if exists_before == 0 and exists_after == 0: continue # take last event ev = ev_list[-1] # generate needed commands if exists_before and exists_after: upd_list.append(ev.data) elif exists_before: del_list.append(ev.data) elif exists_after: ins_list.append(ev.data) return ins_list, upd_list, del_list def bulk_flush(self, curs: Cursor) -> None: ins_list, upd_list, del_list = self.prepare_data() # reorder cols, put pks first 
assert self.pkey_list assert self.col_list col_list = self.pkey_list[:] for k in self.col_list: if k not in self.pkey_list: col_list.append(k) real_update_count = len(upd_list) self.log.debug("bulk_flush: %s (I/U/D = %d/%d/%d)", self.table_name, len(ins_list), len(upd_list), len(del_list)) # hack to unbroke stuff if self.method == METH_MERGED: upd_list += ins_list ins_list = [] # fetch distribution fields if self.dist_fields is None: self.dist_fields = self.find_dist_fields(curs) key_fields = self.pkey_list[:] for fld in self.dist_fields: if fld not in key_fields: key_fields.append(fld) self.log.debug("PKey fields: %s Dist fields: %s", ",".join(self.pkey_list), ",".join(self.dist_fields)) # create temp table temp, qtemp = self.create_temp_table(curs) tbl = self.dest_table qtbl = self.fq_dest_table # where expr must have pkey and dist fields klist = [] for pk in key_fields: exp = "%s.%s = %s.%s" % (qtbl, quote_ident(pk), qtemp, quote_ident(pk)) klist.append(exp) whe_expr = " and ".join(klist) # create del sql del_sql = "delete from only %s using %s where %s" % (qtbl, qtemp, whe_expr) # create update sql slist = [] for col in col_list: if col not in key_fields: exp = "%s = %s.%s" % (quote_ident(col), qtemp, quote_ident(col)) slist.append(exp) upd_sql = "update only %s set %s from %s where %s" % (qtbl, ", ".join(slist), qtemp, whe_expr) # avoid updates on pk-only table if not slist: upd_list = [] # insert sql colstr = ",".join([quote_ident(c) for c in col_list]) ins_sql = "insert into %s (%s) select %s from %s" % ( qtbl, colstr, colstr, qtemp) temp_used = False # process deleted rows if len(del_list) > 0: self.log.debug("bulk: Deleting %d rows from %s", len(del_list), tbl) # delete old rows q = "truncate %s" % qtemp self.log.debug('bulk: %s', q) curs.execute(q) # copy rows self.log.debug("bulk: COPY %d rows into %s", len(del_list), temp) skytools.magic_insert(curs, qtemp, del_list, col_list, quoted_table=True) # delete rows self.log.debug('bulk: %s', del_sql) curs.execute(del_sql) self.log.debug("bulk: %s - %d", curs.statusmessage, curs.rowcount) if len(del_list) != curs.rowcount: self.log.warning("Delete mismatch: expected=%s deleted=%d", len(del_list), curs.rowcount) temp_used = True # process updated rows if len(upd_list) > 0: self.log.debug("bulk: Updating %d rows in %s", len(upd_list), tbl) # delete old rows q = "truncate %s" % qtemp self.log.debug('bulk: %s', q) curs.execute(q) # copy rows self.log.debug("bulk: COPY %d rows into %s", len(upd_list), temp) skytools.magic_insert(curs, qtemp, upd_list, col_list, quoted_table=True) temp_used = True if self.method == METH_CORRECT: # update main table self.log.debug('bulk: %s', upd_sql) curs.execute(upd_sql) self.log.debug("bulk: %s - %d", curs.statusmessage, curs.rowcount) # check count if len(upd_list) != curs.rowcount: self.log.warning("Update mismatch: expected=%s updated=%d", len(upd_list), curs.rowcount) else: # delete from main table self.log.debug('bulk: %s', del_sql) curs.execute(del_sql) self.log.debug('bulk: %s', curs.statusmessage) # check count if real_update_count != curs.rowcount: self.log.warning("bulk: Update mismatch: expected=%s deleted=%d", real_update_count, curs.rowcount) # insert into main table if AVOID_BIZGRES_BUG: # copy again, into main table self.log.debug("bulk: COPY %d rows into %s", len(upd_list), tbl) skytools.magic_insert(curs, qtbl, upd_list, col_list, quoted_table=True) else: # better way, but does not work due bizgres bug self.log.debug('bulk: %s', ins_sql) curs.execute(ins_sql) self.log.debug('bulk: %s', 
curs.statusmessage) # process new rows if len(ins_list) > 0: self.log.debug("bulk: Inserting %d rows into %s", len(ins_list), tbl) self.log.debug("bulk: COPY %d rows into %s", len(ins_list), tbl) skytools.magic_insert(curs, qtbl, ins_list, col_list, quoted_table=True) # delete remaining rows if temp_used: if USE_LONGLIVED_TEMP_TABLES or USE_REAL_TABLE: q = "truncate %s" % qtemp else: # fscking problems with long-lived temp tables q = "drop table %s" % qtemp self.log.debug('bulk: %s', q) curs.execute(q) self.reset() def create_temp_table(self, curs: Cursor) -> Tuple[str, str]: if USE_REAL_TABLE: tempname = self.dest_table + "_loadertmpx" else: # create temp table for loading tempname = self.dest_table.replace('.', '_') + "_loadertmp" # check if exists if USE_REAL_TABLE: if skytools.exists_table(curs, tempname): self.log.debug("bulk: Using existing real table %s", tempname) return tempname, quote_fqident(tempname) # create non-temp table q = "create table %s (like %s)" % ( quote_fqident(tempname), quote_fqident(self.dest_table)) self.log.debug("bulk: Creating real table: %s", q) curs.execute(q) return tempname, quote_fqident(tempname) elif USE_LONGLIVED_TEMP_TABLES: if skytools.exists_temp_table(curs, tempname): self.log.debug("bulk: Using existing temp table %s", tempname) return tempname, quote_ident(tempname) # bizgres crashes on delete rows # removed arg = "on commit delete rows" arg = "on commit preserve rows" # create temp table for loading q = "create temp table %s (like %s) %s" % ( quote_ident(tempname), quote_fqident(self.dest_table), arg) self.log.debug("bulk: Creating temp table: %s", q) curs.execute(q) return tempname, quote_ident(tempname) def find_dist_fields(self, curs: Cursor) -> List[str]: if not skytools.exists_table(curs, "pg_catalog.gp_distribution_policy"): return [] schema, name = skytools.fq_name_parts(self.dest_table) q = "select a.attname"\ " from pg_class t, pg_namespace n, pg_attribute a,"\ " gp_distribution_policy p"\ " where n.oid = t.relnamespace"\ " and p.localoid = t.oid"\ " and a.attrelid = t.oid"\ " and a.attnum = any(p.attrnums)"\ " and n.nspname = %s and t.relname = %s" curs.execute(q, [schema, name]) res = [] for row in curs.fetchall(): res.append(row[0]) return res # register handler class __londiste_handlers__ = [BulkLoader] londiste-3.12/londiste/handlers/dispatch.py000066400000000000000000001214071447267722200210740ustar00rootroot00000000000000""" == HANDLERS == * dispatch - "vanilla" dispatch handler with default args (see below) * hourly_event * hourly_batch * hourly_field * hourly_time * daily_event * daily_batch * daily_field * daily_time * monthly_event * monthly_batch * monthly_field * monthly_time * yearly_event * yearly_batch * yearly_field * yearly_time * bulk_hourly_event * bulk_hourly_batch * bulk_hourly_field * bulk_hourly_time * bulk_daily_event * bulk_daily_batch * bulk_daily_field * bulk_daily_time * bulk_monthly_event * bulk_monthly_batch * bulk_monthly_field * bulk_monthly_time * bulk_yearly_event * bulk_yearly_batch * bulk_yearly_field * bulk_yearly_time * bulk_direct - functionally identical to bulk == HANDLER ARGUMENTS == table_mode: * part - partitioned table (default) * direct - non-partitioned table * ignore - all events are ignored part_func: database function to use for creating partition table. default is {londiste|public}.create_partition part_mode: * batch_time - partitioned by batch creation time (default) * event_time - partitioned by event creation time * date_field - partitioned by date_field value. 
part_field required * current_time - partitioned by current time part_field: date_field to use for partition. Required when part_mode=date_field period: partition period, used for automatic part_name and part_template building * hour * day - default * month * year part_name: custom name template for partition table. default is None as it is built automatically. example for daily partition: %(parent)s_%(year)s_%(month)s_%(day)s template variables: * parent - parent table name * year * month * day * hour part_template: custom sql template for creating partition table. if omitted then partition function is used. template variables: * dest - destination table name. result on part_name evaluation * part - same as dest * parent - parent table name * pkey - parent table primary keys * schema_table - table name with replace: '.' -> '__'. for using in pk names etc. * part_field - date field name if table is partitioned by field * part_time - time of partition row_mode: how rows are applied to target table * plain - each event creates SQL statement to run (default) * keep_latest - change updates to DELETE + INSERT * keep_all - change updates to inserts, ignore deletes event_types: event types to process, separated by comma. Other events are ignored. default is all event types * I - inserts * U - updates * D - deletes load_mode: how data is loaded to dst database. default direct * direct - using direct sql statements (default) * bulk - using copy to temp table and then sql. method: loading method for load_mode bulk. defaults to 0 * 0 (correct) - inserts as COPY into table, update as COPY into temp table and single UPDATE from there delete as COPY into temp table and single DELETE from there * 1 (delete) - as 'correct', but do update as DELETE + COPY * 2 (merged) - as 'delete', but merge insert rows with update rows * 3 (insert) - COPY inserts into table, error when other events fields: field name map for using just part of the fields and rename them * '*' - all fields. default * [,..] - list of source fields to include in target * : - renaming fields list and rename syntax can be mixed: field1,field2:new_field2,field3 skip_fields: list of field names to skip table: new name of destination table. default is same as source pre_part: sql statement(s) to execute before creating partition table. Usable variables are the same as in part_template post_part: sql statement(s) to execute after creating partition table. Usable variables are the same as in part_template retention_period: how long to keep partitions around. examples: '3 months', '1 year' ignore_old_events: * 0 - handle all events in the same way (default) * 1 - ignore events coming for obsolete partitions ignore_truncate: * 0 - process truncate event (default) * 1 - ignore truncate event encoding: name of destination encoding. handler replaces all invalid encoding symbols and logs them as warnings analyze: * 0 - do not run analyze on temp tables (default) * 1 - run analyze on temp tables == NOTES == NB! londiste does not currently support table renaming and field mapping when creating or coping initial data to destination table. --expect-sync and --skip-truncate should be used and --create switch is to be avoided. 
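== EXAMPLE ==

Illustrative only; table name and argument values are placeholders:

  londiste add-table xx --handler="dispatch(table_mode=part,period=day,part_mode=batch_time)"

or use one of the preconfigured variants listed above, for example:

  londiste add-table xx --handler="bulk_daily_batch"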
""" import datetime import re import logging from typing import Sequence, List, Tuple, Optional, Dict, Any, Callable, Mapping, Type, Set import skytools from skytools import UsageError, quote_fqident, quote_ident, dbdict #from skytools.basetypes import DictRow from skytools.sqltools import DictRows from skytools.dbstruct import T_ALL, TableStruct from londiste.handler import BatchInfo, Cursor, Event, ApplyFunc, BaseHandler from londiste.handlers import handler_args, update from londiste.handlers.shard import ShardHandler import londiste.util __all__ = ['Dispatcher'] # BulkLoader load method METH_CORRECT = 0 METH_DELETE = 1 METH_MERGED = 2 METH_INSERT = 3 # BulkLoader hacks AVOID_BIZGRES_BUG = 0 USE_LONGLIVED_TEMP_TABLES = True USE_REAL_TABLE = False # mode variables (first in list is default value) TABLE_MODES = ['part', 'direct', 'ignore'] PART_MODES = ['batch_time', 'event_time', 'date_field', 'current_time'] ROW_MODES = ['plain', 'keep_all', 'keep_latest'] LOAD_MODES = ['direct', 'bulk'] PERIODS = ['day', 'month', 'year', 'hour'] METHODS = [METH_CORRECT, METH_DELETE, METH_MERGED, METH_INSERT] EVENT_TYPES = ['I', 'U', 'D'] PART_FUNC_OLD = 'public.create_partition' PART_FUNC_NEW = 'londiste.create_partition' PART_FUNC_ARGS = ['parent', 'part', 'pkeys', 'part_field', 'part_time', 'period'] RETENTION_FUNC = "londiste.drop_obsolete_partitions" #------------------------------------------------------------------------------ # LOADERS #------------------------------------------------------------------------------ class BaseLoader: table: str pkeys: Sequence[str] log: logging.Logger conf: skytools.dbdict def __init__(self, table: str, pkeys: Sequence[str], log: logging.Logger, conf: skytools.dbdict) -> None: self.table = table self.pkeys = pkeys self.log = log self.conf = conf or skytools.dbdict() def process(self, op: str, row: Dict[str, Any]) -> None: raise NotImplementedError() def flush(self, curs: Cursor) -> None: raise NotImplementedError() class DirectLoader(BaseLoader): data: List[Tuple[str, Dict[str, Any]]] def __init__(self, table: str, pkeys: Sequence[str], log: logging.Logger, conf: skytools.dbdict) -> None: super().__init__(table, pkeys, log, conf) self.data = [] def process(self, op: str, row: Dict[str, Any]) -> None: self.data.append((op, row)) def flush(self, curs: Cursor) -> None: mk_sql: Dict[str, Callable[ [Mapping[str, Any], str, Sequence[str], Optional[Mapping[str, str]]], str ]] = { 'I': skytools.mk_insert_sql, 'U': skytools.mk_update_sql, 'D': skytools.mk_delete_sql } if self.data: curs.execute("\n".join(mk_sql[op](row, self.table, self.pkeys, None) for op, row in self.data)) class BaseBulkCollectingLoader(BaseLoader): """ Collect events into I,U,D lists by pk and keep only last event with most suitable operation. For example when event has operations I,U,U keep only last U, when I,U,D, keep nothing etc If after processing the op is not in I,U or D, then ignore that event for rest """ OP_GRAPH = {'-': {'U': 'U', 'I': 'I', 'D': 'D'}, 'I': {'D': '.'}, 'U': {'D': 'D'}, 'D': {'I': 'U'}, '.': {'I': 'I'}, } pkey_ev_map: Dict[Tuple[str, ...], Tuple[str, Dict[str, Any]]] def __init__(self, table: str, pkeys: Sequence[str], log: logging.Logger, conf: skytools.dbdict) -> None: super().__init__(table, pkeys, log, conf) if not self.pkeys: raise Exception('non-pk tables not supported: %s' % self.table) self.pkey_ev_map = {} def process(self, op: str, row: Dict[str, Any]) -> None: """Collect rows into pk dict, keeping only last row with most suitable op""" pk_data: Tuple[str, ...] 
= tuple(row[k] for k in self.pkeys) # get current op state, None if first event _op = self.pkey_ev_map.get(pk_data, ('-', {}))[0] # find new state and store together with row data try: # get new op state using op graph # when no edge defined for old -> new op, keep old _op = self.OP_GRAPH[_op].get(op, _op) self.pkey_ev_map[pk_data] = (_op, row) # skip update to pk-only table if len(pk_data) == len(row) and _op == 'U': del self.pkey_ev_map[pk_data] except KeyError: raise Exception('unknown event type: %s' % op) from None def collect_data(self) -> Dict[str, List[Dict[str, Any]]]: """Collects list of rows into operation hashed dict """ op_map: Dict[str, List[Dict[str, Any]]] = {'I': [], 'U': [], 'D': []} for op, row in self.pkey_ev_map.values(): # ignore None op events if op in op_map: op_map[op].append(row) return op_map def flush(self, curs: Cursor) -> None: op_map = self.collect_data() self.bulk_flush(curs, op_map) def bulk_flush(self, curs: Cursor, op_map: Dict[str, List[Dict[str, Any]]]) -> None: pass class BaseBulkTempLoader(BaseBulkCollectingLoader): """ Provide methods for operating bulk collected events with temp table """ keys: List[str] fields: Optional[List[str]] temp: str qtemp: str qtable: str def __init__(self, table: str, pkeys: Sequence[str], log: logging.Logger, conf: skytools.dbdict) -> None: super().__init__(table, pkeys, log, conf) # temp table name if USE_REAL_TABLE: self.temp = self.table + "_loadertmpx" self.qtemp = quote_fqident(self.temp) else: self.temp = self.table.replace('.', '_') + "_loadertmp" self.qtemp = quote_ident(self.temp) # quoted table name self.qtable = quote_fqident(self.table) # all fields self.fields = None # key fields used in where part, possible to add non pk fields # (like dist keys in gp) self.keys = list(self.pkeys) def nonkeys(self) -> List[str]: """returns fields not in keys""" if not self.fields: return [] return [f for f in self.fields if f not in self.keys] def logexec(self, curs: Cursor, sql: str) -> None: """Logs and executes sql statement""" self.log.debug('exec: %s', sql) curs.execute(sql) self.log.debug('msg: %s, rows: %s', curs.statusmessage, curs.rowcount) # create sql parts def _where(self) -> str: tmpl = "%(tbl)s.%(col)s = t.%(col)s" stmt = (tmpl % {'col': quote_ident(f), 'tbl': self.qtable} for f in self.keys) return ' and '.join(stmt) def _cols(self) -> str: if not self.fields: return '' return ','.join(quote_ident(f) for f in self.fields) def insert(self, curs: Cursor) -> None: sql = "insert into %s (%s) select %s from %s" % (self.qtable, self._cols(), self._cols(), self.qtemp) self.logexec(curs, sql) def update(self, curs: Cursor) -> None: qcols = [quote_ident(c) for c in self.nonkeys()] # no point to update pk-only table if not qcols: return tmpl = "%s = t.%s" eqlist = [tmpl % (c, c) for c in qcols] _set = ", ".join(eqlist) sql = "update only %s set %s from %s as t where %s" % (self.qtable, _set, self.qtemp, self._where()) self.logexec(curs, sql) def delete(self, curs: Cursor) -> None: sql = "delete from only %s using %s as t where %s" % (self.qtable, self.qtemp, self._where()) self.logexec(curs, sql) def truncate(self, curs: Cursor) -> None: self.logexec(curs, "truncate %s" % self.qtemp) def drop(self, curs: Cursor) -> None: self.logexec(curs, "drop table %s" % self.qtemp) def create(self, curs: Cursor) -> None: if USE_REAL_TABLE: tmpl = "create table %s (like %s)" else: tmpl = "create temp table %s (like %s) on commit preserve rows" self.logexec(curs, tmpl % (self.qtemp, self.qtable)) def analyze(self, curs: Cursor) -> 
None: self.logexec(curs, "analyze %s" % self.qtemp) def process(self, op: str, row: Dict[str, Any]) -> None: super().process(op, row) # TODO: maybe one assignment is enough? self.fields = list(row.keys()) class BulkLoader(BaseBulkTempLoader): """ Collects events to and loads bulk data using copy and temp tables """ dist_fields: Optional[List[str]] run_analyze: int method: int temp_present: bool def __init__(self, table: str, pkeys: Sequence[str], log: logging.Logger, conf: skytools.dbdict) -> None: super().__init__(table, pkeys, log, conf) self.method = self.conf['method'] self.run_analyze = self.conf['analyze'] self.dist_fields = None # is temp table created self.temp_present = False def process(self, op: str, row: Dict[str, Any]) -> None: if self.method == METH_INSERT and op != 'I': raise Exception('%s not supported by method insert' % op) super().process(op, row) def process_delete(self, curs: Cursor, op_map: Dict[str, List[Dict[str, Any]]]) -> None: """Process delete list""" data = op_map['D'] cnt = len(data) if cnt == 0: return self.log.debug("bulk: Deleting %d rows from %s", cnt, self.table) # copy rows to temp self.bulk_insert(curs, data) # delete rows using temp self.delete(curs) # check if right amount of rows deleted (only in direct mode) if self.conf.table_mode == 'direct' and cnt != curs.rowcount: self.log.warning("%s: Delete mismatch: expected=%s deleted=%d", self.table, cnt, curs.rowcount) def process_update(self, curs: Cursor, op_map: Dict[str, List[Dict[str, Any]]]) -> None: """Process update list""" data = op_map['U'] # original update list count real_cnt = len(data) # merged method loads inserts together with updates if self.method == METH_MERGED: data += op_map['I'] cnt = len(data) if cnt == 0: return self.log.debug("bulk: Updating %d rows in %s", cnt, self.table) # copy rows to temp self.bulk_insert(curs, data) if self.method == METH_CORRECT: # update main table from temp self.update(curs) # check count (only in direct mode) if self.conf.table_mode == 'direct' and cnt != curs.rowcount: self.log.warning("%s: Update mismatch: expected=%s updated=%d", self.table, cnt, curs.rowcount) else: # delete from main table using temp self.delete(curs) # check count (only in direct mode) if self.conf.table_mode == 'direct' and real_cnt != curs.rowcount: self.log.warning("%s: Update mismatch: expected=%s deleted=%d", self.table, real_cnt, curs.rowcount) # insert into main table if AVOID_BIZGRES_BUG: # copy again, into main table self.bulk_insert(curs, data, table=self.qtable) else: # insert from temp - better way, but does not work # due bizgres bug self.insert(curs) def process_insert(self, curs: Cursor, op_map: Dict[str, List[Dict[str, Any]]]) -> None: """Process insert list""" data = op_map['I'] cnt = len(data) # merged method loads inserts together with updates if (cnt == 0) or (self.method == METH_MERGED): return self.log.debug("bulk: Inserting %d rows into %s", cnt, self.table) # copy into target table (no temp used) self.bulk_insert(curs, data, table=self.qtable) def bulk_flush(self, curs: Cursor, op_map: Dict[str, List[Dict[str, Any]]]) -> None: self.log.debug("bulk_flush: %s (I/U/D = %d/%d/%d)", self.table, len(op_map['I']), len(op_map['U']), len(op_map['D'])) # fetch distribution fields if self.dist_fields is None: self.dist_fields = self.find_dist_fields(curs) assert self.dist_fields self.log.debug("Key fields: %s Dist fields: %s", ",".join(self.pkeys or []), ",".join(self.dist_fields or [])) # add them to key for key in self.dist_fields: if key not in self.keys: 
self.keys.append(key) # check if temp table present self.check_temp(curs) # process I,U,D self.process_delete(curs, op_map) self.process_update(curs, op_map) self.process_insert(curs, op_map) # truncate or drop temp table self.clean_temp(curs) def check_temp(self, curs: Cursor) -> None: if USE_REAL_TABLE: self.temp_present = skytools.exists_table(curs, self.temp) else: self.temp_present = skytools.exists_temp_table(curs, self.temp) def clean_temp(self, curs: Cursor) -> None: # delete remaining rows if self.temp_present: if USE_LONGLIVED_TEMP_TABLES or USE_REAL_TABLE: self.truncate(curs) else: # fscking problems with long-lived temp tables self.drop(curs) def create_temp(self, curs: Cursor) -> bool: """ check if temp table exists. Returns False if using existing temp table and True if creating new """ if USE_LONGLIVED_TEMP_TABLES or USE_REAL_TABLE: if self.temp_present: self.log.debug("bulk: Using existing temp table %s", self.temp) return False self.create(curs) self.temp_present = True return True def bulk_insert(self, curs: Cursor, data: DictRows, table: Optional[str] = None) -> None: """Copy data to table. If table not provided, use temp table. When re-using existing temp table, it is always truncated first and analyzed after copy. """ if not data: return _use_temp = table is None xtable = self.temp if table is None else table # if table not specified use temp if _use_temp: # truncate when re-using existing table if not self.create_temp(curs): self.truncate(curs) self.log.debug("bulk: COPY %d rows into %s", len(data), xtable) skytools.magic_insert(curs, xtable, data, self.fields, quoted_table=True) if _use_temp and self.run_analyze: self.analyze(curs) def find_dist_fields(self, curs: Cursor) -> List[str]: """Find GP distribution keys""" if not skytools.exists_table(curs, "pg_catalog.gp_distribution_policy"): return [] schema, name = skytools.fq_name_parts(self.table) qry = "select a.attname"\ " from pg_class t, pg_namespace n, pg_attribute a,"\ " gp_distribution_policy p"\ " where n.oid = t.relnamespace"\ " and p.localoid = t.oid"\ " and a.attrelid = t.oid"\ " and a.attnum = any(p.attrnums)"\ " and n.nspname = %s and t.relname = %s" curs.execute(qry, [schema, name]) res = [] for row in curs.fetchall(): res.append(row[0]) return res LOADERS = {'direct': DirectLoader, 'bulk': BulkLoader} #------------------------------------------------------------------------------ # ROW HANDLERS #------------------------------------------------------------------------------ class RowHandler: log: logging.Logger table_map: Dict[str, BaseLoader] def __init__(self, log: logging.Logger) -> None: self.log = log self.table_map = {} def add_table(self, table: str, ldr_cls: Type[BaseLoader], pkeys: List[str], args: dbdict) -> None: self.table_map[table] = ldr_cls(table, pkeys, self.log, args) def process(self, table: str, op: str, row: Dict[str, Any]) -> None: try: self.table_map[table].process(op, row) except KeyError: raise Exception("No loader for table %s" % table) from None def flush(self, curs: Cursor) -> None: for ldr in self.table_map.values(): ldr.flush(curs) class KeepAllRowHandler(RowHandler): def process(self, table: str, op: str, row: Dict[str, Any]) -> None: """Keep all row versions. Updates are changed to inserts, deletes are ignored. Makes sense only for partitioned tables. """ if op == 'U': op = 'I' elif op == 'D': return super().process(table, op, row) class KeepLatestRowHandler(RowHandler): def process(self, table: str, op: str, row: Dict[str, Any]) -> None: """Keep latest row version. 
Updates are changed to delete + insert Makes sense only for partitioned tables. """ if op == 'U': super().process(table, 'D', row) super().process(table, 'I', row) elif op == 'I': super().process(table, 'I', row) elif op == 'D': super().process(table, 'D', row) ROW_HANDLERS = {'plain': RowHandler, 'keep_all': KeepAllRowHandler, 'keep_latest': KeepLatestRowHandler} #------------------------------------------------------------------------------ # DISPATCHER #------------------------------------------------------------------------------ class Dispatcher(ShardHandler): _doc_ = """Partitioned loader. Splits events into partitions, if requested. Then applies them without further processing. """ handler_name = 'dispatch' dst_curs: Optional[Cursor] ignored_tables: Set[str] batch_info: Optional[BatchInfo] pkeys: Optional[List[str]] @property def __doc__(self) -> Optional[str]: return self._doc_ @__doc__.setter def __doc__(self, value: Optional[str]) -> None: pass def __init__(self, table_name: str, args: Dict[str, str], dest_table: str) -> None: # compat for dest-table dest_table = args.get('table', dest_table) super().__init__(table_name, args, dest_table) # show args self.log.debug("dispatch.init: table_name=%r, args=%r", table_name, args) self.ignored_tables = set() self.batch_info = None self.dst_curs = None self.pkeys = None # config hdlr_cls = ROW_HANDLERS[self.conf.row_mode] self.row_handler = hdlr_cls(self.log) def _parse_args_from_doc(self) -> List[Tuple[str, str, str]]: doc = __doc__ params_descr: List[Tuple[str, str, str]] = [] params_found = False for line in doc.splitlines(): ln = line.strip() if params_found: if ln.startswith("=="): break m = re.match(r"^(\w+):$", ln) if m: name = m.group(1) expr = text = "" elif not params_descr: continue else: name, expr, text = params_descr.pop() text += ln + "\n" params_descr.append((name, expr, text)) elif ln == "== HANDLER ARGUMENTS ==": params_found = True return params_descr def get_config(self) -> dbdict: """Processes args dict""" conf = super().get_config() # set table mode conf.table_mode = self.get_arg('table_mode', TABLE_MODES) conf.analyze = self.get_arg('analyze', [0, 1]) if conf.table_mode == 'part': conf.part_mode = self.get_arg('part_mode', PART_MODES) conf.part_field = self.args.get('part_field') if conf.part_mode == 'date_field' and not conf.part_field: raise Exception('part_mode date_field requires part_field!') conf.period = self.get_arg('period', PERIODS) conf.part_name = self.args.get('part_name') conf.part_template = self.args.get('part_template') conf.pre_part = self.args.get('pre_part') conf.post_part = self.args.get('post_part') conf.part_func = self.args.get('part_func', PART_FUNC_NEW) conf.retention_period = self.args.get('retention_period') conf.ignore_old_events = self.get_arg('ignore_old_events', [0, 1], 0) # set row mode and event types to process conf.row_mode = self.get_arg('row_mode', ROW_MODES) cf_event_types = self.args.get('event_types', '*') if cf_event_types == '*': event_types = EVENT_TYPES else: event_types = [evt.upper() for evt in cf_event_types.split(',')] # noqa for evt in event_types: if evt not in EVENT_TYPES: raise Exception('Unsupported operation: %s' % evt) conf.event_types = event_types # set load handler conf.load_mode = self.get_arg('load_mode', LOAD_MODES) conf.method = self.get_arg('method', METHODS) # fields to skip conf.skip_fields = [f.strip().lower() for f in self.args.get('skip_fields', '').split(',')] # get fields map (obsolete, for compatibility reasons) fields = self.args.get('fields', 
'*') if fields == "*": conf.field_map = None else: conf.field_map = {} for fval in fields.split(','): tmp = fval.split(':') if len(tmp) == 1: conf.field_map[tmp[0]] = tmp[0] else: conf.field_map[tmp[0]] = tmp[1] return conf def _validate_hash_key(self) -> None: pass # no need for hash key when not sharding def prepare_batch(self, batch_info: Optional[BatchInfo], dst_curs: Cursor) -> None: """Called on first event for this table in current batch.""" if batch_info is not None and self.conf.table_mode != 'ignore': self.batch_info = batch_info self.dst_curs = dst_curs super().prepare_batch(batch_info, dst_curs) def filter_data(self, data: Dict[str, Any]) -> Dict[str, Any]: """Process with fields skip and map""" fskip = self.conf.skip_fields fmap = self.conf.field_map if fskip: data = dict((k, v) for k, v in data.items() if k not in fskip) if fmap: # when field name not present in source is used then None (NULL) # value is inserted. is it ok? data = dict((v, data.get(k)) for k, v in fmap.items()) return data def filter_pkeys(self, pkeys: List[str]) -> List[str]: """Process with fields skip and map""" fskip = self.conf.skip_fields fmap = self.conf.field_map if fskip: pkeys = [f for f in pkeys if f not in fskip] if fmap: pkeys = [fmap[p] for p in pkeys if p in fmap] return pkeys def _process_event(self, ev: Event, sql_queue_func: ApplyFunc, arg: Cursor) -> None: """Process a event. Event should be added to sql_queue or executed directly. """ if self.conf.table_mode == 'ignore': return # get data data = skytools.db_urldecode(ev.data) if len(ev.ev_type) < 2 or ev.ev_type[1] != ':': raise Exception('Unsupported event type: %s/extra1=%s/data=%s' % ( ev.ev_type, ev.ev_extra1, ev.ev_data)) op, pkeys = ev.type.split(':', 1) if op not in 'IUD': raise Exception('Unknown event type: %s' % ev.ev_type) # process only operations specified if op not in self.conf.event_types: #self.log.debug('dispatch.process_event: ignored event type') return if self.pkeys is None: self.pkeys = self.filter_pkeys(pkeys.split(',')) data = self.filter_data(data) # prepare split table when needed if self.conf.table_mode == 'part': dst, part_time = self.split_format(ev, data) if dst in self.ignored_tables: return if dst not in self.row_handler.table_map: self.check_part(dst, part_time) if dst in self.ignored_tables: return else: dst = self.dest_table if dst not in self.row_handler.table_map: self.row_handler.add_table(dst, LOADERS[self.conf.load_mode], self.pkeys, self.conf) self.row_handler.process(dst, op, data) def finish_batch(self, batch_info: BatchInfo, dst_curs: Cursor) -> None: """Called when batch finishes.""" if self.conf.table_mode != 'ignore': self.row_handler.flush(dst_curs) #super().finish_batch(batch_info, dst_curs) def get_part_name(self) -> str: # if custom part name template given, use it if self.conf.part_name: return self.conf.part_name parts = ['year', 'month', 'day', 'hour'] name_parts = ['parent'] + parts[:parts.index(self.conf.period) + 1] return '_'.join('%%(%s)s' % part for part in name_parts) def split_format(self, ev: Event, data: Dict[str, Any]) -> Tuple[str, datetime.datetime]: """Generates part table name from template""" assert self.batch_info if self.conf.part_mode == 'batch_time': dtm = self.batch_info['batch_end'] elif self.conf.part_mode == 'event_time': dtm = ev.ev_time elif self.conf.part_mode == 'current_time': dtm = datetime.datetime.now() elif self.conf.part_mode == 'date_field': dt_str = data[self.conf.part_field] if dt_str is None: raise Exception('part_field(%s) is NULL: %s' % 
(self.conf.part_field, ev)) dtm = datetime.datetime.strptime(dt_str[:19], "%Y-%m-%d %H:%M:%S") else: raise UsageError('Bad value for part_mode: %s' % self.conf.part_mode) vals = { 'parent': self.dest_table, 'year': "%04d" % dtm.year, 'month': "%02d" % dtm.month, 'day': "%02d" % dtm.day, 'hour': "%02d" % dtm.hour, } return (self.get_part_name() % vals, dtm) def check_part(self, dst: str, part_time: datetime.datetime) -> None: """Create part table if not exists. It part_template present, execute it else if part function present in db, call it else clone master table""" curs = self.dst_curs assert curs if (self.conf.ignore_old_events and self.conf.retention_period and self.is_obsolete_partition(dst, self.conf.retention_period, self.conf.period)): self.ignored_tables.add(dst) return if skytools.exists_table(curs, dst): return dst = quote_fqident(dst) vals = {'dest': dst, 'part': dst, 'parent': self.fq_dest_table, 'pkeys': ",".join(self.pkeys or []), # quoting? # we do this to make sure that constraints for # tables who contain a schema will still work 'schema_table': dst.replace(".", "__"), 'part_field': self.conf.part_field, 'part_time': part_time, 'period': self.conf.period, } def exec_with_vals(tmpl: str) -> bool: if tmpl: sql = tmpl % vals curs.execute(sql) return True return False exec_with_vals(self.conf.pre_part) if not exec_with_vals(self.conf.part_template): self.log.debug('part_template not provided, using part func') # if part func exists call it with val arguments pfargs = ', '.join('%%(%s)s' % arg for arg in PART_FUNC_ARGS) # set up configured function pfcall = 'select %s(%s)' % (self.conf.part_func, pfargs) have_func = skytools.exists_function(curs, self.conf.part_func, len(PART_FUNC_ARGS)) # backwards compat if not have_func and self.conf.part_func == PART_FUNC_NEW: pfcall = 'select %s(%s)' % (PART_FUNC_OLD, pfargs) have_func = skytools.exists_function(curs, PART_FUNC_OLD, len(PART_FUNC_ARGS)) if have_func: self.log.debug('check_part.exec: func: %s, args: %s', pfcall, vals) curs.execute(pfcall, vals) else: # # Otherwise create simple clone. # # FixMe: differences from create_partitions(): # - check constraints # - inheritance # self.log.debug('part func %s not found, cloning table', self.conf.part_func) struct = TableStruct(curs, self.dest_table) struct.create(curs, T_ALL, dst) exec_with_vals(self.conf.post_part) self.log.info("Created table: %s", dst) if self.conf.retention_period: dropped = self.drop_obsolete_partitions(self.dest_table, self.conf.retention_period, self.conf.period) if self.conf.ignore_old_events and dropped: for tbl in dropped: self.ignored_tables.add(tbl) if tbl in self.row_handler.table_map: del self.row_handler.table_map[tbl] def drop_obsolete_partitions(self, parent_table: str, retention_period: str, partition_period: str) -> List[str]: """ Drop obsolete partitions of partition-by-date parent table. """ curs = self.dst_curs assert curs func = RETENTION_FUNC args = [parent_table, retention_period, partition_period] sql = "select " + func + "(%s, %s, %s)" self.log.debug("func: %s, args: %s", func, args) curs.execute(sql, args) res = [row[0] for row in curs.fetchall()] if res: self.log.info("Dropped tables: %s", ", ".join(res)) return res def is_obsolete_partition(self, partition_table: str, retention_period: str, partition_period: str) -> bool: """ Test partition name of partition-by-date parent table. 
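The decision itself is delegated to the londiste.is_obsolete_partition() SQL function, which is called with the partition name and the configured retention_period / period values.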
""" curs = self.dst_curs assert curs func = "londiste.is_obsolete_partition" args = [partition_table, retention_period, partition_period] sql = "select " + func + "(%s, %s, %s)" self.log.debug("func: %s, args: %s", func, args) curs.execute(sql, args) res = curs.fetchone()[0] if res: self.log.info("Ignored table: %s", partition_table) return res def real_copy(self, tablename: str, src_curs: Cursor, dst_curs: Cursor, column_list: Sequence[str]) -> Tuple[int, int]: """do actual table copy and return tuple with number of bytes and rows copied """ _src_cols = _dst_cols = column_list condition = self.get_copy_condition(src_curs, dst_curs) if self.conf.skip_fields: _src_cols = [col for col in column_list if col not in self.conf.skip_fields] _dst_cols = _src_cols if self.conf.field_map: _src_cols = [col for col in _src_cols if col in self.conf.field_map] _dst_cols = [self.conf.field_map[col] for col in _src_cols] return skytools.full_copy(tablename, src_curs, dst_curs, _src_cols, condition, dst_tablename=self.dest_table, dst_column_list=_dst_cols) def real_copy_threaded( self, src_real_table: str, src_curs: Cursor, dst_db_connstr: str, column_list: Sequence[str], config_file: str, config_section: str, parallel: int = 1, ) -> Tuple[int, int]: with skytools.connect_database(dst_db_connstr) as dst_db: with dst_db.cursor() as dst_curs: condition = self.get_copy_condition(src_curs, dst_curs) dst_db.commit() _src_cols = _dst_cols = column_list if self.conf.skip_fields: _src_cols = [col for col in column_list if col not in self.conf.skip_fields] _dst_cols = _src_cols if self.conf.field_map: _src_cols = [col for col in _src_cols if col in self.conf.field_map] _dst_cols = [self.conf.field_map[col] for col in _src_cols] return londiste.util.full_copy_parallel( src_real_table, src_curs, dst_db_connstr=dst_db_connstr, dst_tablename=self.dest_table, condition=condition, column_list=_src_cols, dst_column_list=_dst_cols, parallel=parallel, ) # add arguments' description to handler's docstring def _install_handler_docstrings(dst_cls: Type[BaseHandler]) -> None: found = False for line in __doc__.splitlines(): if line.startswith("== HANDLER ARGUMENTS =="): found = True if found: dst_cls._doc_ += "\n" + line _install_handler_docstrings(Dispatcher) #------------------------------------------------------------------------------ # register handler class #------------------------------------------------------------------------------ __londiste_handlers__ = [Dispatcher] #------------------------------------------------------------------------------ # build set of handlers with different default values for easier use #------------------------------------------------------------------------------ LOAD = { '': {'load_mode': 'direct'}, 'bulk': {'load_mode': 'bulk'} } PERIOD = { 'hourly': {'period': 'hour'}, 'daily': {'period': 'day'}, 'monthly': {'period': 'month'}, 'yearly': {'period': 'year'}, } MODE = { 'event': {'part_mode': 'event_time'}, 'batch': {'part_mode': 'batch_time'}, 'field': {'part_mode': 'date_field'}, 'time': {'part_mode': 'current_time'}, } BASE = { 'table_mode': 'part', 'row_mode': 'keep_latest', } def set_handler_doc(cls: Type[BaseHandler], handler_defs: Dict[str, str]) -> None: """ generate handler docstring """ cls._doc_ = "Custom dispatch handler with default args.\n\n" \ "Parameters:\n" for k, v in handler_defs.items(): cls._doc_ += " %s = %s\n" % (k, v) def _generate_handlers() -> None: for load, load_dict in LOAD.items(): for period, period_dict in PERIOD.items(): for mode, mode_dict in 
MODE.items(): handler_name = '_'.join(p for p in (load, period, mode) if p) # define creator func to keep default dicts in separate context def create_handler(_handler_name: str, _load_dict: Dict[str, str], _period_dict: Dict[str, str], _mode_dict: Dict[str, str]) -> None: default = update(_mode_dict, _period_dict, _load_dict, BASE) @handler_args(_handler_name, Dispatcher) def handler_func(args: Dict[str, str]) -> Dict[str, str]: return update(args, default) #assert handler_func # avoid 'unused' warning, decorator registers it create_handler(handler_name, load_dict, period_dict, mode_dict) hcls = __londiste_handlers__[-1] # it was just added defs = update(mode_dict, period_dict, load_dict, BASE) set_handler_doc(hcls, defs) _generate_handlers() @handler_args('bulk_direct', Dispatcher) def bulk_direct_handler(args: Dict[str, str]) -> Dict[str, str]: return update(args, {'load_mode': 'bulk', 'table_mode': 'direct'}) set_handler_doc(__londiste_handlers__[-1], {'load_mode': 'bulk', 'table_mode': 'direct'}) @handler_args('direct', Dispatcher) def direct_handler(args: Dict[str, str]) -> Dict[str, str]: return update(args, {'load_mode': 'direct', 'table_mode': 'direct'}) set_handler_doc(__londiste_handlers__[-1], {'load_mode': 'direct', 'table_mode': 'direct'}) londiste-3.12/londiste/handlers/multimaster.py000066400000000000000000000027201447267722200216370ustar00rootroot00000000000000""" Handler for replica with multiple master nodes. Can only handle initial copy from one master. Add other masters with expect-sync option. NB! needs merge_on_time function to be compiled on database first. """ from typing import Dict, Optional, List import skytools from londiste.handlers import update from londiste.handlers.applyfn import ApplyFuncHandler __all__ = ['MultimasterHandler'] class MultimasterHandler(ApplyFuncHandler): __doc__ = __doc__ handler_name = 'multimaster' def __init__(self, table_name: str, args: Dict[str, str], dest_table: Optional[str]) -> None: """Init per-batch table data cache.""" conf = args.copy() # remove Multimaster args from conf for name in ['func_name', 'func_conf']: if name in conf: conf.pop(name) fconf = skytools.db_urlencode(conf) args = update(args, {'func_name': 'merge_on_time', 'func_conf': fconf}) super().__init__(table_name, args, dest_table) def _check_args(self, args: Dict[str, str]) -> None: pass # any arg can be passed def add(self, trigger_arg_list: List[str]) -> None: """Create SKIP and BEFORE INSERT trigger""" trigger_arg_list.append('no_merge') #------------------------------------------------------------------------------ # register handler class #------------------------------------------------------------------------------ __londiste_handlers__ = [MultimasterHandler] londiste-3.12/londiste/handlers/obfuscate.py000066400000000000000000000251001447267722200212410ustar00rootroot00000000000000"""Handler that uses keyed-hash to obfuscate data. 
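Column actions supported in the rules map are keep, skip, bool, hash32, hash64 and hash (128-bit uuid); a nested mapping descends into a JSON column, and columns not listed default to skip. A minimal sketch of such a rules file, referenced as obfuscator_map below (table and column names here are made up):

    public.customers:
      id: keep
      email: hash
      signup_source: hash32
      is_active: bool
      internal_note: skip
      profile:
        name: hash
        country: keep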
To use set in londiste.ini: handler_modules = londiste.handlers.obfuscate obfuscator_map = rules.yaml obfuscator_key = seedForHash then add table with: londiste add-table xx --handler="obfuscate" """ import json import uuid from hashlib import blake2s from typing import Dict, Any, Sequence, Tuple, Optional, List, cast from skytools.basetypes import Cursor, DictRow import skytools import yaml from pgq.event import Event from londiste.handler import TableHandler import londiste.util __all__ = ['Obfuscator'] _KEY = b'' BOOL = 'bool' KEEP = 'keep' JSON = 'json' HASH32 = 'hash32' HASH64 = 'hash64' HASH128 = 'hash' SKIP = 'skip' RuleDict = Dict[str, Any] def as_bytes(data: Any) -> bytes: """Convert input string or json value into bytes. """ if isinstance(data, str): return data.encode('utf8') if isinstance(data, int): return b'%d' % data if isinstance(data, float): # does not work - pgsql repr may differ return b'%r' % data if isinstance(data, bool): # may work but needs to be in sync with copy and event # only 2 output hashes.. return data and b't' or b'f' # no point hashing str() of list or dict raise ValueError('Invalid input type for hashing: %s' % type(data)) def hash32(data: Any) -> Optional[int]: """Returns hash as 32-bit signed int. """ if data is None: return None hash_bytes = blake2s(as_bytes(data), digest_size=4, key=_KEY).digest() return int.from_bytes(hash_bytes, byteorder='big', signed=True) def hash64(data: Any) -> Optional[int]: """Returns hash as 64-bit signed int. """ if data is None: return None hash_bytes = blake2s(as_bytes(data), digest_size=8, key=_KEY).digest() return int.from_bytes(hash_bytes, byteorder='big', signed=True) def hash128(data: Any) -> Optional[str]: """Returns hash as 128-bit variant 0 uuid. """ if data is None: return None hash_bytes = blake2s(as_bytes(data), digest_size=16, key=_KEY).digest() hash_int = int.from_bytes(hash_bytes, byteorder='big') # rfc4122 variant bit: # normal uuids are variant==1 (X >= 8), make this variant==0 (X <= 7) # uuid: ........-....-....-X...-............ hash_int &= ~(0x8000 << 48) return str(uuid.UUID(int=hash_int)) def data_to_dict(data: str, column_list: Sequence[str]) -> Dict[str, Any]: """Convert data received from copy to dict """ if data[-1] == '\n': data = data[:-1] vals = [skytools.unescape_copy(value) for value in data.split('\t')] row = dict(zip(column_list, vals)) return row def obf_vals_to_data(obf_vals: Sequence[Optional[str]]) -> str: """Converts obfuscated values back to copy data """ vals = [skytools.quote_copy(value) for value in obf_vals] obf_data = '\t'.join(vals) + '\n' return obf_data def obf_json(json_data: Any, rule_data: RuleDict) -> Any: """JSON cleanup. 
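Rules mirror the JSON structure: a nested mapping descends into the corresponding object, a leaf value names an action (keep, skip, bool, hash32, hash64, hash), and keys without a rule are dropped, as the examples below show.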
>>> obf_json({'a': 1, 'b': 2, 'c': 3}, {'a': 'keep', 'b': 'hash'}) {'a': 1, 'b': 'da0f3012-9a91-a079-484b-883a64e535df'} >>> obf_json({'a': {'b': {'c': 3}}}, {'a': {}}) >>> obf_json({'a': {'b': {'c': 3}}}, {'a': {'b': {'c': 'hash'}}}) {'a': {'b': {'c': 'ad8f95d3-1e86-689a-24aa-54dbb60d022e'}}} >>> obf_json({'a': {'b': {'c': 3}}, 'd': []}, {'a': {'b': {'c': 'skip'}}, 'd': 'keep'}) {'d': []} """ if isinstance(rule_data, dict): if not isinstance(json_data, dict): return None result = {} for rule_key, rule_value in rule_data.items(): val = obf_json(json_data.get(rule_key), rule_value) if val is not None: result[rule_key] = val if not result: return None return result if rule_data == KEEP: return json_data if rule_data == SKIP: return None if isinstance(json_data, (dict, list)): return None if rule_data == BOOL: if json_data is None: return None return bool(json_data) and 't' or 'f' if rule_data == HASH32: return hash32(json_data) if rule_data == HASH64: return hash64(json_data) if rule_data == HASH128: return hash128(json_data) raise ValueError('Invalid rule value: %r' % rule_data) class Obfuscator(TableHandler): """Default Londiste handler, inserts events into tables with plain SQL. """ handler_name = 'obfuscate' obf_map: Dict[str, RuleDict] = {} @classmethod def load_conf(cls, cf: skytools.Config) -> None: global _KEY _KEY = as_bytes(cf.get('obfuscator_key', '')) with open(cf.getfile('obfuscator_map'), 'r', encoding="utf8") as f: cls.obf_map = yaml.safe_load(f) def _get_map(self, src_tablename: str, row: Optional[Dict[str, Any]] = None) -> RuleDict: """Can be over ridden in inherited classes to implemnt data driven maps """ if src_tablename not in self.obf_map: raise KeyError('Source table not in obf_map: %s' % src_tablename) return self.obf_map[src_tablename] def parse_row_data(self, ev: Event) -> Dict[str, Any]: """Extract row data from event, with optional encoding fixes. Returns either string (sql event) or dict (urlenc event). 
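For the obfuscate handler the decoded row is additionally filtered through the table's rule map, so only kept, converted or hashed columns survive.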
""" row = super().parse_row_data(ev) rule_data = self._get_map(self.table_name, row) dst: Dict[str, Any] = {} for field, value in row.items(): action = rule_data.get(field, SKIP) if isinstance(action, dict): dst[field] = self.obf_json(value, action) elif action == KEEP: dst[field] = value elif action == SKIP: continue elif action == BOOL: if value is None: dst[field] = value else: dst[field] = bool(value) and 't' or 'f' elif action == HASH32: dst[field] = hash32(value) elif action == HASH64: dst[field] = hash64(value) elif action == HASH128: dst[field] = hash128(value) else: raise ValueError('Invalid value for action: %r' % action) return dst def obf_json(self, value: Any, rule_data: RuleDict) -> Optional[str]: """Recursive obfuscate for json """ if value is None: return None json_data = json.loads(value) obf_data = obf_json(json_data, rule_data) if obf_data is None: obf_data = {} return json.dumps(obf_data) def obf_copy_row(self, data: str, column_list: Sequence[str], src_tablename: str) -> str: """Apply obfuscation to one row """ row = data_to_dict(data, column_list) obf_col_map = self._get_map(src_tablename, row) obf_vals: List[Optional[str]] = [] for field, value in row.items(): action = obf_col_map.get(field, SKIP) if isinstance(action, dict): obf_val = self.obf_json(value, action) obf_vals.append(obf_val) continue elif action == KEEP: obf_vals.append(value) continue elif action == SKIP: continue if value is None: obf_vals.append(value) elif action == BOOL: obf_val = str(bool(value) and 't' or 'f') obf_vals.append(obf_val) elif action == HASH32: obf_val = str(hash32(value)) obf_vals.append(obf_val) elif action == HASH64: obf_val = str(hash64(value)) obf_vals.append(obf_val) elif action == HASH128: obf_val = hash128(value) obf_vals.append(obf_val) else: raise ValueError('Invalid value for action: %s' % action) obf_data = obf_vals_to_data(obf_vals) return obf_data def real_copy(self, src_tablename: str, src_curs: Cursor, dst_curs: Cursor, column_list: Sequence[str]) -> Tuple[int, int]: """Initial copy """ obf_col_map = self._get_map(src_tablename) new_list = [] for col in column_list: action = obf_col_map.get(col, SKIP) if action != SKIP: new_list.append(col) column_list = new_list def _write_hook(pipe: Any, data: str) -> str: return self.obf_copy_row(data, column_list, src_tablename) condition = self.get_copy_condition(src_curs, dst_curs) return skytools.full_copy(src_tablename, src_curs, dst_curs, column_list, condition, dst_tablename=self.dest_table, write_hook=_write_hook) def real_copy_threaded( self, src_real_table: str, src_curs: Cursor, dst_db_connstr: str, column_list: Sequence[str], config_file: str, config_section: str, parallel: int = 1, ) -> Tuple[int, int]: with skytools.connect_database(dst_db_connstr) as dst_db: with dst_db.cursor() as dst_curs: condition = self.get_copy_condition(src_curs, dst_curs) dst_db.commit() obf_col_map = self._get_map(src_real_table) new_list = [] for col in column_list: action = obf_col_map.get(col, SKIP) if action != SKIP: new_list.append(col) column_list = new_list def _write_hook(pipe: Any, data: str) -> str: return self.obf_copy_row(data, column_list, src_real_table) return londiste.util.full_copy_parallel( src_real_table, src_curs, dst_db_connstr=dst_db_connstr, dst_tablename=self.dest_table, condition=condition, column_list=column_list, write_hook=_write_hook, parallel=parallel, ) def get_copy_event(self, ev: Event, queue_name: str) -> Optional[Event]: row = self.parse_row_data(ev) ev_data: str if len(ev.type) == 1: raise ValueError("sql 
trigger not supported") elif ev.data[0] == '{': ev_data = skytools.json_encode(row) else: ev_data = skytools.db_urlencode(row) ev_row = dict(ev._event_row.items()) ev_row['ev_data'] = ev_data return Event(queue_name, cast(DictRow, ev_row)) __londiste_handlers__ = [Obfuscator] if __name__ == '__main__': import doctest doctest.testmod() londiste-3.12/londiste/handlers/qtable.py000066400000000000000000000067711447267722200205530ustar00rootroot00000000000000"""Set up table that sends inserts to queue. Handlers: qtable - dummy handler to setup queue tables. All events are ignored. Use in root node. fake_local - dummy handler to setup queue tables. All events are ignored. Table structure is not required. Use in branch/leaf. qsplitter - dummy handler to setup queue tables. All events are ignored. Table structure is not required. All table events are inserted to destination queue, specified with handler arg 'queue'. """ from typing import Sequence, Tuple, List, Dict, Optional, Type, Any from skytools.basetypes import Cursor import pgq from pgq.baseconsumer import BatchInfo from pgq.event import Event from londiste.handler import BaseHandler, ApplyFunc __all__ = ['QueueTableHandler', 'QueueSplitterHandler'] class QueueTableHandler(BaseHandler): """Queue table handler. Do nothing. Trigger: before-insert, skip trigger. Event-processing: do nothing. """ handler_name = 'qtable' def add(self, trigger_arg_list: List[str]) -> None: """Create SKIP and BEFORE INSERT trigger""" trigger_arg_list.append('tgflags=BI') trigger_arg_list.append('SKIP') trigger_arg_list.append('expect_sync') def real_copy(self, tablename: str, src_curs: Cursor, dst_curs: Cursor, column_list: List[str]) -> Tuple[int, int]: """Force copy not to start""" return (0, 0) def real_copy_threaded( self, src_real_table: str, src_curs: Cursor, dst_db_connstr: str, common_cols: Sequence[str], config_file: str, config_section: str, parallel: int = 1, ) -> Tuple[int, int]: return (0, 0) def needs_table(self) -> bool: return False class QueueSplitterHandler(BaseHandler): """Send events for one table to another queue. Parameters: queue=QUEUE - Queue name. """ handler_name = 'qsplitter' rows: List[Sequence[Any]] def __init__(self, table_name: str, args: Dict[str, str], dest_table: Optional[str]) -> None: """Init per-batch table data cache.""" super().__init__(table_name, args, dest_table) try: self.dst_queue_name = args['queue'] except KeyError: raise Exception('specify queue with handler-arg') from None self.rows = [] def add(self, trigger_arg_list: List[str]) -> None: trigger_arg_list.append('virtual_table') def prepare_batch(self, batch_info: Optional[BatchInfo], dst_curs: Cursor) -> None: """Called on first event for this table in current batch.""" self.rows = [] def process_event(self, ev: Event, sql_queue_func: ApplyFunc, dst_curs: Cursor) -> None: """Process a event. Event should be added to sql_queue or executed directly. 
""" if self.dst_queue_name is None: return data = [ev.type, ev.data, ev.extra1, ev.extra2, ev.extra3, ev.extra4, ev.time] self.rows.append(data) def finish_batch(self, batch_info: BatchInfo, dst_curs: Cursor) -> None: """Called when batch finishes.""" if self.dst_queue_name is None: return fields = ['type', 'data', 'extra1', 'extra2', 'extra3', 'extra4', 'time'] pgq.bulk_insert_events(dst_curs, self.rows, fields, self.dst_queue_name) def needs_table(self) -> bool: return False __londiste_handlers__: List[Type[BaseHandler]] = [QueueTableHandler, QueueSplitterHandler] londiste-3.12/londiste/handlers/shard.py000066400000000000000000000151061447267722200203740ustar00rootroot00000000000000"""Event filtering by hash, for partitioned databases. Parameters: key=COLUMN: column name to use for hashing hash_key=COLUMN: column name to use for hashing (overrides 'key' parameter) encoding=ENC: validate and fix incoming data (only utf8 supported atm) ignore_truncate=BOOL: ignore truncate event, default: 0, values: 0,1 disable_replay=BOOL: no replay to table, just copy events. default: 0, values: 0 1 On root node: * Hash of key field will be added to ev_extra3. This is implemented by adding additional trigger argument: ev_extra3='hash='||hashfunc(key_column) On branch/leaf node: * On COPY time, the SELECT on provider side gets filtered by hash. * On replay time, the events gets filtered by looking at hash in ev_extra3. Local config: * Local hash value and mask are loaded from partconf.conf table. Custom parameters from config file * shard_hash_func: function to use for hashing * shard_info_sql: SQL query to get (shard_nr, shard_mask, shard_count) values. """ from typing import Dict, List, Sequence, Tuple, Optional, Type import skytools from skytools.basetypes import Cursor from pgq.baseconsumer import BatchInfo from pgq.event import Event from londiste.handler import TableHandler, BaseHandler, ApplyFunc __all__ = ['ShardHandler', 'PartHandler'] _SHARD_HASH_FUNC = 'partconf.get_hash_raw' _SHARD_INFO_SQL = "select shard_nr, shard_mask, shard_count from partconf.conf" _SHARD_NR = None # part number of local node _SHARD_MASK = None # max part nr (atm) class ShardHandler(TableHandler): __doc__: Optional[str] = __doc__ handler_name = 'shard' DEFAULT_HASH_EXPR = "%s(%s)" hash_key: str hash_expr: str disable_replay: bool def __init__(self, table_name: str, args: Dict[str, str], dest_table: str) -> None: super().__init__(table_name, args, dest_table) # primary key columns hash_key = args.get('hash_key', args.get('key')) if hash_key is None: raise Exception('Specify hash key field as hash_key argument') self.hash_key = hash_key # hash function & full expression self.hash_expr = self.DEFAULT_HASH_EXPR % ( skytools.quote_fqident(_SHARD_HASH_FUNC), skytools.quote_ident(self.hash_key or '')) self.hash_expr = args.get('hash_expr', self.hash_expr) disable_replay = args.get('disable_replay', 'false') self.disable_replay = disable_replay in ('true', '1') @classmethod def load_conf(cls, cf: skytools.Config) -> None: global _SHARD_HASH_FUNC, _SHARD_INFO_SQL _SHARD_HASH_FUNC = cf.get("shard_hash_func", _SHARD_HASH_FUNC) _SHARD_INFO_SQL = cf.get("shard_info_sql", _SHARD_INFO_SQL) def add(self, trigger_arg_list: List[str]) -> None: """Let trigger put hash into extra3""" arg = "ev_extra3='hash='||%s" % self.hash_expr trigger_arg_list.append(arg) super().add(trigger_arg_list) def is_local_shard_event(self, ev: Event) -> bool: assert _SHARD_MASK is not None if ev.extra3 is None: raise ValueError("handlers.shard: extra3 not filled 
on %s" % (self.table_name,)) meta = skytools.db_urldecode(ev.extra3) meta_hash = meta.get('hash') if meta_hash is None: raise ValueError("handlers.shard: extra3 does not have 'hash' key") is_local = (int(meta_hash) & _SHARD_MASK) == _SHARD_NR self.log.debug('shard.process_event: meta=%r, shard_nr=%i, mask=%i, is_local=%r', meta, _SHARD_NR, _SHARD_MASK, is_local) return is_local def prepare_batch(self, batch_info: Optional[BatchInfo], dst_curs: Cursor) -> None: """Called on first event for this table in current batch.""" if _SHARD_MASK is None: self.load_shard_info(dst_curs) super().prepare_batch(batch_info, dst_curs) def process_event(self, ev: Event, sql_queue_func: ApplyFunc, dst_curs: Cursor) -> None: """Filter event by hash in extra3, apply only if for local shard.""" if self.disable_replay: return if self.is_local_shard_event(ev): super().process_event(ev, sql_queue_func, dst_curs) def get_copy_condition(self, src_curs: Cursor, dst_curs: Cursor) -> str: """Prepare the where condition for copy and replay filtering""" self.load_shard_info(dst_curs) assert _SHARD_MASK is not None assert _SHARD_NR is not None expr = "(%s & %d) = %d" % (self.hash_expr, _SHARD_MASK, _SHARD_NR) self.log.debug('shard: copy_condition=%r', expr) return expr def load_shard_info(self, curs: Cursor) -> None: """Load part/slot info from database.""" global _SHARD_NR, _SHARD_MASK curs.execute(_SHARD_INFO_SQL) row = curs.fetchone() shard_nr: Optional[int] = row[0] shard_mask: Optional[int] = row[1] shard_count: Optional[int] = row[2] if shard_nr is None or shard_mask is None or shard_count is None: raise Exception('Error loading shard info') if shard_count & shard_mask != 0 or shard_mask + 1 != shard_count: raise Exception('Invalid shard info') if shard_nr < 0 or shard_nr >= shard_count: raise Exception('Invalid shard nr') _SHARD_NR = shard_nr _SHARD_MASK = shard_mask def get_copy_event(self, ev: Event, queue_name: str) -> Optional[Event]: if self.is_local_shard_event(ev): return ev return None def real_copy(self, tablename: str, src_curs: Cursor, dst_curs: Cursor, column_list: List[str]) -> Tuple[int, int]: """Force copy not to start""" if self.disable_replay: return (0, 0) return super().real_copy(tablename, src_curs, dst_curs, column_list) def real_copy_threaded( self, src_real_table: str, src_curs: Cursor, dst_db_connstr: str, common_cols: Sequence[str], config_file: str, config_section: str, parallel: int = 1, ) -> Tuple[int, int]: if self.disable_replay: return (0, 0) return super().real_copy_threaded( src_real_table, src_curs, dst_db_connstr, common_cols, config_file, config_section, parallel ) def needs_table(self) -> bool: if self.disable_replay: return False return True class PartHandler(ShardHandler): __doc__ = "Deprecated compat name for shard handler.\n" + __doc__.split('\n', 1)[1] handler_name = 'part' # register handler class __londiste_handlers__: List[Type[BaseHandler]] = [ShardHandler, PartHandler] londiste-3.12/londiste/handlers/vtable.py000066400000000000000000000012721447267722200205470ustar00rootroot00000000000000"""Virtual Table handler. Hack to get local=t for a table, but without processing any events. 
""" from typing import List, Type from londiste.handler import BaseHandler __all__ = ['VirtualTableHandler', 'FakeLocalHandler'] class VirtualTableHandler(BaseHandler): __doc__ = __doc__ handler_name = 'vtable' def add(self, trigger_arg_list: List[str]) -> None: trigger_arg_list.append('virtual_table') def needs_table(self) -> bool: return False class FakeLocalHandler(VirtualTableHandler): """Deprecated compat name for vtable.""" handler_name = 'fake_local' __londiste_handlers__: List[Type[BaseHandler]] = [VirtualTableHandler, FakeLocalHandler] londiste-3.12/londiste/playback.py000066400000000000000000001226151447267722200172650ustar00rootroot00000000000000"""Basic replication core. """ import os import sys import time import fnmatch from logging import Logger from typing import List, Optional, Dict, Sequence, Mapping, Tuple, Iterator import skytools from skytools.basetypes import DictRow, Connection, Cursor from pgq.baseconsumer import EventList from pgq.event import Event from pgq.cascade.worker import CascadedWorker from .exec_attrs import ExecAttrs from .handler import build_handler, BaseHandler from .handlers import load_handler_modules __all__ = ['Replicator', 'TableState', 'TABLE_MISSING', 'TABLE_IN_COPY', 'TABLE_CATCHING_UP', 'TABLE_WANNA_SYNC', 'TABLE_DO_SYNC', 'TABLE_OK'] # state # owner - who is allowed to change TABLE_MISSING = 0 # main TABLE_IN_COPY = 1 # copy TABLE_CATCHING_UP = 2 # copy TABLE_WANNA_SYNC = 3 # main TABLE_DO_SYNC = 4 # copy TABLE_OK = 5 # setup SYNC_OK = 0 # continue with batch SYNC_LOOP = 1 # sleep, try again SYNC_EXIT = 2 # nothing to do, exit script MAX_PARALLEL_COPY = 8 # default number of allowed max parallel copy processes def is_data_event(ev: Event) -> bool: """Is it insert/update/delete for one table? """ if ev.type in ('I', 'U', 'D'): return True elif ev.type[:2] in ('I:', 'U:', 'D:', '{"'): return True return False class Counter: """Counts table statuses.""" missing = 0 copy = 0 catching_up = 0 wanna_sync = 0 do_sync = 0 ok = 0 def __init__(self, tables: List["TableState"], copy_method_map: Dict[str, Optional[int]]) -> None: """Counts and sanity checks.""" for t in tables: if t.state == TABLE_MISSING: self.missing += 1 elif t.state == TABLE_IN_COPY: nthreads = copy_method_map[t.name] if nthreads is None: self.copy += 1 else: self.copy += nthreads elif t.state == TABLE_CATCHING_UP: self.catching_up += 1 elif t.state == TABLE_WANNA_SYNC: self.wanna_sync += 1 elif t.state == TABLE_DO_SYNC: self.do_sync += 1 elif t.state == TABLE_OK: self.ok += 1 def get_copy_count(self) -> int: return self.copy + self.catching_up + self.wanna_sync + self.do_sync class TableState: """Keeps state about one table.""" name: str dest_table: str log: Logger state: int last_snapshot_tick: Optional[int] str_snapshot: Optional[str] from_snapshot: Optional[skytools.Snapshot] sync_tick_id: Optional[int] ok_batch_count: int last_tick: Optional[int] table_attrs: Mapping[str, Optional[str]] copy_role: Optional[str] dropped_ddl: Optional[str] plugin: Optional[BaseHandler] changed: int copy_pos: int max_parallel_copy: int def __init__(self, name: str, log: Logger) -> None: """Init TableState for one table.""" self.name = name self.dest_table = name self.log = log # same as forget: self.state = TABLE_MISSING self.last_snapshot_tick = None self.str_snapshot = None self.from_snapshot = None self.sync_tick_id = None self.ok_batch_count = 0 self.last_tick = 0 self.table_attrs = {} self.copy_role = None self.dropped_ddl = None self.plugin = None # except this self.changed = 0 # position in 
parallel copy work order self.copy_pos = 0 # max number of parallel copy processes allowed self.max_parallel_copy = MAX_PARALLEL_COPY def forget(self) -> None: """Reset all info.""" self.state = TABLE_MISSING self.last_snapshot_tick = None self.str_snapshot = None self.from_snapshot = None self.sync_tick_id = None self.ok_batch_count = 0 self.last_tick = 0 self.table_attrs = {} self.changed = 1 self.plugin = None self.copy_pos = 0 self.max_parallel_copy = MAX_PARALLEL_COPY def change_snapshot(self, str_snapshot: Optional[str], tag_changed: int = 1) -> None: """Set snapshot.""" if self.str_snapshot == str_snapshot: return self.log.debug("%s: change_snapshot to %s", self.name, str_snapshot) self.str_snapshot = str_snapshot if str_snapshot: self.from_snapshot = skytools.Snapshot(str_snapshot) else: self.from_snapshot = None if tag_changed: self.ok_batch_count = 0 self.last_tick = None self.changed = 1 def change_state(self, state: int, tick_id: Optional[int] = None) -> None: """Set state.""" if self.state == state and self.sync_tick_id == tick_id: return self.state = state self.sync_tick_id = tick_id self.changed = 1 self.log.debug("%s: change_state to %s", self.name, self.render_state()) def render_state(self) -> Optional[str]: """Make a string to be stored in db.""" if self.state == TABLE_MISSING: return None elif self.state == TABLE_IN_COPY: return 'in-copy' elif self.state == TABLE_CATCHING_UP: return 'catching-up' elif self.state == TABLE_WANNA_SYNC: return 'wanna-sync:%d' % (self.sync_tick_id or 0) elif self.state == TABLE_DO_SYNC: return 'do-sync:%d' % (self.sync_tick_id or 0) elif self.state == TABLE_OK: return 'ok' return None def parse_state(self, merge_state: Optional[str]) -> int: """Read state from string.""" state = -1 if merge_state is None: state = TABLE_MISSING elif merge_state == "in-copy": state = TABLE_IN_COPY elif merge_state == "catching-up": state = TABLE_CATCHING_UP elif merge_state == "ok": state = TABLE_OK elif merge_state == "?": state = TABLE_OK else: tmp = merge_state.split(':') if len(tmp) == 2: self.sync_tick_id = int(tmp[1]) if tmp[0] == 'wanna-sync': state = TABLE_WANNA_SYNC elif tmp[0] == 'do-sync': state = TABLE_DO_SYNC if state < 0: raise Exception("Bad table state: %s" % merge_state) return state def loaded_state(self, row: DictRow) -> None: """Update object with info from db.""" self.log.debug("loaded_state: %s: %s / %s", self.name, row['merge_state'], row['custom_snapshot']) self.change_snapshot(row['custom_snapshot'], 0) self.state = self.parse_state(row['merge_state']) self.changed = 0 if row['table_attrs']: self.table_attrs = skytools.db_urldecode(row['table_attrs']) else: self.table_attrs = {} self.copy_role = row['copy_role'] self.dropped_ddl = row['dropped_ddl'] if row['merge_state'] == "?": self.changed = 1 self.copy_pos = int(row.get('copy_pos', '0')) max_parallel_copy = self.table_attrs.get('max_parallel_copy') if max_parallel_copy: self.max_parallel_copy = int(max_parallel_copy) if 'dest_table' in row and row['dest_table']: self.dest_table = row['dest_table'] else: self.dest_table = self.name hstr = self.table_attrs.get('handlers', '') # compat hstr = self.table_attrs.get('handler', hstr) or '' self.plugin = build_handler(self.name, hstr, self.dest_table) def max_parallel_copies_reached(self) -> bool: return self.max_parallel_copy is not None and \ self.copy_pos >= self.max_parallel_copy def interesting(self, ev: Event, tick_id: int, copy_thread: bool, copy_table_name: Optional[str]) -> bool: """Check if table wants this event.""" if 
copy_thread: if self.name != copy_table_name: return False if self.state not in (TABLE_CATCHING_UP, TABLE_DO_SYNC): return False else: if self.state != TABLE_OK: return False # if no snapshot tracking, then accept always if not self.from_snapshot: return True # uninteresting? if self.from_snapshot.contains(ev.txid): return False # after couple interesting batches there no need to check snapshot # as there can be only one partially interesting batch if tick_id != self.last_tick: self.last_tick = tick_id self.ok_batch_count += 1 # disable batch tracking if self.ok_batch_count > 3: self.change_snapshot(None) return True def gc_snapshot(self, copy_thread: bool, prev_tick: int, cur_tick: int, no_lag: bool) -> None: """Remove attached snapshot if possible. If the event processing is in current moment, the snapshot is not needed beyond next batch. The logic is needed for mostly unchanging tables, where the .ok_batch_count check in .interesting() method can take a lot of time. """ # check if gc is needed if self.str_snapshot is None: return # check if allowed to modify if copy_thread: if self.state != TABLE_CATCHING_UP: return else: if self.state != TABLE_OK: return # aquire last tick if not self.last_snapshot_tick: if no_lag: self.last_snapshot_tick = cur_tick return # reset snapshot if not needed anymore if self.last_snapshot_tick < prev_tick: self.change_snapshot(None) def get_plugin(self) -> BaseHandler: if not self.plugin: raise ValueError("no handler set") return self.plugin class Replicator(CascadedWorker): """Replication core. Config options:: ## Parameters for Londiste ## # target database db = dbname=somedb host=127.0.0.1 # extra connect string parameters to add to node public connect strings. # useful values: user= sslmode= #remote_extra_connstr = # how many tables can be copied in parallel #parallel_copies = 1 # glob patterns for table names: archive.*, public.* #threaded_copy_tables = # number of threads in pool #threaded_copy_pool_size = 1 # accept only events for locally present tables #local_only = false # do not load EXECUTE events from source queue when local_only is active #local_only_drop_execute = false ## compare/repair # max amount of time table can be locked #lock_timeout = 10 # compare: sql to use #compare_sql = select count(1) as cnt, sum(hashtext(t.*::text)) as chksum from only _TABLE_ t # workaround for hashtext change between 8.3 and 8.4 #compare_sql = select count(1) as cnt, sum(('x' || # substr(md5(t.*::text),1,16))::bit(64)::bigint) as chksum from only _TABLE_ t #compare_fmt = %(cnt)d rows, checksum=%(chksum)s ## Parameters for initial node creation: create-root/branch/leaf ## # These parameters can be given on either command-line or in config # command-line values override config values. Those values are # used only during create time, otherwise they are loaded from database. # Name for local node. #node_name = # public connect string for local node, which other nodes will use # to connect to this one. 
#public_node_location = # connect string for existing node to use as provider #initial_provider_location = # filter for table/seq registration #register_only_tables = #register_only_seqs = #register_skip_tables = s.a, s.b, s.c #register_skip_seqs = """ # batch info cur_tick: int = 0 prev_tick: int = 0 copy_table_name: Optional[str] = None # filled by Copytable() sql_list: List[str] = [] current_event: Optional[Event] = None threaded_copy_tables: Sequence[str] threaded_copy_pool_size: int copy_method_map: Dict[str, Optional[int]] register_only_tables: Optional[Sequence[str]] = None register_only_seqs: Optional[Sequence[str]] = None register_skip_tables: Optional[Sequence[str]] = None register_skip_seqs: Optional[Sequence[str]] = None local_only: bool = False local_only_drop_execute: bool = False table_list: List[TableState] table_map: Dict[str, TableState] used_plugins: Dict[str, BaseHandler] copy_thread: bool def __init__(self, args: Sequence[str]) -> None: """Replication init.""" super().__init__('londiste', 'db', args) self.table_list = [] self.table_map = {} self.threaded_copy_tables = self.cf.getlist('threaded_copy_tables', []) self.threaded_copy_pool_size = self.cf.getint('threaded_copy_pool_size', 1) self.copy_method_map = {} self.copy_thread = False self.set_name = self.queue_name self.used_plugins = {} self.parallel_copies = self.cf.getint('parallel_copies', 1) if self.parallel_copies < 1: raise Exception('Bad value for parallel_copies: %d' % self.parallel_copies) self.consumer_filter = None self.register_only_tables = self.cf.getlist("register_only_tables", []) self.register_only_seqs = self.cf.getlist("register_only_seqs", []) self.register_skip_tables = self.cf.getlist("register_skip_tables", []) self.register_skip_seqs = self.cf.getlist("register_skip_seqs", []) self.local_only = self.cf.getboolean('local_only', False) self.local_only_drop_execute = self.cf.getboolean('local_only_drop_execute', False) def reload(self) -> None: super().reload() load_handler_modules(self.cf) self.threaded_copy_tables = self.cf.getlist('threaded_copy_tables', []) self.threaded_copy_pool_size = self.cf.getint('threaded_copy_pool_size', 1) self.copy_method_map = {} self.register_only_tables = self.cf.getlist("register_only_tables", []) self.register_only_seqs = self.cf.getlist("register_only_seqs", []) self.register_skip_tables = self.cf.getlist("register_skip_tables", []) self.register_skip_seqs = self.cf.getlist("register_skip_seqs", []) self.local_only = self.cf.getboolean('local_only', False) self.local_only_drop_execute = self.cf.getboolean('local_only_drop_execute', False) def fill_copy_method(self) -> None: for table_name in self.table_map: if table_name not in self.copy_method_map: for pat in self.threaded_copy_tables: if fnmatch.fnmatchcase(table_name, pat): self.copy_method_map[table_name] = self.threaded_copy_pool_size break if table_name not in self.copy_method_map: self.copy_method_map[table_name] = None def connection_hook(self, dbname: str, db: Connection) -> None: if dbname == 'db': curs = db.cursor() curs.execute("select londiste.set_session_replication_role('replica', false)") db.commit() code_check_done = 0 def check_code(self, db: Connection) -> None: objs = [ skytools.DBFunction("pgq.maint_operations", 0, sql_file="londiste.maint-upgrade.sql"), ] skytools.db_install(db.cursor(), objs, self.log) db.commit() def process_remote_batch(self, src_db: Connection, tick_id: int, ev_list: EventList, dst_db: Connection) -> None: "All work for a batch. Entry point from SetConsumer." 
self.current_event = None # this part can play freely with transactions if not self.code_check_done: self.check_code(dst_db) self.code_check_done = 1 self.sync_database_encodings(src_db, dst_db) assert self.batch_info self.cur_tick = self.batch_info['tick_id'] self.prev_tick = self.batch_info['prev_tick_id'] dst_curs = dst_db.cursor() self.load_table_state(dst_curs) self.sync_tables(src_db, dst_db) self.copy_snapshot_cleanup(dst_db) # only main thread is allowed to restore fkeys assert self._worker_state if not self.copy_thread and self._worker_state.process_events: self.restore_fkeys(dst_db) for p in self.used_plugins.values(): p.reset() self.used_plugins = {} # now the actual event processing happens. # they must be done all in one tx in dst side # and the transaction must be kept open so that # the cascade-consumer can save last tick and commit. self.sql_list = [] super().process_remote_batch(src_db, tick_id, ev_list, dst_db) self.flush_sql(dst_curs) for p in self.used_plugins.values(): p.finish_batch(self.batch_info, dst_curs) self.used_plugins = {} # finalize table changes self.save_table_state(dst_curs) def sync_tables(self, src_db: Connection, dst_db: Connection) -> None: """Table sync loop. Calls appropriate handles, which is expected to return one of SYNC_* constants.""" self.log.debug('Sync tables') while True: cnt = Counter(self.table_list, self.copy_method_map) if self.copy_thread: res = self.sync_from_copy_thread(cnt, src_db, dst_db) else: res = self.sync_from_main_thread(cnt, src_db, dst_db) if res == SYNC_EXIT: self.log.debug('Sync tables: exit') if self.copy_thread: self.unregister_consumer() src_db.commit() sys.exit(0) elif res == SYNC_OK: return elif res != SYNC_LOOP: raise Exception('Program error') self.log.debug('Sync tables: sleeping') time.sleep(3) dst_db.commit() self.load_table_state(dst_db.cursor()) dst_db.commit() dsync_backup: Optional[Tuple[int, Optional[int], Optional[str]]] = None def sync_from_main_thread(self, cnt: Counter, src_db: Connection, dst_db: Connection) -> int: "Main thread sync logic." # This operates on all table, any amount can be in any state ret = SYNC_OK if cnt.do_sync: # wait for copy thread to catch up ret = SYNC_LOOP # we need to do wanna-sync->do_sync with small batches need_dsync = False dsync_ok = True if self.pgq_min_interval or self.pgq_min_count: dsync_ok = False elif self.dsync_backup and self.dsync_backup[0] >= self.cur_tick: dsync_ok = False # now check if do-sync is needed for t in self.get_tables_in_state(TABLE_WANNA_SYNC): # copy thread wants sync, if not behind, do it if t.sync_tick_id is not None and self.cur_tick >= t.sync_tick_id: if dsync_ok: self.change_table_state(dst_db, t, TABLE_DO_SYNC, self.cur_tick) ret = SYNC_LOOP else: need_dsync = True # tune batch size if needed if need_dsync: if self.pgq_min_count or self.pgq_min_interval: bak = (self.cur_tick, self.pgq_min_count, self.pgq_min_interval) self.dsync_backup = bak self.pgq_min_count = None self.pgq_min_interval = None elif self.dsync_backup: self.pgq_min_count = self.dsync_backup[1] self.pgq_min_interval = self.dsync_backup[2] self.dsync_backup = None # now handle new copies npossible = self.parallel_copies - cnt.get_copy_count() if cnt.missing and npossible > 0: pmap = self.get_state_map(src_db.cursor()) src_db.commit() for t in self.get_tables_in_state(TABLE_MISSING): if 'copy_node' in t.table_attrs: # should we go and check this node? 
pass else: # regular provider is used if t.name not in pmap: self.log.warning("Table %s not available on provider", t.name) continue pt = pmap[t.name] if pt.state != TABLE_OK: # or pt.custom_snapshot: # FIXME: does snapsnot matter? self.log.info("Table %s not OK on provider, waiting", t.name) continue # don't allow more copies than configured if npossible == 0: break npossible -= 1 # drop all foreign keys to and from this table self.drop_fkeys(dst_db, t.dest_table) # change state after fkeys are dropped thus allowing # failure inbetween self.change_table_state(dst_db, t, TABLE_IN_COPY) # the copy _may_ happen immediately self.launch_copy(t) # there cannot be interesting events in current batch # but maybe there's several tables, lets do them in one go ret = SYNC_LOOP return ret def sync_from_copy_thread(self, cnt: Counter, src_db: Connection, dst_db: Connection) -> int: "Copy thread sync logic." # somebody may have done remove-table in the meantime if self.copy_table_name not in self.table_map: self.log.error("copy_sync: lost table: %s", self.copy_table_name) return SYNC_EXIT # This operates on single table t = self.table_map[self.copy_table_name] if t.state == TABLE_DO_SYNC: # these settings may cause copy to miss right tick self.pgq_min_count = None self.pgq_min_interval = None assert t.sync_tick_id # main thread is waiting, catch up, then handle over if self.cur_tick == t.sync_tick_id: self.change_table_state(dst_db, t, TABLE_OK) return SYNC_EXIT elif self.cur_tick < t.sync_tick_id: return SYNC_OK else: self.log.error("copy_sync: cur_tick=%d sync_tick=%d", self.cur_tick, t.sync_tick_id) raise Exception('Invalid table state') elif t.state == TABLE_WANNA_SYNC: # wait for main thread to react return SYNC_LOOP elif t.state == TABLE_CATCHING_UP: # partition merging if t.copy_role in ('wait-replay', 'lead'): return SYNC_LOOP # copy just finished if t.dropped_ddl: self.restore_copy_ddl(t, dst_db) return SYNC_OK # is there more work? 
if self.work_state: return SYNC_OK # seems we have catched up self.change_table_state(dst_db, t, TABLE_WANNA_SYNC, self.cur_tick) return SYNC_LOOP elif t.state == TABLE_IN_COPY: # table is not copied yet, do it self.do_copy(t, src_db, dst_db) # forget previous value self.work_state = 1 return SYNC_LOOP else: # nothing to do return SYNC_EXIT def restore_copy_ddl(self, ts: TableState, dst_db: Connection) -> None: self.log.info("%s: restoring DDL", ts.name) dst_curs = dst_db.cursor() if ts.dropped_ddl: for ddl in skytools.parse_statements(ts.dropped_ddl): self.log.info(ddl) dst_curs.execute(ddl) q = "select * from londiste.local_set_table_struct(%s, %s, NULL)" self.exec_cmd(dst_curs, q, [self.queue_name, ts.name]) ts.dropped_ddl = None dst_db.commit() # analyze self.log.info("%s: analyze", ts.name) dst_curs.execute("analyze " + skytools.quote_fqident(ts.name)) dst_db.commit() def do_copy(self, tbl: TableState, src_db: Connection, dst_db: Connection) -> None: """Callback for actual copy implementation.""" raise Exception('do_copy not implemented') def process_remote_event(self, src_curs: Cursor, dst_curs: Cursor, ev: Event) -> None: """handle one event""" self.log.debug( "New event: id=%s / type=%s / data=%s / extra1=%s / extra2=%r / extra3=%r", ev.id, ev.type, ev.data, ev.extra1, ev.extra2, ev.extra3 ) # set current_event only if processing them one-by-one if self.work_state < 0: self.current_event = ev if is_data_event(ev): self.handle_data_event(ev, dst_curs) elif ev.type == "R": self.flush_sql(dst_curs) self.handle_truncate_event(ev, dst_curs) elif ev.type == 'EXECUTE': self.flush_sql(dst_curs) self.handle_execute_event(ev, dst_curs) elif ev.type == 'londiste.add-table': self.flush_sql(dst_curs) self.add_set_table(dst_curs, ev.data) elif ev.type == 'londiste.remove-table': self.flush_sql(dst_curs) self.remove_set_table(dst_curs, ev.data) elif ev.type == 'londiste.remove-seq': self.flush_sql(dst_curs) self.remove_set_seq(dst_curs, ev.data) elif ev.type == 'londiste.update-seq': self.flush_sql(dst_curs) self.update_seq(dst_curs, ev) else: super().process_remote_event(src_curs, dst_curs, ev) # no point keeping it around longer self.current_event = None def handle_data_event(self, ev: Event, dst_curs: Cursor) -> None: """handle one data event""" t = self.get_table_by_name(ev.extra1) if not t or not t.interesting(ev, self.cur_tick, self.copy_thread, self.copy_table_name): self.stat_increase('ignored_events') return try: p = self.used_plugins[ev.extra1] except KeyError: p = t.get_plugin() self.used_plugins[ev.extra1] = p assert self.batch_info p.prepare_batch(self.batch_info, dst_curs) p.process_event(ev, self.apply_sql, dst_curs) def handle_truncate_event(self, ev: Event, dst_curs: Cursor) -> None: """handle one truncate event""" t = self.get_table_by_name(ev.extra1) if not t or not t.interesting(ev, self.cur_tick, self.copy_thread, self.copy_table_name): self.stat_increase('ignored_events') return fqname = skytools.quote_fqident(t.dest_table) try: p = self.used_plugins[ev.extra1] except KeyError: p = t.get_plugin() self.used_plugins[ev.extra1] = p assert self.batch_info p.prepare_batch(self.batch_info, dst_curs) if p.conf.get('ignore_truncate'): self.log.info("ignoring truncate for %s", fqname) return # # Always use CASCADE, because without it the # operation cannot work with FKeys, on both # slave and master. 
# sql = "TRUNCATE %s CASCADE;" % fqname self.flush_sql(dst_curs) dst_curs.execute(sql) def handle_execute_event(self, ev: Event, dst_curs: Cursor) -> None: """handle one EXECUTE event""" if self.copy_thread: return # parse event fname = ev.extra1 s_attrs = ev.extra2 exec_attrs = ExecAttrs(urlenc=s_attrs) sql = ev.data # fixme: curs? dst_curs.execute("select londiste.set_session_replication_role('local', true)") seq_map = {} q = "select seq_name, local from londiste.get_seq_list(%s) where local" dst_curs.execute(q, [self.queue_name]) for row in dst_curs.fetchall(): seq_map[row['seq_name']] = row['seq_name'] tbl_map = {} for t in self.table_map.values(): tbl_map[t.name] = t.dest_table q = "select * from londiste.execute_start(%s, %s, %s, false, %s)" res = self.exec_cmd(dst_curs, q, [self.queue_name, fname, sql, s_attrs], commit=False) ret = res[0]['ret_code'] if ret > 200: self.log.warning("Skipping execution of '%s'", fname) dst_curs.execute("select londiste.set_session_replication_role('replica', true)") return if exec_attrs.need_execute(dst_curs, tbl_map, seq_map): self.log.info("%s: executing sql") xsql = exec_attrs.process_sql(sql, tbl_map, seq_map) for stmt in skytools.parse_statements(xsql): dst_curs.execute(stmt) else: self.log.info("%s: execution not needed on this node") q = "select * from londiste.execute_finish(%s, %s)" self.exec_cmd(dst_curs, q, [self.queue_name, fname], commit=False) dst_curs.execute("select londiste.set_session_replication_role('replica', true)") def apply_sql(self, sql: str, dst_curs: Cursor) -> None: # how many queries to batch together, drop batching on error limit = 200 if self.work_state == -1: limit = 0 self.sql_list.append(sql) if len(self.sql_list) >= limit: self.flush_sql(dst_curs) def flush_sql(self, dst_curs: Cursor) -> None: """Send all buffered statements to DB.""" if len(self.sql_list) == 0: return buf = "\n".join(self.sql_list) self.sql_list = [] dst_curs.execute(buf) def add_set_table(self, dst_curs: Cursor, tbl: str) -> None: """There was new table added to root, remember it.""" if self.register_only_tables and tbl not in self.register_only_tables: return if self.register_skip_tables and tbl in self.register_skip_tables: return q = "select londiste.global_add_table(%s, %s)" dst_curs.execute(q, [self.set_name, tbl]) def remove_set_table(self, dst_curs: Cursor, tbl: str) -> None: """There was table dropped from root, remember it.""" if tbl in self.table_map: t = self.table_map[tbl] del self.table_map[tbl] self.table_list.remove(t) q = "select londiste.global_remove_table(%s, %s)" dst_curs.execute(q, [self.set_name, tbl]) def remove_set_seq(self, dst_curs: Cursor, seq: str) -> None: """There was seq dropped from root, remember it.""" q = "select londiste.global_remove_seq(%s, %s)" dst_curs.execute(q, [self.set_name, seq]) def setup_local_only_filter(self) -> None: # store event filter if self.local_only: # create list of tables if self.copy_thread: _filterlist = skytools.quote_literal(self.copy_table_name) else: _filterlist = ','.join(map(skytools.quote_literal, self.table_map.keys())) # build filter cond_list = [ "ev_type like 'pgq.%'", "ev_type like 'londiste.%'", ] if not self.local_only_drop_execute: cond_list.append("ev_type = 'EXECUTE'") if _filterlist: cond_list.append(f"ev_extra1 in ({_filterlist})") expr = " or ".join(cond_list) self.consumer_filter = f"({expr})" else: # no filter self.consumer_filter = None def load_table_state(self, curs: Cursor) -> None: """Load table state from database. 
Todo: if all tables are OK, there is no need to load state on every batch. """ q = "select * from londiste.get_table_list(%s)" curs.execute(q, [self.set_name]) new_list = [] new_map = {} for row in curs.fetchall(): if not row['local']: continue t = self.get_table_by_name(row['table_name']) if not t: t = TableState(row['table_name'], self.log) t.loaded_state(row) new_list.append(t) new_map[t.name] = t self.table_list = new_list self.table_map = new_map self.fill_copy_method() self.setup_local_only_filter() def refresh_state(self, dst_db: Connection, full_logic: bool = True) -> DictRow: res = super().refresh_state(dst_db, full_logic=full_logic) # make sure local_only filter is loaded on boot if self.local_only and self.consumer_filter is None: self.load_table_state(dst_db.cursor()) dst_db.commit() return res def get_state_map(self, curs: Cursor) -> Dict[str, TableState]: """Get dict of table states.""" q = "select * from londiste.get_table_list(%s)" curs.execute(q, [self.set_name]) new_map = {} for row in curs.fetchall(): if not row['local']: continue t = TableState(row['table_name'], self.log) t.loaded_state(row) new_map[t.name] = t return new_map def save_table_state(self, curs: Cursor) -> None: """Store changed table state in database.""" for t in self.table_list: # backwards compat: move plugin-only dest_table to table_info if t.plugin and t.dest_table != t.plugin.dest_table: self.log.info("Overwriting .dest_table from plugin: tbl=%s dst=%s", t.name, t.plugin.dest_table) q = "update londiste.table_info set dest_table = %s"\ " where queue_name = %s and table_name = %s" curs.execute(q, [t.plugin.dest_table, self.set_name, t.name]) if not t.changed: continue merge_state = t.render_state() self.log.info("storing state of %s: copy:%d new_state:%s", t.name, self.copy_thread, merge_state) q = "select londiste.local_set_table_state(%s, %s, %s, %s)" curs.execute(q, [self.set_name, t.name, t.str_snapshot, merge_state]) t.changed = 0 def change_table_state(self, dst_db: Connection, tbl: TableState, state: int, tick_id: Optional[int] = None) -> None: """Chage state for table.""" tbl.change_state(state, tick_id) self.save_table_state(dst_db.cursor()) dst_db.commit() self.log.info("Table %s status changed to '%s'", tbl.name, tbl.render_state()) def get_tables_in_state(self, state: int) -> Iterator[TableState]: "get all tables with specific state" for t in self.table_list: if t.state == state: yield t def get_table_by_name(self, name: str) -> Optional[TableState]: """Returns cached state object.""" if name.find('.') < 0: name = "public.%s" % name if name in self.table_map: return self.table_map[name] return None def launch_copy(self, tbl_stat: TableState) -> None: """Run parallel worker for copy.""" self.log.info("Launching copy process") main_exe = sys.argv[0] conf = self.cf.filename or 'undefined' cmd: List[str] = [main_exe, conf, 'copy', tbl_stat.name, '-d'] # pass same verbosity options as main script got if self.options.quiet: cmd.append('-q') if self.options.verbose: cmd += ['-v'] * self.options.verbose # let existing copy finish and clean its pidfile, # otherwise new copy will exit immediately. # FIXME: should not happen on per-table pidfile ??? 
copy_pidfile = "%s.copy.%s" % (self.pidfile, tbl_stat.name) while skytools.signal_pidfile(copy_pidfile, 0): self.log.warning("Waiting for existing copy to exit") time.sleep(2) # launch and wait for daemonization result self.log.debug("Launch args: %r", cmd) res = os.spawnvp(os.P_WAIT, main_exe, cmd) self.log.debug("Launch result: %r", res) if res != 0: self.log.error("Failed to launch copy process, result=%d", res) def sync_database_encodings(self, src_db: Connection, dst_db: Connection) -> None: """Make sure client_encoding is same on both side.""" try: # psycopg2 if src_db.encoding != dst_db.encoding: dst_db.set_client_encoding(src_db.encoding) except AttributeError: # psycopg1 src_curs = src_db.cursor() dst_curs = dst_db.cursor() src_curs.execute("show client_encoding") src_enc = src_curs.fetchone()[0] dst_curs.execute("show client_encoding") dst_enc = dst_curs.fetchone()[0] if src_enc != dst_enc: dst_curs.execute("set client_encoding = %s", [src_enc]) def copy_snapshot_cleanup(self, dst_db: Connection) -> None: """Remove unnecessary snapshot info from tables.""" no_lag = not self.work_state changes = False for t in self.table_list: t.gc_snapshot(self.copy_thread, self.prev_tick, self.cur_tick, no_lag) if t.changed: changes = True if changes: self.save_table_state(dst_db.cursor()) dst_db.commit() def restore_fkeys(self, dst_db: Connection) -> None: """Restore fkeys that have both tables on sync.""" dst_curs = dst_db.cursor() # NOT VALID appreared in 9.1 q = "select londiste.version() as ext_version, current_setting('server_version_num')::int < 90100 as compat" dst_curs.execute(q) info = dst_curs.fetchone() if info[0]: ext_version = [int(v) for v in info[0].split('.')] do_compat_restore = ext_version < [3, 7] or info[1] else: do_compat_restore = True # restore fkeys -- one at a time q = "select * from londiste.get_valid_pending_fkeys(%s)" dst_curs.execute(q, [self.set_name]) fkey_list = dst_curs.fetchall() dst_db.commit() for row in fkey_list: self.log.info('Creating fkey: %s (%s --> %s)', row['fkey_name'], row['from_table'], row['to_table']) if do_compat_restore: q2 = "select londiste.restore_table_fkey(%s, %s)" dst_curs.execute(q2, [row['from_table'], row['fkey_name']]) dst_db.commit() else: q3 = "select londiste.restore_table_fkey(%s, %s, true)" done = False while not done: dst_curs.execute(q3, [row['from_table'], row['fkey_name']]) sql = dst_curs.fetchone()[0] if sql: dst_curs.execute(sql) else: done = True dst_db.commit() def drop_fkeys(self, dst_db: Connection, table_name: str) -> None: """Drop all foreign keys to and from this table. They need to be dropped one at a time to avoid deadlocks with user code. 
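The dropped keys are remembered as pending and re-created by restore_fkeys() once both ends of the key are in sync.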
""" dst_curs = dst_db.cursor() q = "select * from londiste.find_table_fkeys(%s)" dst_curs.execute(q, [table_name]) fkey_list = dst_curs.fetchall() for row in fkey_list: self.log.info('Dropping fkey: %s', row['fkey_name']) q2 = "select londiste.drop_table_fkey(%s, %s)" dst_curs.execute(q2, [row['from_table'], row['fkey_name']]) dst_db.commit() def process_root_node(self, dst_db: Connection) -> None: """On root node send seq changes to queue.""" super().process_root_node(dst_db) q = "select * from londiste.root_check_seqs(%s)" self.exec_cmd(dst_db, q, [self.queue_name]) def update_seq(self, dst_curs: Cursor, ev: Event) -> None: if self.copy_thread: return val = int(ev.data) seq = ev.extra1 q = "select * from londiste.global_update_seq(%s, %s, %s)" self.exec_cmd(dst_curs, q, [self.queue_name, seq, val]) def copy_event(self, dst_curs: Cursor, ev: Event, filtered_copy: int) -> None: # filtered_copy means merge-leaf # send only data events down (skipping seqs also) if filtered_copy: if ev.type[:9] in ('londiste.',): return if is_data_event(ev): t = self.get_table_by_name(ev.extra1) if t: try: p = self.used_plugins[ev.extra1] except KeyError: p = t.get_plugin() self.used_plugins[ev.extra1] = p p.prepare_batch(None, dst_curs) # handler may rewrite or drop the event ev2 = p.get_copy_event(ev, self.queue_name) if ev2 is None: return ev = ev2 super().copy_event(dst_curs, ev, filtered_copy) def exception_hook(self, det: Exception, emsg: str) -> None: # add event info to error message if self.current_event: ev = self.current_event info = "[ev_id=%d,ev_txid=%d] " % (ev.ev_id, ev.ev_txid) emsg = info + emsg super().exception_hook(det, emsg) if __name__ == '__main__': script = Replicator(sys.argv[1:]) script.start() londiste-3.12/londiste/repair.py000066400000000000000000000277441447267722200167700ustar00rootroot00000000000000"""Repair data on subscriber. Walks tables by primary key and searches for missing inserts/updates/deletes. 
""" import os import optparse import subprocess import sys from typing import Optional, Dict, List, Sequence, Any, IO import skytools from skytools.basetypes import Cursor, Connection from londiste.syncer import Syncer, ATable __all__ = ['Repairer'] def unescape(s: str) -> Optional[str]: """Remove copy escapes.""" return skytools.unescape_copy(s) class Repairer(Syncer): """Walks tables in primary key order and checks if data matches.""" cnt_insert: int = 0 cnt_update: int = 0 cnt_delete: int = 0 total_src: int = 0 total_dst: int = 0 pkey_list: List[str] = [] common_fields: List[str] = [] apply_curs: Optional[Cursor] = None fq_common_fields: Sequence[str] = () def init_optparse(self, p: Optional[optparse.OptionParser] = None) -> optparse.OptionParser: """Initialize cmdline switches.""" p = super().init_optparse(p) p.add_option("--apply", action="store_true", help="apply fixes") p.add_option("--sort-bufsize", help="buffer for coreutils sort") p.add_option("--repair-where", help="where condition for selecting data") return p def process_sync(self, t1: ATable, t2: ATable, src_db: Connection, dst_db: Connection) -> int: """Actual comparison.""" apply_db: Optional[Connection] = None if self.options.apply: apply_db = self.get_database('db', cache='applydb', autocommit=1) self.apply_curs = apply_db.cursor() self.apply_curs.execute("select londiste.set_session_replication_role('replica', false)") src_tbl = t1.dest_table dst_tbl = t2.dest_table src_curs = src_db.cursor() dst_curs = dst_db.cursor() self.log.info('Checking %s', dst_tbl) self.common_fields = [] self.fq_common_fields = [] self.pkey_list = [] self.load_common_columns(src_tbl, dst_tbl, src_curs, dst_curs) dump_src = dst_tbl + ".src" dump_dst = dst_tbl + ".dst" dump_src_sorted = dump_src + ".sorted" dump_dst_sorted = dump_dst + ".sorted" dst_where = t2.plugin.get_copy_condition(src_curs, dst_curs) src_where = dst_where if self.options.repair_where and src_where: dst_where = src_where = src_where + ' and ' + self.options.repair_where elif self.options.repair_where: dst_where = src_where = self.options.repair_where self.log.info("Dumping src table: %s %s", src_tbl, src_where) self.dump_table(src_tbl, src_curs, dump_src, src_where) src_db.commit() self.log.info("Dumping dst table: %s %s", dst_tbl, dst_where) self.dump_table(dst_tbl, dst_curs, dump_dst, dst_where) dst_db.commit() self.log.info("Sorting src table: %s", dump_src) self.do_sort(dump_src, dump_src_sorted) self.log.info("Sorting dst table: %s", dump_dst) self.do_sort(dump_dst, dump_dst_sorted) self.dump_compare(dst_tbl, dump_src_sorted, dump_dst_sorted) os.unlink(dump_src) os.unlink(dump_dst) os.unlink(dump_src_sorted) os.unlink(dump_dst_sorted) return 0 def do_sort(self, src: str, dst: str) -> None: """ Sort contents of src file, write them to dst file. 
""" with subprocess.Popen(["sort", "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p: s_ver = p.communicate()[0].decode('utf8', 'replace') xenv = os.environ.copy() xenv['LANG'] = 'C' xenv['LC_ALL'] = 'C' cmdline = ['sort', '-T', '.'] if s_ver.find("coreutils") > 0: cmdline.append('-S') if self.options.sort_bufsize: cmdline.append(self.options.sort_bufsize) else: cmdline.append('30%') cmdline.append('-o') cmdline.append(dst) cmdline.append(src) with subprocess.Popen(cmdline, env=xenv) as p: if p.wait() != 0: raise Exception('sort failed') def load_common_columns(self, src_tbl: str, dst_tbl: str, src_curs: Cursor, dst_curs: Cursor) -> None: """Get common fields, put pkeys in start.""" self.pkey_list = skytools.get_table_pkeys(src_curs, src_tbl) dst_pkey = skytools.get_table_pkeys(dst_curs, dst_tbl) if dst_pkey != self.pkey_list: self.log.error('pkeys do not match') sys.exit(1) src_cols = skytools.get_table_columns(src_curs, src_tbl) dst_cols = skytools.get_table_columns(dst_curs, dst_tbl) field_list = [] for f in self.pkey_list: field_list.append(f) for f in src_cols: if f in self.pkey_list: continue if f in dst_cols: field_list.append(f) self.common_fields = field_list fqlist = [skytools.quote_ident(col) for col in field_list] self.fq_common_fields = fqlist cols = ",".join(fqlist) self.log.debug("using columns: %s", cols) def dump_table(self, tbl: str, curs: Cursor, fn: str, whr: str) -> None: """Dump table to disk.""" cols = ','.join(self.fq_common_fields) if len(whr) == 0: whr = 'true' q = "copy (SELECT %s FROM %s WHERE %s) to stdout" % (cols, skytools.quote_fqident(tbl), whr) self.log.debug("Query: %s", q) with open(fn, "w", 64 * 1024, encoding="utf8") as f: curs.copy_expert(q, f) size = f.tell() self.log.info('%s: Got %d bytes', tbl, size) def get_row(self, ln: str) -> Dict[str, str]: """Parse a row into dict.""" t = ln[:-1].split('\t') row = {} for i, fname in enumerate(self.common_fields): row[fname] = t[i] return row def dump_compare(self, tbl: str, src_fn: str, dst_fn: str) -> None: """ Compare two table dumps, create sql file to fix target table or apply changes to target table directly. 
""" with open(src_fn, "r", 64 * 1024, encoding="utf8") as f1: with open(dst_fn, "r", 64 * 1024, encoding="utf8") as f2: self.dump_compare_streams(tbl, f1, f2) def dump_compare_streams(self, tbl: str, f1: IO[str], f2: IO[str]) -> None: self.log.info("Comparing dumps: %s", tbl) self.cnt_insert = 0 self.cnt_update = 0 self.cnt_delete = 0 self.total_src = 0 self.total_dst = 0 src_ln = f1.readline() dst_ln = f2.readline() if src_ln: self.total_src += 1 if dst_ln: self.total_dst += 1 fix = "fix.%s.sql" % tbl if os.path.isfile(fix): os.unlink(fix) while src_ln or dst_ln: keep_src = keep_dst = 0 if src_ln != dst_ln: src_row = self.get_row(src_ln) dst_row = self.get_row(dst_ln) diff = self.cmp_keys(src_row, dst_row) if diff > 0: # src > dst self.got_missed_delete(tbl, dst_row) keep_src = 1 elif diff < 0: # src < dst self.got_missed_insert(tbl, src_row) keep_dst = 1 else: if self.cmp_data(src_row, dst_row) != 0: self.got_missed_update(tbl, src_row, dst_row) if not keep_src: src_ln = f1.readline() if src_ln: self.total_src += 1 if not keep_dst: dst_ln = f2.readline() if dst_ln: self.total_dst += 1 self.log.info("finished %s: src: %d rows, dst: %d rows," " missed: %d inserts, %d updates, %d deletes", tbl, self.total_src, self.total_dst, self.cnt_insert, self.cnt_update, self.cnt_delete) def got_missed_insert(self, tbl: str, src_row: Dict[str, str]) -> None: """Create sql for missed insert.""" self.cnt_insert += 1 fld_list = self.common_fields fq_list = [] val_list = [] for f in fld_list: fq_list.append(skytools.quote_ident(f)) v = unescape(src_row[f]) val_list.append(skytools.quote_literal(v)) q = "insert into %s (%s) values (%s);" % ( tbl, ", ".join(fq_list), ", ".join(val_list)) self.show_fix(tbl, q, 'insert') def got_missed_update(self, tbl: str, src_row: Dict[str, str], dst_row: Dict[str, str]) -> None: """Create sql for missed update.""" self.cnt_update += 1 fld_list = self.common_fields set_list: List[str] = [] whe_list: List[str] = [] for f in self.pkey_list: self.addcmp(whe_list, skytools.quote_ident(f), unescape(src_row[f])) for f in fld_list: v1 = src_row[f] v2 = dst_row[f] if self.cmp_value(v1, v2) == 0: continue self.addeq(set_list, skytools.quote_ident(f), unescape(v1)) self.addcmp(whe_list, skytools.quote_ident(f), unescape(v2)) q = "update only %s set %s where %s;" % ( tbl, ", ".join(set_list), " and ".join(whe_list)) self.show_fix(tbl, q, 'update') def got_missed_delete(self, tbl: str, dst_row: Dict[str, str]) -> None: """Create sql for missed delete.""" self.cnt_delete += 1 whe_list: List[str] = [] for f in self.pkey_list: self.addcmp(whe_list, skytools.quote_ident(f), unescape(dst_row[f])) q = "delete from only %s where %s;" % (skytools.quote_fqident(tbl), " and ".join(whe_list)) self.show_fix(tbl, q, 'delete') def show_fix(self, tbl: str, q: str, desc: str) -> None: """Print/write/apply repair sql.""" self.log.debug("missed %s: %s", desc, q) if self.apply_curs: self.apply_curs.execute(q) else: fn = "fix.%s.sql" % tbl with open(fn, "a", encoding="utf8") as f: f.write("%s\n" % q) def addeq(self, dst_list: List[str], f: str, v: Any) -> None: """Add quoted SET.""" vq = skytools.quote_literal(v) s = "%s = %s" % (f, vq) dst_list.append(s) def addcmp(self, dst_list: List[str], f: str, v: Any) -> None: """Add quoted comparison.""" if v is None: s = "%s is null" % f else: vq = skytools.quote_literal(v) s = "%s = %s" % (f, vq) dst_list.append(s) def cmp_data(self, src_row: Dict[str, str], dst_row: Dict[str, str]) -> int: """Compare data field-by-field.""" for k in self.common_fields: v1 = 
src_row[k] v2 = dst_row[k] if self.cmp_value(v1, v2) != 0: return -1 return 0 def cmp_value(self, v1: str, v2: str) -> int: """Compare single field, tolerates tz vs notz dates.""" if v1 == v2: return 0 # try to work around tz vs. notz z1 = len(v1) z2 = len(v2) if z1 == z2 + 3 and z2 >= 19 and v1[z2] == '+': v1 = v1[:-3] if v1 == v2: return 0 elif z1 + 3 == z2 and z1 >= 19 and v2[z1] == '+': v2 = v2[:-3] if v1 == v2: return 0 return -1 def cmp_keys(self, src_row: Dict[str, str], dst_row: Dict[str, str]) -> int: """Compare primary keys of the rows. Returns 1 if src > dst, -1 if src < dst and 0 if src == dst""" # None means table is done. tag it larger than any existing row. if src_row is None: if dst_row is None: return 0 return 1 elif dst_row is None: return -1 for k in self.pkey_list: v1 = src_row[k] v2 = dst_row[k] if v1 < v2: return -1 elif v1 > v2: return 1 return 0 londiste-3.12/londiste/syncer.py000066400000000000000000000342031447267722200167750ustar00rootroot00000000000000"""Catch moment when tables are in sync on master and slave. """ from typing import Dict, Tuple, List, Optional, Sequence import sys import time import optparse import skytools from skytools.basetypes import Cursor, Connection, DictRow from londiste.handler import build_handler, BaseHandler from londiste.handlers import load_handler_modules from londiste.util import find_copy_source class ATable: table_name: str dest_table: str merge_state: str table_attrs: Dict[str, str] plugin: BaseHandler def __init__(self, row: DictRow): self.table_name = row['table_name'] self.dest_table = row['dest_table'] or row['table_name'] self.merge_state = row['merge_state'] attrs = row['table_attrs'] or '' self.table_attrs = {k: v for k, v in skytools.db_urldecode(attrs).items() if v is not None} hstr = self.table_attrs.get('handler', '') self.plugin = build_handler(self.table_name, hstr, row['dest_table']) class Syncer(skytools.DBScript): """Walks tables in primary key order and checks if data matches.""" bad_tables = 0 provider_info: Optional[DictRow] = None downstream_worker_name: Optional[str] = None old_worker_paused: Optional[bool] = None def __init__(self, args: Sequence[str]) -> None: """Syncer init.""" super().__init__('londiste', args) self.set_single_loop(1) # compat names self.queue_name = self.cf.get("pgq_queue_name", '') self.consumer_name = self.cf.get('pgq_consumer_id', '') # good names if not self.queue_name: self.queue_name = self.cf.get("queue_name") if not self.consumer_name: self.consumer_name = self.cf.get('consumer_name', self.job_name) self.lock_timeout = self.cf.getfloat('lock_timeout', 10) if self.pidfile: self.pidfile += ".repair" load_handler_modules(self.cf) def set_lock_timeout(self, curs: Cursor) -> None: ms = int(1000 * self.lock_timeout) if ms > 0: q = "SET LOCAL statement_timeout = %d" % ms self.log.debug(q) curs.execute(q) def init_optparse(self, p: Optional[optparse.OptionParser] = None) -> optparse.OptionParser: """Initialize cmdline switches.""" p = super().init_optparse(p) p.add_option("--force", action="store_true", help="ignore lag") return p def get_provider_info(self, setup_curs: Cursor) -> DictRow: q = "select ret_code, ret_note, node_name, node_type, worker_name"\ " from pgq_node.get_node_info(%s)" res = self.exec_cmd(setup_curs, q, [self.queue_name]) pnode = res[0] self.log.info('Provider: %s (%s)', pnode['node_name'], pnode['node_type']) return pnode def check_consumer(self, setup_db: Connection, dst_db: Connection) -> None: """Before locking anything check if consumer is working ok.""" 
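        # Compare the age of the consumer's last completed tick against the queue ticker lag
        # and proceed only once the difference is small (the loop breaks at < ticker_lag + 5s).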
setup_curs = setup_db.cursor() c = 0 while True: q = "select * from pgq_node.get_consumer_state(%s, %s)" res = self.exec_cmd(dst_db, q, [self.queue_name, self.consumer_name]) completed_tick = res[0]['completed_tick'] q = "select extract(epoch from ticker_lag) from pgq.get_queue_info(%s)" setup_curs.execute(q, [self.queue_name]) ticker_lag = setup_curs.fetchone()[0] q = "select extract(epoch from (now() - t.tick_time)) as lag"\ " from pgq.tick t, pgq.queue q"\ " where q.queue_name = %s"\ " and t.tick_queue = q.queue_id"\ " and t.tick_id = %s" setup_curs.execute(q, [self.queue_name, completed_tick]) res = setup_curs.fetchall() if len(res) == 0: self.log.warning('Consumer completed_tick (%d) to not exists on provider (%s), too big lag?', completed_tick, self.provider_info['node_name'] if self.provider_info else "???") self.sleep(10) continue consumer_lag = res[0][0] if consumer_lag < ticker_lag + 5: break lag_msg = 'Consumer lag: %s, ticker_lag %s, too big difference, waiting' if c % 30 == 0: self.log.warning(lag_msg, consumer_lag, ticker_lag) else: self.log.debug(lag_msg, consumer_lag, ticker_lag) c += 1 time.sleep(1) def get_tables(self, db: Connection) -> Tuple[Dict[str, ATable], List[str]]: """Load table info. Returns tuple of (dict(name->ATable), namelist)""" curs = db.cursor() q = "select table_name, merge_state, dest_table, table_attrs"\ " from londiste.get_table_list(%s) where local" curs.execute(q, [self.queue_name]) rows = curs.fetchall() db.commit() res: Dict[str, ATable] = {} names: List[str] = [] for row in rows: t = ATable(row) res[t.table_name] = t names.append(t.table_name) return res, names def work(self) -> Optional[int]: """Syncer main function.""" # 'SELECT 1' and COPY must use same snapshot, so change isolation level. dst_db = self.get_database('db', isolation_level=skytools.I_REPEATABLE_READ) pnode, ploc = self.get_provider_location(dst_db) dst_tables, names = self.get_tables(dst_db) if len(self.args) > 2: tlist = self.args[2:] else: tlist = names for tbl in tlist: tbl = skytools.fq_name(tbl) if tbl not in dst_tables: self.log.warning('Table not subscribed: %s', tbl) continue t2 = dst_tables[tbl] if t2.merge_state != 'ok': self.log.warning('Table %s not synced yet, no point', tbl) continue pnode, ploc, wname = find_copy_source(self, self.queue_name, tbl, pnode, ploc) self.log.info('%s: Using node %s as provider', tbl, pnode) if wname is None: wname = self.consumer_name self.downstream_worker_name = wname self.process_one_table(tbl, t2, dst_db, pnode, ploc) # signal caller about bad tables sys.exit(self.bad_tables) def process_one_table(self, tbl: str, t2: ATable, dst_db: Connection, provider_node: str, provider_loc: str) -> None: lock_db = self.get_database('lock_db', connstr=provider_loc, profile='remote') setup_db = self.get_database('setup_db', autocommit=1, connstr=provider_loc, profile='remote') src_db = self.get_database('provider_db', connstr=provider_loc, profile='remote', isolation_level=skytools.I_REPEATABLE_READ) setup_curs = setup_db.cursor() # provider node info self.provider_info = self.get_provider_info(setup_curs) src_tables, _ = self.get_tables(src_db) if tbl not in src_tables: self.log.warning('Table not available on provider: %s', tbl) return t1 = src_tables[tbl] if t1.merge_state != 'ok': self.log.warning('Table %s not ready yet on provider', tbl) return #self.check_consumer(setup_db, dst_db) self.check_table(t1, t2, lock_db, src_db, dst_db, setup_db) lock_db.commit() src_db.commit() dst_db.commit() self.close_database('setup_db') 
self.close_database('lock_db') self.close_database('provider_db') def force_tick(self, setup_curs: Cursor, wait: bool=True) -> int: q = "select pgq.force_tick(%s)" setup_curs.execute(q, [self.queue_name]) res = setup_curs.fetchone() cur_pos = res[0] if not wait: return cur_pos #start = time.time() while True: time.sleep(0.5) setup_curs.execute(q, [self.queue_name]) res = setup_curs.fetchone() if res[0] != cur_pos: # new pos return res[0] # dont loop more than 10 secs #dur = time.time() - start #if dur > 10 and not self.options.force: # raise Exception("Ticker seems dead") def check_table(self, t1: ATable, t2: ATable, lock_db: Connection, src_db: Connection, dst_db: Connection, setup_db: Connection) -> None: """Get transaction to same state, then process.""" src_tbl = t1.dest_table dst_tbl = t2.dest_table src_curs = src_db.cursor() dst_curs = dst_db.cursor() if not skytools.exists_table(src_curs, src_tbl): self.log.warning("Table %s does not exist on provider side", src_tbl) return if not skytools.exists_table(dst_curs, dst_tbl): self.log.warning("Table %s does not exist on subscriber side", dst_tbl) return # lock table against changes try: if self.provider_info and self.provider_info['node_type'] == 'root': self.lock_table_root(lock_db, setup_db, dst_db, src_tbl, dst_tbl) else: self.lock_table_branch(lock_db, setup_db, dst_db, src_tbl, dst_tbl) # take snapshot on provider side src_db.commit() src_curs.execute("SELECT 1") # take snapshot on subscriber side dst_db.commit() dst_curs.execute("SELECT 1") finally: # release lock if self.provider_info and self.provider_info['node_type'] == 'root': self.unlock_table_root(lock_db, setup_db) else: self.unlock_table_branch(lock_db, setup_db) # do work bad = self.process_sync(t1, t2, src_db, dst_db) if bad: self.bad_tables += 1 # done src_db.commit() dst_db.commit() def lock_table_root(self, lock_db: Connection, setup_db: Connection, dst_db: Connection, src_tbl: str, dst_tbl: str) -> None: setup_curs = setup_db.cursor() lock_curs = lock_db.cursor() # lock table in separate connection self.log.info('Locking %s', src_tbl) lock_db.commit() self.set_lock_timeout(lock_curs) lock_time = time.time() lock_curs.execute("LOCK TABLE %s IN SHARE MODE" % skytools.quote_fqident(src_tbl)) # now wait until consumer has updated target table until locking self.log.info('Syncing %s', dst_tbl) # consumer must get further than this tick tick_id = self.force_tick(setup_curs) # try to force second tick also self.force_tick(setup_curs) # now wait while True: time.sleep(0.5) q = "select * from pgq_node.get_node_info(%s)" res = self.exec_cmd(dst_db, q, [self.queue_name]) last_tick = res[0]['worker_last_tick'] if last_tick > tick_id: break # limit lock time if time.time() > lock_time + self.lock_timeout and not self.options.force: self.log.error('Consumer lagging too much, exiting') lock_db.rollback() sys.exit(1) def unlock_table_root(self, lock_db: Connection, setup_db: Connection) -> None: lock_db.commit() def lock_table_branch(self, lock_db: Connection, setup_db: Connection, dst_db: Connection, src_tbl: str, dst_tbl: str) -> None: setup_curs = setup_db.cursor() lock_time = time.time() assert self.provider_info self.old_worker_paused = self.pause_consumer(setup_curs, self.provider_info['worker_name']) self.log.info('Syncing %s', dst_tbl) # consumer must get further than this tick tick_id = self.force_tick(setup_curs, False) # now wait while True: time.sleep(0.5) q = "select * from pgq_node.get_node_info(%s)" res = self.exec_cmd(dst_db, q, [self.queue_name]) last_tick = 
res[0]['worker_last_tick'] if last_tick > tick_id: break # limit lock time if time.time() > lock_time + self.lock_timeout and not self.options.force: self.log.error('Consumer lagging too much, exiting') lock_db.rollback() sys.exit(1) def unlock_table_branch(self, lock_db: Connection, setup_db: Connection) -> None: # keep worker paused if it was so before if self.old_worker_paused: return assert self.provider_info setup_curs = setup_db.cursor() self.resume_consumer(setup_curs, self.provider_info['worker_name']) def process_sync(self, t1: ATable, t2: ATable, src_db: Connection, dst_db: Connection) -> int: """It gets 2 connections in state where tbl should be in same state. """ raise Exception('process_sync not implemented') def get_provider_location(self, dst_db: Connection) -> Tuple[str, str]: q = "select * from pgq_node.get_node_info(%s)" rows = self.exec_cmd(dst_db, q, [self.queue_name]) return (rows[0]['provider_node'], rows[0]['provider_location']) def pause_consumer(self, curs: Cursor, cons_name: str) -> bool: self.log.info("Pausing upstream worker: %s", cons_name) return self.set_pause_flag(curs, cons_name, True) def resume_consumer(self, curs: Cursor, cons_name: str) -> bool: self.log.info("Resuming upstream worker: %s", cons_name) return self.set_pause_flag(curs, cons_name, False) def set_pause_flag(self, curs: Cursor, cons_name: str, flag: bool) -> bool: q = "select * from pgq_node.get_consumer_state(%s, %s)" res = self.exec_cmd(curs, q, [self.queue_name, cons_name]) oldflag = res[0]['paused'] q = "select * from pgq_node.set_consumer_paused(%s, %s, %s)" self.exec_cmd(curs, q, [self.queue_name, cons_name, flag]) while True: q = "select * from pgq_node.get_consumer_state(%s, %s)" res = self.exec_cmd(curs, q, [self.queue_name, cons_name]) if res[0]['uptodate']: break time.sleep(0.5) return oldflag londiste-3.12/londiste/table_copy.py000066400000000000000000000250751447267722200176220ustar00rootroot00000000000000"""Do a full table copy. For internal usage. """ from typing import Sequence, Optional import sys import time import skytools from skytools.basetypes import Cursor, Connection from skytools.dbstruct import ( T_CONSTRAINT, T_INDEX, T_PARENT, T_RULE, TableStruct, ) from londiste.playback import TABLE_CATCHING_UP, TABLE_OK, Replicator, TableState from londiste.util import find_copy_source __all__ = ['CopyTable'] class CopyTable(Replicator): """Table copy thread implementation.""" reg_ok = False old_consumer_name: str copy_table_name: str def __init__(self, args: Sequence[str], copy_thread: bool = True) -> None: """Initializer. copy_thread arg shows if the copy process is separate from main Playback thread or not. copy_thread=0 means copying happens in same process. 
""" super().__init__(args) if not copy_thread: raise Exception("Combined copy not supported") if len(self.args) != 3: self.log.error("londiste copy requires table name") sys.exit(1) self.copy_table_name = self.args[2] sfx = self.get_copy_suffix(self.copy_table_name) self.old_consumer_name = self.consumer_name if self.pidfile: self.pidfile += sfx self.consumer_name += sfx self.copy_thread = True self.main_worker = False def get_copy_suffix(self, tblname: str) -> str: return ".copy.%s" % tblname def reload_table_stat(self, dst_curs: Cursor, tblname: str) -> TableState: self.load_table_state(dst_curs) if tblname not in self.table_map: self.log.warning('Table %s removed from replication', tblname) sys.exit(1) t = self.table_map[tblname] return t def do_copy(self, tbl_stat: TableState, src_db: Connection, dst_db: Connection) -> None: """Entry point into copying logic.""" dst_db.commit() src_curs = src_db.cursor() dst_curs = dst_db.cursor() while True: if tbl_stat.copy_role == 'wait-copy': self.log.info('waiting for first partition to initialize copy') elif tbl_stat.max_parallel_copies_reached(): self.log.info('number of max parallel copies (%s) reached', tbl_stat.max_parallel_copy) else: break time.sleep(10) tbl_stat = self.reload_table_stat(dst_curs, tbl_stat.name) dst_db.commit() while True: pmap = self.get_state_map(src_db.cursor()) src_db.commit() if tbl_stat.name not in pmap: raise Exception("table %s not available on provider" % tbl_stat.name) pt = pmap[tbl_stat.name] if pt.state == TABLE_OK: break self.log.warning("table %s not in sync yet on provider, waiting", tbl_stat.name) time.sleep(10) src_real_table = pt.dest_table # 0 - dont touch # 1 - single tx # 2 - multi tx cmode = 1 if tbl_stat.copy_role == 'lead': cmode = 2 elif tbl_stat.copy_role: cmode = 0 # We need to see COPY snapshot from txid_current_snapshot() later. oldiso = src_db.isolation_level src_db.set_isolation_level(skytools.I_REPEATABLE_READ) src_db.commit() self.sync_database_encodings(src_db, dst_db) self.log.info("Starting full copy of %s", tbl_stat.name) # just in case, drop all fkeys (in case "replay" was skipped) # !! this may commit, so must be done before anything else !! 
if cmode > 0: self.drop_fkeys(dst_db, tbl_stat.dest_table) # now start ddl-dropping tx if cmode > 0: q = "lock table " + skytools.quote_fqident(tbl_stat.dest_table) dst_curs.execute(q) # find dst struct src_struct = TableStruct(src_curs, src_real_table) dst_struct = TableStruct(dst_curs, tbl_stat.dest_table) # take common columns, warn on missing ones dlist = dst_struct.get_column_list() slist = src_struct.get_column_list() common_cols = [] for c in slist: if c not in dlist: self.log.warning("Table %s column %s does not exist on subscriber", tbl_stat.name, c) else: common_cols.append(c) for c in dlist: if c not in slist: self.log.warning("Table %s column %s does not exist on provider", tbl_stat.name, c) # drop unnecessary stuff if cmode > 0: objs = T_CONSTRAINT | T_INDEX | T_RULE | T_PARENT # | T_TRIGGER dst_struct.drop(dst_curs, objs, log=self.log) # drop data if tbl_stat.table_attrs.get('skip_truncate'): self.log.info("%s: skipping truncate", tbl_stat.name) else: self.log.info("%s: truncating", tbl_stat.name) q = "truncate " if dst_db.server_version >= 80400: q += "only " q += skytools.quote_fqident(tbl_stat.dest_table) dst_curs.execute(q) if cmode == 2 and tbl_stat.dropped_ddl is None: ddl: Optional[str] = dst_struct.get_create_sql(objs) if ddl: q = "select * from londiste.local_set_table_struct(%s, %s, %s)" self.exec_cmd(dst_curs, q, [self.queue_name, tbl_stat.name, ddl]) else: ddl = None dst_db.commit() tbl_stat.dropped_ddl = ddl use_threads = False threaded_copy_pool_size = self.copy_method_map.get(tbl_stat.name) if threaded_copy_pool_size is not None: use_threads = True # do copy p = tbl_stat.get_plugin() if use_threads: assert threaded_copy_pool_size self.log.info("%s: start threaded copy", tbl_stat.name) dst_db_connstr = self.db_cache["db"].loc dst_db.commit() assert self.cf.filename assert self.cf.main_section stats = p.real_copy_threaded( src_real_table, src_curs, dst_db_connstr, common_cols, config_file=self.cf.filename, config_section=self.cf.main_section, parallel=threaded_copy_pool_size, ) else: self.log.info("%s: start copy", tbl_stat.name) stats = p.real_copy(src_real_table, src_curs, dst_curs, common_cols) if stats: self.log.info("%s: copy finished: %d bytes, %d rows", tbl_stat.name, stats[0], stats[1]) # get snapshot src_curs.execute("select txid_current_snapshot()") snapshot = src_curs.fetchone()[0] src_db.commit() # restore old behaviour src_db.set_isolation_level(oldiso) src_db.commit() tbl_stat.change_state(TABLE_CATCHING_UP) tbl_stat.change_snapshot(snapshot) self.save_table_state(dst_curs) # create previously dropped objects if cmode == 1: dst_struct.create(dst_curs, objs, log=self.log) elif cmode == 2: dst_db.commit() # start waiting for other copy processes to finish while tbl_stat.copy_role: self.log.info('waiting for other partitions to finish copy') time.sleep(10) tbl_stat = self.reload_table_stat(dst_curs, tbl_stat.name) dst_db.commit() if tbl_stat.dropped_ddl is not None: self.looping = 0 for ddl in skytools.parse_statements(tbl_stat.dropped_ddl): self.log.info(ddl) dst_curs.execute(ddl) q = "select * from londiste.local_set_table_struct(%s, %s, NULL)" self.exec_cmd(dst_curs, q, [self.queue_name, tbl_stat.name]) tbl_stat.dropped_ddl = None self.looping = 1 dst_db.commit() # hack for copy-in-playback if not self.copy_thread: tbl_stat.change_state(TABLE_OK) self.save_table_state(dst_curs) dst_db.commit() # copy finished if tbl_stat.copy_role == 'wait-replay': return # if copy done, request immediate tick from pgqd, # to make state juggling faster. 
on mostly idle db-s # each step may take tickers idle_timeout secs, which is pain. q = "select pgq.force_tick(%s)" src_curs.execute(q, [self.queue_name]) src_db.commit() def work(self) -> Optional[int]: if not self.reg_ok: # check if needed? (table, not existing reg) self.register_copy_consumer() self.reg_ok = True return super().work() def register_copy_consumer(self) -> None: dst_db = self.get_database('db') dst_curs = dst_db.cursor() # fetch table attrs q = "select * from londiste.get_table_list(%s) where table_name = %s" dst_curs.execute(q, [self.queue_name, self.copy_table_name]) rows = dst_curs.fetchall() attrs = {} if len(rows) > 0: v_attrs = rows[0]['table_attrs'] if v_attrs: attrs = skytools.db_urldecode(v_attrs) # fetch parent consumer state q = "select * from pgq_node.get_consumer_state(%s, %s)" rows = self.exec_cmd(dst_db, q, [self.queue_name, self.old_consumer_name]) state = rows[0] source_node = state['provider_node'] source_location = state['provider_location'] # do we have node here? if 'copy_node' in attrs: if attrs['copy_node'] == '?': source_node, source_location, ___wname = find_copy_source( self, self.queue_name, self.copy_table_name, source_node, source_location, ) else: # take node from attrs source_node = attrs['copy_node'] q = "select * from pgq_node.get_queue_locations(%s) where node_name = %s" dst_curs.execute(q, [self.queue_name, source_node]) rows = dst_curs.fetchall() if len(rows): source_location = rows[0]['node_location'] self.log.info("Using '%s' as source node", source_node) self.register_consumer(source_location) if __name__ == '__main__': script = CopyTable(sys.argv[1:]) script.start() londiste-3.12/londiste/util.py000066400000000000000000000241241447267722200164500ustar00rootroot00000000000000"""Misc utilities for COPY code. """ from typing import Optional, Tuple, Union, Sequence, List, Any, Callable, TYPE_CHECKING import io import skytools from skytools.basetypes import Cursor if TYPE_CHECKING: import multiprocessing.connection from _typeshed import ReadableBuffer else: ReadableBuffer = Union[bytes, memoryview] __all__ = ['handler_allows_copy', 'find_copy_source'] WriteHook = Optional[Callable[[Any, str], str]] FlushHook = Optional[Callable[[Any], None]] def handler_allows_copy(table_attrs: Optional[str]) -> bool: """Decide if table is copyable based on attrs.""" import londiste.handler if not table_attrs: return True attrs = skytools.db_urldecode(table_attrs) hstr = attrs.get('handler') or '' p = londiste.handler.build_handler('unused.string', hstr, None) return p.needs_table() def find_copy_source( script: "skytools.DBScript", queue_name: str, copy_table_name: Union[str, Sequence[str]], node_name: str, node_location: str, ) -> Tuple[str, str, Optional[str]]: """Find source node for table. 
@param script: DbScript @param queue_name: name of the cascaded queue @param copy_table_name: name of the table (or list of names) @param node_name: target node name @param node_location: target node location @returns (node_name, node_location, downstream_worker_name) of source node """ # None means no steps upwards were taken, so local consumer is worker worker_name = None if isinstance(copy_table_name, str): need = set([copy_table_name]) else: need = set(copy_table_name) while True: src_db = script.get_database('_source_db', connstr=node_location, autocommit=1, profile='remote') src_curs = src_db.cursor() q = "select * from pgq_node.get_node_info(%s)" src_curs.execute(q, [queue_name]) info = src_curs.fetchone() if info['ret_code'] >= 400: raise skytools.UsageError("Node does not exist") script.log.info("Checking if %s can be used for copy", info['node_name']) q = "select table_name, local, table_attrs from londiste.get_table_list(%s)" src_curs.execute(q, [queue_name]) got = set() for row in src_curs.fetchall(): tbl = row['table_name'] if tbl not in need: continue if not row['local']: script.log.debug("Problem: %s is not local", tbl) continue if not handler_allows_copy(row['table_attrs']): script.log.debug("Problem: %s handler does not store data [%s]", tbl, row['table_attrs']) continue script.log.debug("Good: %s is usable", tbl) got.add(tbl) script.close_database('_source_db') if got == need: script.log.info("Node %s seems good source, using it", info['node_name']) return node_name, node_location, worker_name else: script.log.info("Node %s does not have all tables", info['node_name']) if info['node_type'] == 'root': raise skytools.UsageError("Found root and no source found") # walk upwards node_name = info['provider_node'] node_location = info['provider_location'] worker_name = info['worker_name'] COPY_FROM_BLK = 1024 * 1024 COPY_MERGE_BUF = 256 * 1024 class MPipeReader(io.RawIOBase): """Read from pipe """ p_recv: "multiprocessing.connection.Connection" buf: Union[memoryview, bytes] blocks: List[bytes] def __init__(self, p_recv: "multiprocessing.connection.Connection") -> None: super().__init__() self.p_recv = p_recv self.buf = b"" self.blocks = [] def readable(self) -> bool: return True def read(self, size: int = -1) -> bytes: # size=-1 means 'all' if size < 0: size = 1 << 30 # fetch current block of data data: Union[bytes, memoryview] if self.buf: data = self.buf self.buf = b"" else: if not self.blocks: try: self.blocks = self.p_recv.recv() except EOFError: return b"" self.blocks.reverse() data = self.blocks.pop() # return part of it if len(data) > size: data = memoryview(data) self.buf = data[size:] return data[:size].tobytes() return data if isinstance(data, bytes) else data.tobytes() # args: pipe, sql, cstr, fn, sect, encoding def copy_worker_proc( p_recv: "multiprocessing.connection.Connection", sql_from: str, dst_db_connstr: str, config_file: Optional[str], config_section: Optional[str], src_encoding: Optional[str], ) -> bool: """Launched in separate process. 
""" if config_file and config_section: from londiste.handlers import load_handler_modules cf = skytools.Config(config_section, config_file) load_handler_modules(cf) preader = MPipeReader(p_recv) with skytools.connect_database(dst_db_connstr) as dst_db: if src_encoding and dst_db.encoding != src_encoding: dst_db.set_client_encoding(src_encoding) with dst_db.cursor() as dst_curs: dst_curs.execute("select londiste.set_session_replication_role('replica', true)") dst_curs.copy_expert(sql_from, preader, COPY_FROM_BLK) dst_db.commit() return True class CopyPipeMultiProc(io.RawIOBase): """Pass COPY data over thread. """ block_buf: List[bytes] write_hook: WriteHook def __init__( self, sql_from: str, dst_db_connstr: str, parallel: int = 1, config_file: Optional[str] = None, config_section: Optional[str] = None, write_hook: WriteHook = None, src_encoding: Optional[str] = None, ) -> None: """Setup queue and worker thread. """ import multiprocessing import concurrent.futures super().__init__() self.sql_from = sql_from self.total_rows = 0 self.total_bytes = 0 self.parallel = parallel self.work_threads = [] self.send_pipes = [] self.block_buf = [] self.block_buf_len = 0 self.send_pos = 0 self.write_hook = write_hook # avoid fork mp_ctx = multiprocessing.get_context("spawn") self.executor = concurrent.futures.ProcessPoolExecutor(max_workers=parallel, mp_context=mp_ctx) for _ in range(parallel): p_recv, p_send = mp_ctx.Pipe(False) # args: pipe, sql, cstr, fn, sect, encoding f = self.executor.submit( copy_worker_proc, p_recv, self.sql_from, dst_db_connstr, config_file, config_section, src_encoding, ) self.work_threads.append(f) self.send_pipes.append(p_send) def writable(self) -> bool: return True def write(self, data: ReadableBuffer) -> int: """New row from psycopg """ if not isinstance(data, bytes): data = memoryview(data).tobytes() write_hook = self.write_hook if write_hook: data = write_hook(self, data.decode()).encode() # pylint: disable=not-callable self.block_buf.append(data) self.block_buf_len += len(data) if self.block_buf_len > COPY_MERGE_BUF: self.send_blocks() self.total_bytes += len(data) self.total_rows += 1 return len(data) def send_blocks(self) -> None: """Send collected rows. """ pos = self.send_pos % self.parallel self.send_pipes[pos].send(self.block_buf) self.block_buf = [] self.block_buf_len = 0 self.send_pos += 1 def flush(self) -> None: """Finish sending. 
""" if self.block_buf: self.send_blocks() for p_send in self.send_pipes: p_send.close() for f in self.work_threads: f.result() self.executor.shutdown() def full_copy_parallel( tablename: str, src_curs: Cursor, dst_db_connstr: str, column_list: Sequence[str] = (), condition: Optional[str] = None, dst_tablename: Optional[str] = None, dst_column_list: Optional[Sequence[str]] = None, config_file: Optional[str] = None, config_section: Optional[str] = None, write_hook: WriteHook = None, flush_hook: FlushHook = None, parallel: int = 1, ) -> Tuple[int, int]: """COPY table from one db to another.""" # default dst table and dst columns to source ones dst_tablename = dst_tablename or tablename dst_column_list = dst_column_list or column_list[:] if len(dst_column_list) != len(column_list): raise Exception('src and dst column lists must match in length') def build_qfields(cols: Optional[Sequence[str]]) -> str: if cols: return ",".join([skytools.quote_ident(f) for f in cols]) else: return "*" def build_statement(table: str, cols: Optional[Sequence[str]]) -> str: qtable = skytools.quote_fqident(table) if cols: qfields = build_qfields(cols) return "%s (%s)" % (qtable, qfields) else: return qtable dst = build_statement(dst_tablename, dst_column_list) if condition: src = "(SELECT %s FROM %s WHERE %s)" % ( build_qfields(column_list), skytools.quote_fqident(tablename), condition ) else: src = build_statement(tablename, column_list) copy_opts = "" sql_to = "COPY %s TO stdout%s" % (src, copy_opts) sql_from = "COPY %s FROM stdin%s" % (dst, copy_opts) bufm = CopyPipeMultiProc( config_file=config_file, config_section=config_section, sql_from=sql_from, dst_db_connstr=dst_db_connstr, parallel=parallel, write_hook=write_hook, ) try: src_curs.copy_expert(sql_to, bufm) finally: bufm.flush() if flush_hook: flush_hook(bufm) return (bufm.total_bytes, bufm.total_rows) londiste-3.12/pyproject.toml000066400000000000000000000613571447267722200162250ustar00rootroot00000000000000[project] name = "londiste" description = "Database replication for PostgreSQL" readme = "README.rst" keywords = ["database", "replication", "pgq"] dynamic = ["version"] requires-python = ">= 3.7" maintainers = [{name = "Marko Kreen", email = "markokr@gmail.com"}] classifiers = [ "Development Status :: 5 - Production/Stable", "Environment :: Console", "Environment :: No Input/Output (Daemon)", "Intended Audience :: System Administrators", "License :: OSI Approved :: ISC License (ISCL)", "Operating System :: POSIX", "Programming Language :: Python :: 3", "Topic :: Database", "Topic :: System :: Clustering", "Topic :: Utilities", ] dependencies = ["skytools", "pgq"] [project.optional-dependencies] test = ["pytest", "pytest-cov", "coverage[toml]", "psycopg2-binary"] doc = ["sphinx"] [project.scripts] londiste = "londiste.cli:main" [project.urls] homepage = "https://github.com/pgq/londiste" #documentation = "https://readthedocs.org" repository = "https://github.com/pgq/londiste" changelog = "https://github.com/pgq/londiste/blob/master/NEWS.rst" [build-system] requires = ["setuptools", "wheel"] build-backend = "setuptools.build_meta" [tool.setuptools] packages = ["londiste", "londiste.handlers"] package-data = {"londiste" = ["py.typed"]} zip-safe = false [tool.setuptools.dynamic.version] attr = "londiste.__version__" # # testing # [tool.pytest] testpaths = ["tests"] [tool.coverage.paths] source = ["londiste", "**/site-packages/londiste"] [tool.coverage.report] exclude_lines = [ "pragma: no cover", "def __repr__", "if self.debug:", "if settings.DEBUG", "raise 
AssertionError", "raise NotImplementedError", "if 0:", "if __name__ == .__main__.:", ] # # formatting # [tool.isort] atomic = true line_length = 100 multi_line_output = 5 known_first_party = ["pgq"] known_third_party = ["pytest", "yaml", "skytools"] include_trailing_comma = true balanced_wrapping = true [tool.autopep8] exclude = ".tox, git, tmp, build, cover, dist" ignore = ["E301", "E265", "W391"] max-line-length = 110 in-place = true recursive = true aggressive = 2 [tool.doc8] extensions = "rst" # # linters # [tool.mypy] python_version = "3.10" strict = true disallow_any_decorated = true disallow_any_generics = true disallow_any_unimported = true disallow_any_expr = false disallow_any_explicit = false warn_return_any = false warn_unreachable = false [[tool.mypy.overrides]] module = [] strict = false disallow_untyped_defs = false disallow_untyped_calls = false disallow_incomplete_defs = false [tool.ruff] line-length = 120 select = ["E", "F", "Q", "W", "UP", "YTT", "ANN"] ignore = [ "ANN101", # Missing type annotation for `self` in method "ANN102", # Missing type annotation for `cls` in classmethod "ANN401", # Dynamically typed expressions (typing.Any) are disallowed "UP006", # Use `dict` instead of `Dict` "UP007", # Use `X | Y` for type annotations "UP031", # Use format specifiers instead of percent format "UP032", # Use f-string instead of `format` call "UP035", # typing.List` is deprecated "UP037", # Remove quotes from type annotation "UP038", # Use `X | Y` in `isinstance` call instead of `(X, Y)` ] [tool.ruff.flake8-quotes] docstring-quotes = "double" # # reference links # # https://packaging.python.org/en/latest/specifications/declaring-project-metadata/ # https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html # [tool.pylint.main] # Analyse import fallback blocks. This can be used to support both Python 2 and 3 # compatible code, which means that the block might have code that exists only in # one or another interpreter, leading to false positives when analysed. # analyse-fallback-blocks = # Clear in-memory caches upon conclusion of linting. Useful if running pylint in # a server-like mode. # clear-cache-post-run = # Always return a 0 (non-error) status code, even if lint errors are found. This # is primarily useful in continuous integration scripts. # exit-zero = # A comma-separated list of package or module names from where C extensions may # be loaded. Extensions are loading into the active Python interpreter and may # run arbitrary code. # extension-pkg-allow-list = # A comma-separated list of package or module names from where C extensions may # be loaded. Extensions are loading into the active Python interpreter and may # run arbitrary code. (This is an alternative name to extension-pkg-allow-list # for backward compatibility.) # extension-pkg-whitelist = # Return non-zero exit code if any of these messages/categories are detected, # even if score is above --fail-under value. Syntax same as enable. Messages # specified are enabled, while categories only check already-enabled messages. # fail-on = # Specify a score threshold under which the program will exit with error. fail-under = 10 # Interpret the stdin as a python script, whose filename needs to be passed as # the module_or_package argument. # from-stdin = # Files or directories to be skipped. They should be base names, not paths. ignore = ["CVS", "tmp", "dist"] # Add files or directories matching the regular expressions patterns to the # ignore-list. 
The regex matches against paths and can be in Posix or Windows # format. Because '\\' represents the directory delimiter on Windows systems, it # can't be used as an escape character. # ignore-paths = # Files or directories matching the regular expression patterns are skipped. The # regex matches against base names, not paths. The default value ignores Emacs # file locks # ignore-patterns = # List of module names for which member attributes should not be checked (useful # for modules/projects where namespaces are manipulated during runtime and thus # existing member attributes cannot be deduced by static analysis). It supports # qualified module names, as well as Unix pattern matching. # ignored-modules = # Python code to execute, usually for sys.path manipulation such as # pygtk.require(). # init-hook = # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the # number of processors available to use, and will cap the count on Windows to # avoid hangs. jobs = 1 # Control the amount of potential inferred values when inferring a single object. # This can help the performance when dealing with large functions or complex, # nested conditions. limit-inference-results = 100 # List of plugins (as comma separated values of python module names) to load, # usually to register additional checkers. # load-plugins = # Pickle collected data for later comparisons. persistent = true # Minimum Python version to use for version dependent checks. Will default to the # version used to run pylint. py-version = "3.10" # Discover python modules and packages in the file system subtree. # recursive = # Add paths to the list of the source roots. Supports globbing patterns. The # source root is an absolute path or a path relative to the current working # directory used to determine a package namespace for modules located under the # source root. # source-roots = # When enabled, pylint would attempt to guess common misconfiguration and emit # user-friendly hints instead of false-positive error messages. suggestion-mode = true # Allow loading of arbitrary C extensions. Extensions are imported into the # active Python interpreter and may run arbitrary code. # unsafe-load-any-extension = [tool.pylint.basic] # Naming style matching correct argument names. argument-naming-style = "snake_case" # Regular expression matching correct argument names. Overrides argument-naming- # style. If left empty, argument names will be checked with the set naming style. # argument-rgx = # Naming style matching correct attribute names. attr-naming-style = "snake_case" # Regular expression matching correct attribute names. Overrides attr-naming- # style. If left empty, attribute names will be checked with the set naming # style. # attr-rgx = # Bad variable names which should always be refused, separated by a comma. bad-names = ["foo", "bar", "baz", "toto", "tutu", "tata"] # Bad variable names regexes, separated by a comma. If names match any regex, # they will always be refused # bad-names-rgxs = # Naming style matching correct class attribute names. class-attribute-naming-style = "any" # Regular expression matching correct class attribute names. Overrides class- # attribute-naming-style. If left empty, class attribute names will be checked # with the set naming style. # class-attribute-rgx = # Naming style matching correct class constant names. class-const-naming-style = "UPPER_CASE" # Regular expression matching correct class constant names. Overrides class- # const-naming-style. 
If left empty, class constant names will be checked with # the set naming style. # class-const-rgx = # Naming style matching correct class names. class-naming-style = "PascalCase" # Regular expression matching correct class names. Overrides class-naming-style. # If left empty, class names will be checked with the set naming style. # class-rgx = # Naming style matching correct constant names. const-naming-style = "UPPER_CASE" # Regular expression matching correct constant names. Overrides const-naming- # style. If left empty, constant names will be checked with the set naming style. # const-rgx = # Minimum line length for functions/classes that require docstrings, shorter ones # are exempt. docstring-min-length = -1 # Naming style matching correct function names. function-naming-style = "snake_case" # Regular expression matching correct function names. Overrides function-naming- # style. If left empty, function names will be checked with the set naming style. # function-rgx = # Good variable names which should always be accepted, separated by a comma. good-names = ["i", "j", "k", "ex", "Run", "_"] # Good variable names regexes, separated by a comma. If names match any regex, # they will always be accepted # good-names-rgxs = # Include a hint for the correct naming format with invalid-name. # include-naming-hint = # Naming style matching correct inline iteration names. inlinevar-naming-style = "any" # Regular expression matching correct inline iteration names. Overrides # inlinevar-naming-style. If left empty, inline iteration names will be checked # with the set naming style. # inlinevar-rgx = # Naming style matching correct method names. method-naming-style = "snake_case" # Regular expression matching correct method names. Overrides method-naming- # style. If left empty, method names will be checked with the set naming style. # method-rgx = # Naming style matching correct module names. module-naming-style = "snake_case" # Regular expression matching correct module names. Overrides module-naming- # style. If left empty, module names will be checked with the set naming style. # module-rgx = # Colon-delimited sets of names that determine each other's naming style when the # name regexes allow several styles. # name-group = # Regular expression which should only match function or class names that do not # require a docstring. no-docstring-rgx = "^_" # List of decorators that produce properties, such as abc.abstractproperty. Add # to this list to register other decorators that produce valid properties. These # decorators are taken in consideration only for invalid-name. property-classes = ["abc.abstractproperty"] # Regular expression matching correct type alias names. If left empty, type alias # names will be checked with the set naming style. # typealias-rgx = # Regular expression matching correct type variable names. If left empty, type # variable names will be checked with the set naming style. # typevar-rgx = # Naming style matching correct variable names. variable-naming-style = "snake_case" # Regular expression matching correct variable names. Overrides variable-naming- # style. If left empty, variable names will be checked with the set naming style. # variable-rgx = [tool.pylint.classes] # Warn about protected attribute access inside special methods # check-protected-access-in-special-methods = # List of method names used to declare (i.e. assign) instance attributes. 
defining-attr-methods = ["__init__", "__new__", "setUp"] # List of member names, which should be excluded from the protected access # warning. exclude-protected = ["_asdict", "_fields", "_replace", "_source", "_make"] # List of valid names for the first argument in a class method. valid-classmethod-first-arg = ["cls"] # List of valid names for the first argument in a metaclass class method. valid-metaclass-classmethod-first-arg = ["cls"] [tool.pylint.design] # List of regular expressions of class ancestor names to ignore when counting # public methods (see R0903) # exclude-too-few-public-methods = # List of qualified class names to ignore when counting class parents (see R0901) # ignored-parents = # Maximum number of arguments for function / method. max-args = 15 # Maximum number of attributes for a class (see R0902). max-attributes = 37 # Maximum number of boolean expressions in an if statement (see R0916). max-bool-expr = 5 # Maximum number of branch for function / method body. max-branches = 50 # Maximum number of locals for function / method body. max-locals = 45 # Maximum number of parents for a class (see R0901). max-parents = 7 # Maximum number of public methods for a class (see R0904). max-public-methods = 420 # Maximum number of return / yield for function / method body. max-returns = 16 # Maximum number of statements in function / method body. max-statements = 150 # Minimum number of public methods for a class (see R0903). min-public-methods = 0 [tool.pylint.exceptions] # Exceptions that will emit a warning when caught. overgeneral-exceptions = ["builtins.BaseException", "builtins.Exception"] [tool.pylint.format] # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. expected-line-ending-format = "LF" # Regexp for a line that is allowed to be longer than the limit. ignore-long-lines = "^\\s*(# )??$" # Number of spaces of indent required inside a hanging or continued line. indent-after-paren = 4 # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 # tab). indent-string = " " # Maximum number of characters on a single line. max-line-length = 190 # Maximum number of lines in a module. max-module-lines = 10000 # Allow the body of a class to be on the same line as the declaration if body # contains single statement. # single-line-class-stmt = # Allow the body of an if to be on the same line as the test if there is no else. # single-line-if-stmt = [tool.pylint.imports] # List of modules that can be imported at any level, not just the top level one. # allow-any-import-level = # Allow explicit reexports by alias from a package __init__. # allow-reexport-from-package = # Allow wildcard imports from modules that define __all__. # allow-wildcard-with-all = # Deprecated modules which should not be used, separated by a comma. deprecated-modules = ["optparse", "tkinter.tix"] # Output a graph (.gv or any supported image format) of external dependencies to # the given file (report RP0402 must not be disabled). # ext-import-graph = # Output a graph (.gv or any supported image format) of all (i.e. internal and # external) dependencies to the given file (report RP0402 must not be disabled). # import-graph = # Output a graph (.gv or any supported image format) of internal dependencies to # the given file (report RP0402 must not be disabled). # int-import-graph = # Force import order to recognize a module as part of the standard compatibility # libraries. 
# known-standard-library = # Force import order to recognize a module as part of a third party library. known-third-party = ["enchant"] # Couples of modules and preferred modules, separated by a comma. # preferred-modules = [tool.pylint.logging] # The type of string formatting that logging methods do. `old` means using % # formatting, `new` is for `{}` formatting. logging-format-style = "old" # Logging modules to check that the string format arguments are in logging # function parameter format. logging-modules = ["logging"] [tool.pylint."messages control"] # Only show warnings with the listed confidence levels. Leave empty to show all. # Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, UNDEFINED. confidence = ["HIGH", "CONTROL_FLOW", "INFERENCE", "INFERENCE_FAILURE", "UNDEFINED"] # Disable the message, report, category or checker with the given id(s). You can # either give multiple identifiers separated by comma (,) or put this option # multiple times (only on the command line, not in the configuration file where # it should appear only once). You can also use "--disable=all" to disable # everything first and then re-enable specific checks. For example, if you want # to run only the similarities checker, you can use "--disable=all # --enable=similarities". If you want to run only the classes checker, but have # no Warning level messages displayed, use "--disable=all --enable=classes # --disable=W". disable = [ "raw-checker-failed", "bad-inline-option", "locally-disabled", "file-ignored", "suppressed-message", "useless-suppression", "deprecated-pragma", "use-symbolic-message-instead", "bare-except", "broad-exception-caught", "useless-return", "consider-using-in", "consider-using-ternary", "fixme", "global-statement", "invalid-name", "missing-module-docstring", "missing-class-docstring", "missing-function-docstring", "no-else-raise", "no-else-return", "trailing-newlines", "unused-argument", "unused-variable", "using-constant-test", "useless-object-inheritance", "duplicate-code", "singleton-comparison", "consider-using-f-string", "arguments-differ", "multiple-statements", "use-implicit-booleaness-not-len", "chained-comparison", "unnecessary-pass", "too-many-ancestors", "import-outside-toplevel", "protected-access", "try-except-raise", "deprecated-module", "no-else-break", "no-else-continue", # junk "trailing-newlines", "arguments-renamed", "consider-using-max-builtin", # issues #"broad-exception-caught", "cyclic-import", # pylint does not understand lazy import "broad-exception-raised", ] # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option # multiple time (only on the command line, not in the configuration file where it # should appear only once). See also the "--disable" option for examples. enable = ["c-extension-no-member"] [tool.pylint.method_args] # List of qualified names (i.e., library.method) which require a timeout # parameter e.g. 'requests.api.get,requests.api.post' timeout-methods = [ "requests.api.delete", "requests.api.get", "requests.api.head", "requests.api.options", "requests.api.patch", "requests.api.post", "requests.api.put", "requests.api.request" ] [tool.pylint.miscellaneous] # List of note tags to take in consideration, separated by a comma. notes = ["FIXME", "XXX", "TODO"] # Regular expression of note tags to take in consideration. 
# notes-rgx = [tool.pylint.refactoring] # Maximum number of nested blocks for function / method body max-nested-blocks = 10 # Complete name of functions that never returns. When checking for inconsistent- # return-statements if a never returning function is called then it will be # considered as an explicit return statement and no message will be printed. never-returning-functions = ["sys.exit"] [tool.pylint.reports] # Python expression which should return a score less than or equal to 10. You # have access to the variables 'fatal', 'error', 'warning', 'refactor', # 'convention', and 'info' which contain the number of messages in each category, # as well as 'statement' which is the total number of statements analyzed. This # score is used by the global evaluation report (RP0004). evaluation = "10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)" # Template used to display messages. This is a python new-style format string # used to format the message information. See doc for all details. # msg-template = # Set the output format. Available formats are text, parseable, colorized, json # and msvs (visual studio). You can also give a reporter class, e.g. # mypackage.mymodule.MyReporterClass. # output-format = # Tells whether to display a full report or only the messages. # reports = # Activate the evaluation score. # score = [tool.pylint.similarities] # Comments are removed from the similarity computation ignore-comments = true # Docstrings are removed from the similarity computation ignore-docstrings = true # Imports are removed from the similarity computation # ignore-imports = # Signatures are removed from the similarity computation ignore-signatures = true # Minimum lines number of a similarity. min-similarity-lines = 4 [tool.pylint.spelling] # Limits count of emitted suggestions for spelling mistakes. max-spelling-suggestions = 4 # Spelling dictionary name. No available dictionaries : You need to install both # the python package and the system dependency for enchant to work.. # spelling-dict = # List of comma separated words that should be considered directives if they # appear at the beginning of a comment and should not be checked. spelling-ignore-comment-directives = "fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:" # List of comma separated words that should not be checked. spelling-ignore-words = "usr,bin,env" # A path to a file that contains the private dictionary; one word per line. spelling-private-dict-file = ".local.dict" # Tells whether to store unknown words to the private dictionary (see the # --spelling-private-dict-file option) instead of raising a message. # spelling-store-unknown-words = [tool.pylint.typecheck] # List of decorators that produce context managers, such as # contextlib.contextmanager. Add to this list to register other decorators that # produce valid context managers. contextmanager-decorators = ["contextlib.contextmanager"] # List of members which are set dynamically and missed by pylint inference # system, and so shouldn't trigger E1101 when accessed. Python regular # expressions are accepted. # generated-members = # Tells whether missing members accessed in mixin class should be ignored. A # class is considered mixin if its name matches the mixin-class-rgx option. # Tells whether to warn about missing members when the owner of the attribute is # inferred to be None. ignore-none = true # This flag controls whether pylint should warn about no-member and similar # checks whenever an opaque object is returned when inferring. 
The inference can # return multiple potential results while evaluating a Python object, but some # branches might not be evaluated, which results in partial inference. In that # case, it might be useful to still emit no-member and other checks for the rest # of the inferred objects. ignore-on-opaque-inference = true # List of symbolic message names to ignore for Mixin members. ignored-checks-for-mixins = ["no-member", "not-async-context-manager", "not-context-manager", "attribute-defined-outside-init"] # List of class names for which member attributes should not be checked (useful # for classes with dynamically set attributes). This supports the use of # qualified names. ignored-classes = ["optparse.Values", "thread._local", "_thread._local"] # Show a hint with possible names when a member name was not found. The aspect of # finding the hint is based on edit distance. missing-member-hint = true # The minimum edit distance a name should have in order to be considered a # similar match for a missing member name. missing-member-hint-distance = 1 # The total number of similar names that should be taken in consideration when # showing a hint for a missing member. missing-member-max-choices = 1 # Regex pattern to define which classes are considered mixins. mixin-class-rgx = ".*[Mm]ixin" # List of decorators that change the signature of a decorated function. # signature-mutators = [tool.pylint.variables] # List of additional names supposed to be defined in builtins. Remember that you # should avoid defining new builtins when possible. # additional-builtins = # Tells whether unused global variables should be treated as a violation. allow-global-unused-variables = true # List of names allowed to shadow builtins # allowed-redefined-builtins = # List of strings which can identify a callback function by name. A callback name # must start or end with one of those strings. callbacks = ["cb_", "_cb"] # A regular expression matching the name of dummy variables (i.e. expected to not # be used). dummy-variables-rgx = "_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_" # Argument names that match this expression will be ignored. ignored-argument-names = "_.*|^ignored_|^unused_" # Tells whether we should check for unused import in __init__ files. # init-import = # List of qualified module names which can have objects that can redefine # builtins. redefining-builtins-modules = ["six.moves", "past.builtins", "future.builtins", "builtins", "io"] londiste-3.12/setup.py000066400000000000000000000001031447267722200150010ustar00rootroot00000000000000"""Setup for Londiste. """ from setuptools import setup setup() londiste-3.12/tests/000077500000000000000000000000001447267722200144375ustar00rootroot00000000000000londiste-3.12/tests/docker_run.sh000077500000000000000000000001651447267722200171330ustar00rootroot00000000000000#! 
/bin/bash set -e set -x pg_ctl -D data -l log/pg.log start || { cat log/pg.log ; exit 1; } cd tests exec "$@" londiste-3.12/tests/obfuscate/000077500000000000000000000000001447267722200164125ustar00rootroot00000000000000londiste-3.12/tests/obfuscate/cf/000077500000000000000000000000001447267722200170025ustar00rootroot00000000000000londiste-3.12/tests/obfuscate/cf/leafq_leaf.ini000066400000000000000000000003141447267722200215600ustar00rootroot00000000000000[londiste] job_name = leafq_leaf db = dbname=leaf public_node_location = dbname=leaf queue_name = leafq logfile = log/%(job_name)s.log pidfile = pid/%(job_name)s.pid pgq_autocommit = 1 pgq_lazy_fetch = 0 londiste-3.12/tests/obfuscate/cf/obfuscate.yaml000066400000000000000000000002141447267722200216360ustar00rootroot00000000000000public.mytable: id: keep htext: hash btext: bool stext: skip public.mytable2: id: keep htext: hash btext: bool stext: skip londiste-3.12/tests/obfuscate/cf/rootq_leaf.ini000066400000000000000000000004621447267722200216400ustar00rootroot00000000000000[londiste] job_name = rootq_leaf db = dbname=leaf public_node_location = dbname=leaf queue_name = rootq handler_modules = londiste.handlers.obfuscate obfuscator_map = cf/obfuscate.yaml obfuscator_key = 123 logfile = log/%(job_name)s.log pidfile = pid/%(job_name)s.pid pgq_autocommit = 1 pgq_lazy_fetch = 0 londiste-3.12/tests/obfuscate/cf/rootq_root.ini000066400000000000000000000003141447267722200217100ustar00rootroot00000000000000[londiste] job_name = rootq_root public_node_location = dbname=root db = dbname=root queue_name = rootq logfile = log/%(job_name)s.log pidfile = pid/%(job_name)s.pid pgq_autocommit = 1 pgq_lazy_fetch = 0 londiste-3.12/tests/obfuscate/docker_run.sh000077500000000000000000000001001447267722200210730ustar00rootroot00000000000000#! /bin/bash set -e set -x cd obfuscate ./init.sh ./regen.sh londiste-3.12/tests/obfuscate/init.sh000077500000000000000000000002351447267722200177140ustar00rootroot00000000000000#! /bin/sh db_list="root leaf" for db in $db_list; do echo dropdb $db dropdb $db done for db in $db_list; do echo createdb $db createdb $db done londiste-3.12/tests/obfuscate/regen.sh000077500000000000000000000065101447267722200200530ustar00rootroot00000000000000#! /bin/bash . 
../testlib.sh v='-q' v='' nocheck=1 db_list="root leaf" kdb_list=`echo $db_list | sed 's/ /,/g'` #( cd ../..; make -s install ) do_check() { test $nocheck = 1 || ../zcheck.sh } title Obfuscate test # create ticker conf cat > conf/pgqd.ini < conf/pgqd.ini < conf/londiste_$db.ini <replika handler" run londiste $v conf/londiste_hdst.ini add-table mytable --handler=qsplitter --handler-arg="queue=replika" msg "Wait until table is in sync" cnt=0 while test $cnt -ne 1; do sleep 3 cnt=`psql -A -t -d hdst -c "select count(*) from londiste.table_info where merge_state = 'ok'"` echo " cnt=$cnt" done msg "Do some updates" run_sql hsrc "insert into mytable values (5, 'row5')" run_sql hsrc "update mytable set data = 'row5x' where id = 5" run_sql hsrc "insert into mytable values (6, 'row6')" run_sql hsrc "delete from mytable where id = 6" run_sql hsrc "insert into mytable values (7, 'row7')" run_sql hsrc "update mytable set data = 'row7x' where id = 7" run_sql hsrc "delete from mytable where id = 7" run_sql hsrc "delete from mytable where id = 1" run_sql hsrc "update mytable set data = 'row2x' where id = 2" run sleep 5 msg "Check status" run londiste $v conf/londiste_hsrc.ini status run sleep 5 tbl=$(psql hdst -qAtc "select * from pgq.current_event_table('replika');") msg "Check queue 'replika' form table $tbl" run_sql hdst "select * from $tbl" #run_sql hdst 'select * from mytable order by id' ../zcheck.sh londiste-3.12/tests/register/000077500000000000000000000000001447267722200162635ustar00rootroot00000000000000londiste-3.12/tests/register/init.sh000077500000000000000000000002371447267722200175670ustar00rootroot00000000000000#! /bin/sh lst="regdb1 regdb2" for db in $lst; do echo dropdb $db dropdb --if-exists $db done for db in $lst; do echo createdb $db createdb $db done londiste-3.12/tests/register/regen.sh000077500000000000000000000066751447267722200177400ustar00rootroot00000000000000#! /bin/bash . 
../testlib.sh ../zstop.sh v='-v' v='' root_db="regdb1" branch_db="regdb2" root_conf="conf/${root_db}.ini" branch_conf="conf/${branch_db}.ini" root_cstr="dbname=${root_db}" branch_cstr="dbname=${branch_db}" db_list="${root_db} ${branch_db}" qname="regq" kdb_list=`echo $db_list | sed 's/ /,/g'` echo " * create configs * " # create ticker conf cat > conf/pgqd.ini < conf/${root_db}.ini < conf/${branch_db}.ini < conf/pgqd.ini < conf/${db}.ini < conf/pgqd.ini < conf/londiste_$db.ini <> conf/londiste_db2.ini echo "threaded_copy_tables = *" >> conf/londiste_db3.ini echo "threaded_copy_pool_size = 3" >> conf/londiste_db3.ini for n in 1 2 3; do cat > conf/gen$n.ini < conf/pgqd.ini < conf/londiste_$db.ini < conf/pgqd.ini < conf/londiste_$db.ini < conf/pgqd.ini < conf/londiste_$db.ini < conf/gen$n.ini < conf/pgqd.ini < conf/${xname}.ini < conf/gen$n.ini < conf/gen$n.ini < "${fn}" < "${fn}" <&1 } run_sql() { code_on echo "$ psql -d \"$1\" -c \"$2\"" psql -d "$1" -c "$2" 2>&1 } run_qadmin() { code_on echo "$ qadmin -d \"$1\" -c \"$2\"" qadmin -d "$1" -c "$2" 2>&1 } msg() { code_off echo "" echo "$@" echo "" } cat_file() { code_off mkdir -p `dirname $1` echo ".File: $1" case "$1" in *Makefile) echo "[source,makefile]" ;; #*.[ch]) echo "[source,c]" ;; #*.ac) echo "[source,autoconf]" ;; #*.sh) echo "[source,shell]" ;; #*.sql) echo "[source,sql]" ;; *.*) printf "[source,%s]\n" `echo $1 | sed 's/.*\.//'` ;; esac echo "-----------------------------------" sed 's/^ //' > $1 cat $1 echo "-----------------------------------" } londiste-3.12/tests/zcheck.sh000077500000000000000000000001001447267722200162340ustar00rootroot00000000000000#! /bin/sh grep -E 'ERR|WARN|CRIT' log/*.log || echo "All OK" londiste-3.12/tests/zstop.sh000077500000000000000000000002561447267722200161600ustar00rootroot00000000000000#! /bin/sh #. ../env.sh for p in pid/*.pid*; do test -f "$p" || continue pid=`cat "$p"` test -d "/proc/$pid" || { rm -f "$p" continue } kill "$pid" done londiste-3.12/tox.ini000066400000000000000000000016501447267722200146120ustar00rootroot00000000000000 [tox] envlist = lint,py3 minversion = 4.0 [package] name = londiste deps = psycopg2-binary==2.9.7 pyyaml==6.0.1 skytools==3.9.2 pgq==3.8 test_deps = coverage==7.2.7 pytest==7.4.0 lint_deps = pyflakes==3.1.0 mypy==1.5.1 types-PyYAML==6.0.12.11 xlint_deps = pylint==2.17.5 pytype==2023.8.22 [testenv] changedir = {toxinidir} deps = {[package]deps} {[package]test_deps} allowlist_externals = bash passenv = PGHOST PGUSER PGDATABASE PGPORT commands = bash ./tests/run.sh {posargs} [testenv:lint] changedir = {toxinidir} basepython = python3 deps = {[package]deps} {[package]lint_deps} commands = pyflakes {[package]name} mypy {[package]name} [testenv:xlint] changedir = {toxinidir} basepython = python3 deps = {[package]deps} {[package]lint_deps} {[package]xlint_deps} commands = pylint {[package]name} pytype {[package]name}
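A usage sketch for the tox configuration above (a hedged illustration, not part of the packaged sources): tox.ini defines a "lint" environment that runs pyflakes and mypy over the londiste package, and database-backed test environments that execute tests/run.sh with PGHOST, PGUSER, PGDATABASE and PGPORT passed through. Assuming tox 4.x is installed, and for the test run also a reachable PostgreSQL server plus the londiste and pgqd tools that the test scripts invoke, a plausible local invocation looks like this (the connection settings below are placeholders, not values taken from the archive):

    # static checks only, as defined in [testenv:lint]
    python3 -m pip install tox
    python3 -m tox -e lint

    # database-backed tests against your own PostgreSQL setup
    PGHOST=/tmp PGDATABASE=your_test_db python3 -m tox -e py3

Only the lint environment can be expected to succeed without a prepared PostgreSQL instance; the py3 environment drives the shell test suites shown above.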